refine search; fix dedupe external results
This commit is contained in:
parent 8e506ad20e
commit 0c872e844a
12 changed files with 69 additions and 62 deletions
@@ -95,7 +95,7 @@ class AbstractSite:
     # add this method to subclass to enable external search
     # @classmethod
     # async def search_task(
-    #     cls, query: str, page: int, category: str
+    #     cls, query: str, page: int, category: str, page_size: int
     # ) -> list[ExternalSearchResultItem]:
     #     return []
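For orientation, a minimal, self-contained sketch of the contract this template describes. The result type is stubbed here (in the project it comes from catalog.search.models), and DummySite with its "example" category is invented for illustration:

import asyncio
from dataclasses import dataclass


@dataclass
class ExternalSearchResultItem:  # stub of the project's result model
    source_url: str


class DummySite:  # stands in for an AbstractSite subclass
    @classmethod
    async def search_task(
        cls, query: str, page: int, category: str, page_size: int
    ) -> list[ExternalSearchResultItem]:
        if category != "example":
            return []  # a site only answers the categories it serves
        # a real site would fetch up to page_size remote results here
        return [ExternalSearchResultItem(f"https://example.org/{query}")]


print(asyncio.run(DummySite.search_task("dune", 1, "example", 5)))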
@@ -1,30 +1,46 @@
 import asyncio
 import logging
 
 from django.core.cache import cache
 
 from catalog.common import SiteManager
+from catalog.common.models import ItemCategory
 from catalog.search.models import ExternalSearchResultItem
 from catalog.sites.fedi import FediverseInstance
 
-SEARCH_PAGE_SIZE = 5  # not all apis support page size
 logger = logging.getLogger(__name__)
 
 
 class ExternalSources:
     @classmethod
     def search(
-        cls, query: str, page: int = 1, category: str | None = None
+        cls,
+        query: str,
+        page: int = 1,
+        category: str | None = None,
+        visible_categories: list[ItemCategory] = [],
     ) -> list[ExternalSearchResultItem]:
-        if not query or page < 1 or page > 10:
+        if not query or page < 1 or page > 10 or not query or len(query) > 100:
             return []
         if category in ["", None]:
             category = "all"
-        tasks = FediverseInstance.search_tasks(query, page, category)
-        for site in SiteManager.get_sites_for_search():
-            tasks.append(site.search_task(query, page, category))
-        # loop = asyncio.get_event_loop()
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-        results = []
-        for r in loop.run_until_complete(asyncio.gather(*tasks)):
-            results.extend(r)
+        page_size = 5 if category == "all" else 10
+        match category:
+            case "all":
+                cache_key = f"search_{','.join(visible_categories)}_{query}"
+            case "movie":
+                cache_key = f"search_movie,tv_{query}"
+            case _:
+                cache_key = f"search_{category}_{query}"
+        results = cache.get("ext_" + cache_key, None)
+        if results is None:
+            tasks = FediverseInstance.search_tasks(query, page, category, page_size)
+            for site in SiteManager.get_sites_for_search():
+                tasks.append(site.search_task(query, page, category, page_size))
+            # loop = asyncio.get_event_loop()
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            results = []
+            for r in loop.run_until_complete(asyncio.gather(*tasks)):
+                results.extend(r)
+            cache.set("ext_" + cache_key, results, 300)
+        dedupe_urls = cache.get(cache_key, [])
+        results = [i for i in results if i.source_url not in dedupe_urls]
         return results
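The dedupe fix above reads two cache entries: raw external results under "ext_" + cache_key (kept for 300 seconds), and a list of already-displayed URLs under the bare cache_key, which the internal search path is assumed to populate. A sketch of that handshake, with a plain dict standing in for django.core.cache and invented URLs:

cache: dict = {}  # stand-in for django.core.cache.cache
cache_key = "search_book_dune"

# assumed: the internal search recorded the URLs it already displayed
cache[cache_key] = ["https://example.org/book/1"]

# external results, as ExternalSources.search would cache them for 300s
cache["ext_" + cache_key] = [
    {"source_url": "https://example.org/book/1"},  # duplicate of an internal hit
    {"source_url": "https://example.org/book/2"},
]

dedupe_urls = cache.get(cache_key, [])
results = [i for i in cache["ext_" + cache_key] if i["source_url"] not in dedupe_urls]
assert [i["source_url"] for i in results] == ["https://example.org/book/2"]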
@@ -108,6 +108,7 @@ def query_index(keywords, categories=None, tag=None, page=1, prepare_external=True):
         page < 1
         or page > 99
         or (not tag and isinstance(keywords, str) and len(keywords) < 2)
+        or len(keywords) > 100
     ):
         return [], 0, 0, []
     result = Indexer.search(keywords, page=page, categories=categories, tag=tag)
@@ -3,7 +3,6 @@ import re
 import django_rq
 from django.conf import settings
 from django.contrib.auth.decorators import login_required
-from django.core.cache import cache
 from django.core.exceptions import BadRequest
 from django.shortcuts import redirect, render
 from django.utils.translation import gettext as _
@@ -155,15 +154,15 @@ def search(request):
 @login_required
 def external_search(request):
     category = request.GET.get("c", default="all").strip().lower()
     if category == "all":
         category = None
     keywords = request.GET.get("q", default="").strip()
     page_number = int_(request.GET.get("page"), 1)
-    items = ExternalSources.search(keywords, page_number, category) if keywords else []
-    cache_key = f"search_{category if category != 'movietv' else 'movie,tv'}_{keywords}"
-    dedupe_urls = cache.get(cache_key, [])
-    items = [i for i in items if i.source_url not in dedupe_urls]
-
+    items = (
+        ExternalSources.search(
+            keywords, page_number, category, visible_categories(request)
+        )
+        if keywords
+        else []
+    )
     return render(request, "external_search_results.html", {"external_items": items})
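visible_categories(request) is imported elsewhere in this view module; assuming it returns the ItemCategory values enabled for the current site or user, the "all"-category cache key would come out roughly like this. The helper below is a hypothetical stand-in, not the real function:

def visible_categories_stub() -> list[str]:  # hypothetical; not the project's helper
    return ["book", "movie", "tv", "music"]

query = "dune"
cache_key = f"search_{','.join(visible_categories_stub())}_{query}"
assert cache_key == "search_book,movie,tv,music_dune"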
@@ -42,18 +42,17 @@ class ApplePodcast(AbstractSite):
 
     @classmethod
     async def search_task(
-        cls, q: str, page: int, category: str
+        cls, q: str, page: int, category: str, page_size: int
     ) -> list[ExternalSearchResultItem]:
         if category != "podcast":
             return []
-        SEARCH_PAGE_SIZE = 5 if category == "all" else 10
         results = []
-        search_url = f"https://itunes.apple.com/search?entity=podcast&limit={page * SEARCH_PAGE_SIZE}&term={quote_plus(q)}"
+        search_url = f"https://itunes.apple.com/search?entity=podcast&limit={page * page_size}&term={quote_plus(q)}"
         async with httpx.AsyncClient() as client:
             try:
                 response = await client.get(search_url, timeout=2)
                 r = response.json()
-                for p in r["results"][(page - 1) * SEARCH_PAGE_SIZE :]:
+                for p in r["results"][(page - 1) * page_size :]:
                     if p.get("feedUrl"):
                         results.append(
                             ExternalSearchResultItem(
@@ -109,13 +109,12 @@ class Bandcamp(AbstractSite):
 
     @classmethod
     async def search_task(
-        cls, q: str, page: int, category: str
+        cls, q: str, page: int, category: str, page_size: int
     ) -> list[ExternalSearchResultItem]:
         if category != "music":
             return []
-        SEARCH_PAGE_SIZE = 5
-        p = (page - 1) * SEARCH_PAGE_SIZE // 18 + 1
-        offset = (page - 1) * SEARCH_PAGE_SIZE % 18
+        p = (page - 1) * page_size // 18 + 1
+        offset = (page - 1) * page_size % 18
         results = []
         search_url = f"https://bandcamp.com/search?from=results&item_type=a&page={p}&q={urllib.parse.quote_plus(q)}"
         async with httpx.AsyncClient() as client:
@@ -147,4 +146,4 @@ class Bandcamp(AbstractSite):
             logger.error(
                 "Bandcamp search error", extra={"query": q, "exception": e}
             )
-        return results[offset : offset + SEARCH_PAGE_SIZE]
+        return results[offset : offset + page_size]
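The p/offset arithmetic, now driven by the caller's page_size, remaps logical pages onto Bandcamp's fixed 18-results-per-page listing: request native page p, then slice page_size items starting at offset. A worked check of the same formula (note that a logical page straddling a native page boundary is truncated by this scheme):

def remap(page: int, page_size: int, native: int = 18) -> tuple[int, int]:
    p = (page - 1) * page_size // native + 1  # native page to request
    offset = (page - 1) * page_size % native  # slice start within that page
    return p, offset

assert remap(1, 5) == (1, 0)   # results[0:5] of native page 1
assert remap(4, 5) == (1, 15)  # results[15:20]; only 3 items survive the slice
assert remap(5, 5) == (2, 2)   # results[2:7] of native page 2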
@@ -125,10 +125,9 @@ class FediverseInstance(AbstractSite):
         return d
 
     @classmethod
-    async def peer_search_task(cls, host, q, page, category=None):
-        SEARCH_PAGE_SIZE = 5
-        p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
-        offset = (page - 1) * SEARCH_PAGE_SIZE % 20
+    async def peer_search_task(cls, host, q, page, category=None, page_size=5):
+        p = (page - 1) * page_size // 20 + 1
+        offset = (page - 1) * page_size % 20
         api_url = f"https://{host}/api/catalog/search?query={quote_plus(q)}&page={p}{'&category=' + category if category and category != 'all' else ''}"
         async with httpx.AsyncClient() as client:
             results = []
@@ -167,12 +166,14 @@ class FediverseInstance(AbstractSite):
                         item["cover_image_url"],
                     )
                 )
-        return results[offset : offset + SEARCH_PAGE_SIZE]
+        return results[offset : offset + page_size]
 
     @classmethod
-    def search_tasks(cls, q: str, page: int = 1, category: str | None = None):
+    def search_tasks(
+        cls, q: str, page: int = 1, category: str | None = None, page_size=5
+    ):
         from takahe.utils import Takahe
 
         peers = Takahe.get_neodb_peers()
         c = category if category != "movietv" else "movie,tv"
-        return [cls.peer_search_task(host, q, page, c) for host in peers]
+        return [cls.peer_search_task(host, q, page, c, page_size) for host in peers]
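search_tasks returns unawaited coroutines, one per peer, which ExternalSources.search gathers on a fresh event loop. A self-contained sketch of that fan-out pattern, with invented hosts and payloads standing in for Takahe.get_neodb_peers() and the real API calls:

import asyncio

async def peer_search_task(host: str, q: str) -> list[str]:
    # a real task would GET https://{host}/api/catalog/search here
    return [f"https://{host}/item/{q}"]

def search_tasks(q: str) -> list:
    peers = ["peer-a.example", "peer-b.example"]  # stand-in for Takahe.get_neodb_peers()
    return [peer_search_task(host, q) for host in peers]

# the sync caller drives all coroutines concurrently, as in ExternalSources.search
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
results: list[str] = []
for r in loop.run_until_complete(asyncio.gather(*search_tasks("dune"))):
    results.extend(r)
print(results)  # ['https://peer-a.example/item/dune', 'https://peer-b.example/item/dune']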
@@ -123,13 +123,12 @@ class Goodreads(AbstractSite):
 
     @classmethod
     async def search_task(
-        cls, q: str, page: int, category: str
+        cls, q: str, page: int, category: str, page_size: int
     ) -> list[ExternalSearchResultItem]:
         if category not in ["all", "book"]:
             return []
-        SEARCH_PAGE_SIZE = 5
-        p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
-        offset = (page - 1) * SEARCH_PAGE_SIZE % 20
+        p = (page - 1) * page_size // 20 + 1
+        offset = (page - 1) * page_size % 20
         results = []
         search_url = f"https://www.goodreads.com/search?page={p}&q={quote_plus(q)}"
         async with httpx.AsyncClient() as client:
@@ -195,7 +194,7 @@ class Goodreads(AbstractSite):
             logger.error(
                 "Goodreads search error", extra={"query": q, "exception": e}
             )
-        return results[offset : offset + SEARCH_PAGE_SIZE]
+        return results[offset : offset + page_size]
 
 
 @SiteManager.register
@@ -1,4 +1,3 @@
-import logging
 import re
 from urllib.parse import quote_plus
 
@@ -10,8 +9,6 @@ from catalog.book.utils import isbn_10_to_13
 from catalog.common import *
 from catalog.models import *
 
-_logger = logging.getLogger(__name__)
-
 
 @SiteManager.register
 class GoogleBooks(AbstractSite):
@@ -122,13 +119,12 @@ class GoogleBooks(AbstractSite):
 
     @classmethod
     async def search_task(
-        cls, q: str, page: int, category: str
+        cls, q: str, page: int, category: str, page_size: int
     ) -> list[ExternalSearchResultItem]:
         if category not in ["all", "book"]:
             return []
-        SEARCH_PAGE_SIZE = 5
         results = []
-        api_url = f"https://www.googleapis.com/books/v1/volumes?country=us&q={quote_plus(q)}&startIndex={SEARCH_PAGE_SIZE * (page - 1)}&maxResults={SEARCH_PAGE_SIZE}&maxAllowedMaturityRating=MATURE"
+        api_url = f"https://www.googleapis.com/books/v1/volumes?country=us&q={quote_plus(q)}&startIndex={page_size * (page - 1)}&maxResults={page_size}&maxAllowedMaturityRating=MATURE"
         async with httpx.AsyncClient() as client:
             try:
                 response = await client.get(api_url, timeout=2)
@@ -166,12 +166,11 @@ class IGDB(AbstractSite):
 
     @classmethod
     async def search_task(
-        cls, q: str, page: int, category: str
+        cls, q: str, page: int, category: str, page_size: int
     ) -> list[ExternalSearchResultItem]:
         if category != "game":
             return []
-        SEARCH_PAGE_SIZE = 5 if category == "all" else 10
-        limit = SEARCH_PAGE_SIZE
+        limit = page_size
         offset = (page - 1) * limit
         q = f'fields *, cover.url, genres.name, platforms.name, involved_companies.*, involved_companies.company.name; search "{quote_plus(q)}"; limit {limit}; offset {offset};'
         _wrapper = IGDBWrapper(settings.IGDB_CLIENT_ID, _igdb_access_token())
@@ -111,13 +111,12 @@ class Spotify(AbstractSite):
 
     @classmethod
     async def search_task(
-        cls, q: str, page: int, category: str
+        cls, q: str, page: int, category: str, page_size: int
     ) -> list[ExternalSearchResultItem]:
         if category not in ["music", "all"]:
             return []
-        SEARCH_PAGE_SIZE = 5
         results = []
-        api_url = f"https://api.spotify.com/v1/search?q={q}&type=album&limit={SEARCH_PAGE_SIZE}&offset={page * SEARCH_PAGE_SIZE}"
+        api_url = f"https://api.spotify.com/v1/search?q={q}&type=album&limit={page_size}&offset={page * page_size}"
         async with httpx.AsyncClient() as client:
             try:
                 headers = {"Authorization": f"Bearer {get_spotify_token()}"}
@@ -180,13 +180,12 @@ class TMDB_Movie(AbstractSite):
 
     @classmethod
     async def search_task(
-        cls, q: str, page: int, category: str
+        cls, q: str, page: int, category: str, page_size: int
     ) -> list[ExternalSearchResultItem]:
         if category not in ["movietv", "all", "movie", "tv"]:
             return []
-        SEARCH_PAGE_SIZE = 5 if category == "all" else 10
-        p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
-        offset = (page - 1) * SEARCH_PAGE_SIZE % 20
+        p = (page - 1) * page_size // 20 + 1
+        offset = (page - 1) * page_size % 20
         results = []
         api_url = f"https://api.themoviedb.org/3/search/multi?query={quote_plus(q)}&page={p}&api_key={settings.TMDB_API3_KEY}&language={TMDB_DEFAULT_LANG}&include_adult=true"
         async with httpx.AsyncClient() as client:
@@ -225,7 +224,7 @@ class TMDB_Movie(AbstractSite):
                 logger.warning(f"TMDB search '{q}' no results found.")
             except Exception as e:
                 logger.error("TMDb search error", extra={"query": q, "exception": e})
-        return results[offset : offset + SEARCH_PAGE_SIZE]
+        return results[offset : offset + page_size]
 
 
 @SiteManager.register