refine search; fix dedupe of external results

Authored by Your Name on 2025-01-20 07:03:56 -05:00, committed by Henri Dickson
parent 8e506ad20e
commit 0c872e844a
12 changed files with 69 additions and 62 deletions


@@ -95,7 +95,7 @@ class AbstractSite:
     # add this method to subclass to enable external search
     # @classmethod
     # async def search_task(
-    #     cls, query: str, page: int, category: str
+    #     cls, query: str, page: int, category: str, page_size:int
     # ) -> list[ExternalSearchResultItem]:
     #     return []
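
For context, the commented-out template above is the hook a site module implements to opt into external search; this commit adds a page_size parameter to it. A minimal hypothetical subclass (the class name and category check are illustrative, not from the codebase) would now look roughly like this:

class ExampleSite(AbstractSite):
    # Hypothetical sketch of the updated hook: a real site would call its
    # upstream API here and map each hit to an ExternalSearchResultItem.
    @classmethod
    async def search_task(
        cls, query: str, page: int, category: str, page_size: int
    ) -> list[ExternalSearchResultItem]:
        if category not in ["all", "book"]:
            return []
        results: list[ExternalSearchResultItem] = []
        # ... fetch up to page_size results for `page` from the source API ...
        return results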


@@ -1,30 +1,46 @@
 import asyncio
+import logging
+
+from django.core.cache import cache
 
 from catalog.common import SiteManager
+from catalog.common.models import ItemCategory
 from catalog.search.models import ExternalSearchResultItem
 from catalog.sites.fedi import FediverseInstance
 
-SEARCH_PAGE_SIZE = 5 # not all apis support page size
+logger = logging.getLogger(__name__)
 
 
 class ExternalSources:
     @classmethod
     def search(
-        cls, query: str, page: int = 1, category: str | None = None
+        cls,
+        query: str,
+        page: int = 1,
+        category: str | None = None,
+        visible_categories: list[ItemCategory] = [],
     ) -> list[ExternalSearchResultItem]:
-        if not query or page < 1 or page > 10:
+        if not query or page < 1 or page > 10 or not query or len(query) > 100:
             return []
         if category in ["", None]:
             category = "all"
-        tasks = FediverseInstance.search_tasks(query, page, category)
-        for site in SiteManager.get_sites_for_search():
-            tasks.append(site.search_task(query, page, category))
-        # loop = asyncio.get_event_loop()
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-        results = []
-        for r in loop.run_until_complete(asyncio.gather(*tasks)):
-            results.extend(r)
+        page_size = 5 if category == "all" else 10
+        match category:
+            case "all":
+                cache_key = f"search_{','.join(visible_categories)}_{query}"
+            case "movie":
+                cache_key = f"search_movie,tv_{query}"
+            case _:
+                cache_key = f"search_{category}_{query}"
+        results = cache.get("ext_" + cache_key, None)
+        if results is None:
+            tasks = FediverseInstance.search_tasks(query, page, category, page_size)
+            for site in SiteManager.get_sites_for_search():
+                tasks.append(site.search_task(query, page, category, page_size))
+            # loop = asyncio.get_event_loop()
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            results = []
+            for r in loop.run_until_complete(asyncio.gather(*tasks)):
+                results.extend(r)
+            cache.set("ext_" + cache_key, results, 300)
+        dedupe_urls = cache.get(cache_key, [])
+        results = [i for i in results if i.source_url not in dedupe_urls]
         return results
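
The reworked ExternalSources.search now does two related cache lookups: freshly fetched external results are cached for five minutes under the key prefixed with "ext_", while the bare key is read as a list of source URLs to suppress (presumably written elsewhere by the local index search, so items already in the catalog are not shown again). A minimal stand-alone sketch of that pattern, with a plain dict standing in for Django's cache and a hypothetical Result tuple standing in for ExternalSearchResultItem:

from typing import Callable, NamedTuple

class Result(NamedTuple):   # stand-in for ExternalSearchResultItem
    source_url: str
    title: str

fake_cache = {}             # stand-in for django.core.cache.cache

def cached_external_search(cache_key: str, fetch: Callable[[], list[Result]]) -> list[Result]:
    # 1. reuse recently fetched external results ("ext_" + key; 300 seconds in the real code)
    results = fake_cache.get("ext_" + cache_key)
    if results is None:
        results = fetch()   # the expensive asyncio fan-out across sites and peers
        fake_cache["ext_" + cache_key] = results
    # 2. drop anything whose source_url is already recorded under the bare key
    dedupe_urls = fake_cache.get(cache_key, [])
    return [r for r in results if r.source_url not in dedupe_urls]

fake_cache["search_book_dune"] = ["https://example.org/book/1"]
print(cached_external_search(
    "search_book_dune",
    lambda: [Result("https://example.org/book/1", "Dune"),
             Result("https://example.org/book/2", "Dune Messiah")],
))  # only the second result survives the dedupe filter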


@@ -108,6 +108,7 @@ def query_index(keywords, categories=None, tag=None, page=1, prepare_external=Tr
         page < 1
         or page > 99
         or (not tag and isinstance(keywords, str) and len(keywords) < 2)
+        or len(keywords) > 100
     ):
         return [], 0, 0, []
     result = Indexer.search(keywords, page=page, categories=categories, tag=tag)


@@ -3,7 +3,6 @@ import re
 import django_rq
 from django.conf import settings
 from django.contrib.auth.decorators import login_required
-from django.core.cache import cache
 from django.core.exceptions import BadRequest
 from django.shortcuts import redirect, render
 from django.utils.translation import gettext as _
@@ -155,15 +154,15 @@ def search(request):
 @login_required
 def external_search(request):
     category = request.GET.get("c", default="all").strip().lower()
-    if category == "all":
-        category = None
     keywords = request.GET.get("q", default="").strip()
     page_number = int_(request.GET.get("page"), 1)
-    items = ExternalSources.search(keywords, page_number, category) if keywords else []
-    cache_key = f"search_{category if category != 'movietv' else 'movie,tv'}_{keywords}"
-    dedupe_urls = cache.get(cache_key, [])
-    items = [i for i in items if i.source_url not in dedupe_urls]
+    items = (
+        ExternalSources.search(
+            keywords, page_number, category, visible_categories(request)
+        )
+        if keywords
+        else []
+    )
     return render(request, "external_search_results.html", {"external_items": items})


@@ -42,18 +42,17 @@ class ApplePodcast(AbstractSite):
     @classmethod
     async def search_task(
-        cls, q: str, page: int, category: str
+        cls, q: str, page: int, category: str, page_size: int
     ) -> list[ExternalSearchResultItem]:
         if category != "podcast":
             return []
-        SEARCH_PAGE_SIZE = 5 if category == "all" else 10
         results = []
-        search_url = f"https://itunes.apple.com/search?entity=podcast&limit={page * SEARCH_PAGE_SIZE}&term={quote_plus(q)}"
+        search_url = f"https://itunes.apple.com/search?entity=podcast&limit={page * page_size}&term={quote_plus(q)}"
         async with httpx.AsyncClient() as client:
             try:
                 response = await client.get(search_url, timeout=2)
                 r = response.json()
-                for p in r["results"][(page - 1) * SEARCH_PAGE_SIZE :]:
+                for p in r["results"][(page - 1) * page_size :]:
                     if p.get("feedUrl"):
                         results.append(
                             ExternalSearchResultItem(
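
The iTunes search endpoint used above only takes a limit, not an offset, so the task over-fetches page * page_size rows and then slices away the earlier pages locally. A small sketch of just the URL and slice arithmetic (the helper name is illustrative):

from urllib.parse import quote_plus

def itunes_podcast_url(q: str, page: int, page_size: int) -> str:
    # over-fetch every row up to the end of the requested page; the caller then
    # keeps r["results"][(page - 1) * page_size:], as in the diff above
    return (
        "https://itunes.apple.com/search?entity=podcast"
        f"&limit={page * page_size}&term={quote_plus(q)}"
    )

print(itunes_podcast_url("99% invisible", page=2, page_size=5))
# ...&limit=10&term=99%25+invisible -> the caller keeps rows 5-9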


@@ -109,13 +109,12 @@ class Bandcamp(AbstractSite):
     @classmethod
     async def search_task(
-        cls, q: str, page: int, category: str
+        cls, q: str, page: int, category: str, page_size: int
     ) -> list[ExternalSearchResultItem]:
         if category != "music":
             return []
-        SEARCH_PAGE_SIZE = 5
-        p = (page - 1) * SEARCH_PAGE_SIZE // 18 + 1
-        offset = (page - 1) * SEARCH_PAGE_SIZE % 18
+        p = (page - 1) * page_size // 18 + 1
+        offset = (page - 1) * page_size % 18
         results = []
         search_url = f"https://bandcamp.com/search?from=results&item_type=a&page={p}&q={urllib.parse.quote_plus(q)}"
         async with httpx.AsyncClient() as client:
@@ -147,4 +146,4 @@ class Bandcamp(AbstractSite):
                 logger.error(
                     "Bandcamp search error", extra={"query": q, "exception": e}
                 )
-        return results[offset : offset + SEARCH_PAGE_SIZE]
+        return results[offset : offset + page_size]
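
The p/offset arithmetic above (also used with a divisor of 20 by the Fediverse, Goodreads and TMDB tasks) maps the app's (page, page_size) window onto an upstream source that returns a fixed number of rows per page, 18 in Bandcamp's case. A quick check of the mapping:

UPSTREAM_LEN = 18  # Bandcamp rows per page; the 20-per-page sources work the same way

def upstream_window(page: int, page_size: int) -> tuple[int, int]:
    start = (page - 1) * page_size          # absolute index of the first wanted row
    return start // UPSTREAM_LEN + 1, start % UPSTREAM_LEN

for page in (1, 2, 3, 4, 5):
    print(page, upstream_window(page, page_size=5))
# 1 (1, 0)   2 (1, 5)   3 (1, 10)   4 (1, 15)   5 (2, 2)

Since only one upstream page is requested per call, a window that straddles a page boundary (page 4 here starts at offset 15 of an 18-row page) yields fewer than page_size items; that behaviour predates this commit and is unchanged by it.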


@@ -125,10 +125,9 @@ class FediverseInstance(AbstractSite):
         return d
 
     @classmethod
-    async def peer_search_task(cls, host, q, page, category=None):
-        SEARCH_PAGE_SIZE = 5
-        p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
-        offset = (page - 1) * SEARCH_PAGE_SIZE % 20
+    async def peer_search_task(cls, host, q, page, category=None, page_size=5):
+        p = (page - 1) * page_size // 20 + 1
+        offset = (page - 1) * page_size % 20
         api_url = f"https://{host}/api/catalog/search?query={quote_plus(q)}&page={p}{'&category=' + category if category and category != 'all' else ''}"
         async with httpx.AsyncClient() as client:
             results = []
@@ -167,12 +166,14 @@ class FediverseInstance(AbstractSite):
                         item["cover_image_url"],
                     )
                 )
-        return results[offset : offset + SEARCH_PAGE_SIZE]
+        return results[offset : offset + page_size]
 
     @classmethod
-    def search_tasks(cls, q: str, page: int = 1, category: str | None = None):
+    def search_tasks(
+        cls, q: str, page: int = 1, category: str | None = None, page_size=5
+    ):
         from takahe.utils import Takahe
 
         peers = Takahe.get_neodb_peers()
         c = category if category != "movietv" else "movie,tv"
-        return [cls.peer_search_task(host, q, page, c) for host in peers]
+        return [cls.peer_search_task(host, q, page, c, page_size) for host in peers]
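
search_tasks only builds the coroutines, one peer_search_task per known peer; ExternalSources.search then awaits them together with the per-site tasks via asyncio.gather. A self-contained sketch of that fan-out, using placeholder hosts and a fake task in place of peer_search_task:

import asyncio

async def fake_peer_task(host: str) -> list[str]:
    # stand-in for FediverseInstance.peer_search_task; just tags a result by host
    await asyncio.sleep(0)
    return [f"https://{host}/item/1"]

def run_all(hosts: list[str]) -> list[str]:
    tasks = [fake_peer_task(h) for h in hosts]   # cf. search_tasks()
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    results: list[str] = []
    for r in loop.run_until_complete(asyncio.gather(*tasks)):
        results.extend(r)
    loop.close()
    return results

print(run_all(["peer-a.example", "peer-b.example"]))
# ['https://peer-a.example/item/1', 'https://peer-b.example/item/1']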


@@ -123,13 +123,12 @@ class Goodreads(AbstractSite):
     @classmethod
     async def search_task(
-        cls, q: str, page: int, category: str
+        cls, q: str, page: int, category: str, page_size: int
     ) -> list[ExternalSearchResultItem]:
         if category not in ["all", "book"]:
             return []
-        SEARCH_PAGE_SIZE = 5
-        p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
-        offset = (page - 1) * SEARCH_PAGE_SIZE % 20
+        p = (page - 1) * page_size // 20 + 1
+        offset = (page - 1) * page_size % 20
         results = []
         search_url = f"https://www.goodreads.com/search?page={p}&q={quote_plus(q)}"
         async with httpx.AsyncClient() as client:
@@ -195,7 +194,7 @@ class Goodreads(AbstractSite):
                 logger.error(
                     "Goodreads search error", extra={"query": q, "exception": e}
                 )
-        return results[offset : offset + SEARCH_PAGE_SIZE]
+        return results[offset : offset + page_size]
 
 
 @SiteManager.register


@@ -1,4 +1,3 @@
-import logging
 import re
 from urllib.parse import quote_plus
@@ -10,8 +9,6 @@ from catalog.book.utils import isbn_10_to_13
 from catalog.common import *
 from catalog.models import *
 
-_logger = logging.getLogger(__name__)
-
 
 @SiteManager.register
 class GoogleBooks(AbstractSite):
@@ -122,13 +119,12 @@ class GoogleBooks(AbstractSite):
     @classmethod
     async def search_task(
-        cls, q: str, page: int, category: str
+        cls, q: str, page: int, category: str, page_size: int
     ) -> list[ExternalSearchResultItem]:
         if category not in ["all", "book"]:
             return []
-        SEARCH_PAGE_SIZE = 5
         results = []
-        api_url = f"https://www.googleapis.com/books/v1/volumes?country=us&q={quote_plus(q)}&startIndex={SEARCH_PAGE_SIZE * (page - 1)}&maxResults={SEARCH_PAGE_SIZE}&maxAllowedMaturityRating=MATURE"
+        api_url = f"https://www.googleapis.com/books/v1/volumes?country=us&q={quote_plus(q)}&startIndex={page_size * (page - 1)}&maxResults={page_size}&maxAllowedMaturityRating=MATURE"
         async with httpx.AsyncClient() as client:
             try:
                 response = await client.get(api_url, timeout=2)
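
Google Books, by contrast, supports real offset pagination, so page_size maps directly onto the volumes endpoint's startIndex/maxResults parameters; for example:

def google_books_paging(page: int, page_size: int) -> dict[str, int]:
    # startIndex is zero-based; maxResults caps the number of rows returned
    return {"startIndex": page_size * (page - 1), "maxResults": page_size}

print(google_books_paging(page=3, page_size=10))  # {'startIndex': 20, 'maxResults': 10}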


@@ -166,12 +166,11 @@ class IGDB(AbstractSite):
     @classmethod
     async def search_task(
-        cls, q: str, page: int, category: str
+        cls, q: str, page: int, category: str, page_size: int
     ) -> list[ExternalSearchResultItem]:
         if category != "game":
             return []
-        SEARCH_PAGE_SIZE = 5 if category == "all" else 10
-        limit = SEARCH_PAGE_SIZE
+        limit = page_size
         offset = (page - 1) * limit
         q = f'fields *, cover.url, genres.name, platforms.name, involved_companies.*, involved_companies.company.name; search "{quote_plus(q)}"; limit {limit}; offset {offset};'
         _wrapper = IGDBWrapper(settings.IGDB_CLIENT_ID, _igdb_access_token())


@@ -111,13 +111,12 @@ class Spotify(AbstractSite):
     @classmethod
     async def search_task(
-        cls, q: str, page: int, category: str
+        cls, q: str, page: int, category: str, page_size: int
     ) -> list[ExternalSearchResultItem]:
         if category not in ["music", "all"]:
             return []
-        SEARCH_PAGE_SIZE = 5
         results = []
-        api_url = f"https://api.spotify.com/v1/search?q={q}&type=album&limit={SEARCH_PAGE_SIZE}&offset={page * SEARCH_PAGE_SIZE}"
+        api_url = f"https://api.spotify.com/v1/search?q={q}&type=album&limit={page_size}&offset={page * page_size}"
         async with httpx.AsyncClient() as client:
             try:
                 headers = {"Authorization": f"Bearer {get_spotify_token()}"}


@@ -180,13 +180,12 @@ class TMDB_Movie(AbstractSite):
     @classmethod
     async def search_task(
-        cls, q: str, page: int, category: str
+        cls, q: str, page: int, category: str, page_size: int
     ) -> list[ExternalSearchResultItem]:
         if category not in ["movietv", "all", "movie", "tv"]:
             return []
-        SEARCH_PAGE_SIZE = 5 if category == "all" else 10
-        p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
-        offset = (page - 1) * SEARCH_PAGE_SIZE % 20
+        p = (page - 1) * page_size // 20 + 1
+        offset = (page - 1) * page_size % 20
         results = []
         api_url = f"https://api.themoviedb.org/3/search/multi?query={quote_plus(q)}&page={p}&api_key={settings.TMDB_API3_KEY}&language={TMDB_DEFAULT_LANG}&include_adult=true"
         async with httpx.AsyncClient() as client:
@@ -225,7 +224,7 @@ class TMDB_Movie(AbstractSite):
                     logger.warning(f"TMDB search '{q}' no results found.")
             except Exception as e:
                 logger.error("TMDb search error", extra={"query": q, "exception": e})
-        return results[offset : offset + SEARCH_PAGE_SIZE]
+        return results[offset : offset + page_size]
 
 
 @SiteManager.register