refine search; fix dedupe external results

Your Name 2025-01-20 07:03:56 -05:00 committed by Henri Dickson
parent 8e506ad20e
commit 0c872e844a
12 changed files with 69 additions and 62 deletions

View file

@@ -95,7 +95,7 @@ class AbstractSite:
# add this method to subclass to enable external search
# @classmethod
# async def search_task(
# cls, query: str, page: int, category: str
# cls, query: str, page: int, category: str, page_size: int
# ) -> list[ExternalSearchResultItem]:
# return []
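As a rough illustration of the commented-out template above, a subclass opting into external search now also receives page_size. The class below is hypothetical, the import paths are assumed from the surrounding hunks, and the result construction is elided because ExternalSearchResultItem's fields are not shown in this hunk.

# Hedged sketch only: demonstrates the updated search_task signature,
# not project code. The category value and fetch logic are placeholders.
from catalog.common import *  # import style used by the site modules in this commit
from catalog.search.models import ExternalSearchResultItem


@SiteManager.register
class ExampleSite(AbstractSite):
    @classmethod
    async def search_task(
        cls, query: str, page: int, category: str, page_size: int
    ) -> list[ExternalSearchResultItem]:
        if category not in ["all", "example"]:
            return []
        results: list[ExternalSearchResultItem] = []
        # page and page_size translate directly into the upstream request window
        offset = (page - 1) * page_size
        # ... fetch up to page_size hits starting at `offset` and append
        # ExternalSearchResultItem instances to `results` ...
        return results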

View file

@@ -1,30 +1,46 @@
import asyncio
import logging
from django.core.cache import cache
from catalog.common import SiteManager
from catalog.common.models import ItemCategory
from catalog.search.models import ExternalSearchResultItem
from catalog.sites.fedi import FediverseInstance
SEARCH_PAGE_SIZE = 5 # not all apis support page size
logger = logging.getLogger(__name__)
class ExternalSources:
@classmethod
def search(
cls, query: str, page: int = 1, category: str | None = None
cls,
query: str,
page: int = 1,
category: str | None = None,
visible_categories: list[ItemCategory] = [],
) -> list[ExternalSearchResultItem]:
if not query or page < 1 or page > 10:
if not query or len(query) > 100 or page < 1 or page > 10:
return []
if category in ["", None]:
category = "all"
tasks = FediverseInstance.search_tasks(query, page, category)
for site in SiteManager.get_sites_for_search():
tasks.append(site.search_task(query, page, category))
# loop = asyncio.get_event_loop()
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
results = []
for r in loop.run_until_complete(asyncio.gather(*tasks)):
results.extend(r)
page_size = 5 if category == "all" else 10
match category:
case "all":
cache_key = f"search_{','.join(visible_categories)}_{query}"
case "movie":
cache_key = f"search_movie,tv_{query}"
case _:
cache_key = f"search_{category}_{query}"
results = cache.get("ext_" + cache_key, None)
if results is None:
tasks = FediverseInstance.search_tasks(query, page, category, page_size)
for site in SiteManager.get_sites_for_search():
tasks.append(site.search_task(query, page, category, page_size))
# loop = asyncio.get_event_loop()
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
results = []
for r in loop.run_until_complete(asyncio.gather(*tasks)):
results.extend(r)
cache.set("ext_" + cache_key, results, 300)
dedupe_urls = cache.get(cache_key, [])
results = [i for i in results if i.source_url not in dedupe_urls]
return results
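In short, search() now caches the raw external results for five minutes under an "ext_"-prefixed key, and separately reads a list of already-shown source URLs under the unprefixed key so that duplicates of local results are dropped. A hedged usage sketch follows; the module path and the URL values are assumptions, not part of this diff.

# Hedged usage sketch of the two cache keys involved above.
from django.core.cache import cache
from catalog.search.external import ExternalSources  # module path assumed

# Simulate the local search having already shown one of the URLs.
cache.set("search_movie,tv_dune", ["https://example.org/movie/1"], 300)

items = ExternalSources.search("dune", page=1, category="movie")
# Raw external hits are cached under "ext_search_movie,tv_dune" for 300s;
# any hit whose source_url appears in the dedupe list above is filtered out.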

View file

@@ -108,6 +108,7 @@ def query_index(keywords, categories=None, tag=None, page=1, prepare_external=Tr
page < 1
or page > 99
or (not tag and isinstance(keywords, str) and len(keywords) < 2)
or len(keywords) > 100
):
return [], 0, 0, []
result = Indexer.search(keywords, page=page, categories=categories, tag=tag)

View file

@@ -3,7 +3,6 @@ import re
import django_rq
from django.conf import settings
from django.contrib.auth.decorators import login_required
from django.core.cache import cache
from django.core.exceptions import BadRequest
from django.shortcuts import redirect, render
from django.utils.translation import gettext as _
@@ -155,15 +154,15 @@ def search(request):
@login_required
def external_search(request):
category = request.GET.get("c", default="all").strip().lower()
if category == "all":
category = None
keywords = request.GET.get("q", default="").strip()
page_number = int_(request.GET.get("page"), 1)
items = ExternalSources.search(keywords, page_number, category) if keywords else []
cache_key = f"search_{category if category != 'movietv' else 'movie,tv'}_{keywords}"
dedupe_urls = cache.get(cache_key, [])
items = [i for i in items if i.source_url not in dedupe_urls]
items = (
ExternalSources.search(
keywords, page_number, category, visible_categories(request)
)
if keywords
else []
)
return render(request, "external_search_results.html", {"external_items": items})

View file

@@ -42,18 +42,17 @@ class ApplePodcast(AbstractSite):
@classmethod
async def search_task(
cls, q: str, page: int, category: str
cls, q: str, page: int, category: str, page_size: int
) -> list[ExternalSearchResultItem]:
if category != "podcast":
return []
SEARCH_PAGE_SIZE = 5 if category == "all" else 10
results = []
search_url = f"https://itunes.apple.com/search?entity=podcast&limit={page * SEARCH_PAGE_SIZE}&term={quote_plus(q)}"
search_url = f"https://itunes.apple.com/search?entity=podcast&limit={page * page_size}&term={quote_plus(q)}"
async with httpx.AsyncClient() as client:
try:
response = await client.get(search_url, timeout=2)
r = response.json()
for p in r["results"][(page - 1) * SEARCH_PAGE_SIZE :]:
for p in r["results"][(page - 1) * page_size :]:
if p.get("feedUrl"):
results.append(
ExternalSearchResultItem(

View file

@@ -109,13 +109,12 @@ class Bandcamp(AbstractSite):
@classmethod
async def search_task(
cls, q: str, page: int, category: str
cls, q: str, page: int, category: str, page_size: int
) -> list[ExternalSearchResultItem]:
if category != "music":
return []
SEARCH_PAGE_SIZE = 5
p = (page - 1) * SEARCH_PAGE_SIZE // 18 + 1
offset = (page - 1) * SEARCH_PAGE_SIZE % 18
p = (page - 1) * page_size // 18 + 1
offset = (page - 1) * page_size % 18
results = []
search_url = f"https://bandcamp.com/search?from=results&item_type=a&page={p}&q={urllib.parse.quote_plus(q)}"
async with httpx.AsyncClient() as client:
@@ -147,4 +146,4 @@ class Bandcamp(AbstractSite):
logger.error(
"Bandcamp search error", extra={"query": q, "exception": e}
)
return results[offset : offset + SEARCH_PAGE_SIZE]
return results[offset : offset + page_size]
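The p/offset arithmetic above maps the caller's page and page_size onto the provider's fixed page length (18 for Bandcamp; the Fediverse, Goodreads and TMDB tasks below use the same pattern with 20). A quick self-contained worked check:

# Worked check of the paging math above; Bandcamp serves 18 results per page.
def window(page: int, page_size: int, upstream_size: int = 18) -> tuple[int, int]:
    p = (page - 1) * page_size // upstream_size + 1   # upstream page to fetch
    offset = (page - 1) * page_size % upstream_size   # slice start within that page
    return p, offset

assert window(1, 5) == (1, 0)   # return results[0:5] of upstream page 1
assert window(4, 5) == (1, 15)  # results[15:20]; only 3 items exist on an 18-item page
assert window(5, 5) == (2, 2)   # page 5 starts inside upstream page 2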

View file

@@ -125,10 +125,9 @@ class FediverseInstance(AbstractSite):
return d
@classmethod
async def peer_search_task(cls, host, q, page, category=None):
SEARCH_PAGE_SIZE = 5
p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
offset = (page - 1) * SEARCH_PAGE_SIZE % 20
async def peer_search_task(cls, host, q, page, category=None, page_size=5):
p = (page - 1) * page_size // 20 + 1
offset = (page - 1) * page_size % 20
api_url = f"https://{host}/api/catalog/search?query={quote_plus(q)}&page={p}{'&category=' + category if category and category != 'all' else ''}"
async with httpx.AsyncClient() as client:
results = []
@@ -167,12 +166,14 @@ class FediverseInstance(AbstractSite):
item["cover_image_url"],
)
)
return results[offset : offset + SEARCH_PAGE_SIZE]
return results[offset : offset + page_size]
@classmethod
def search_tasks(cls, q: str, page: int = 1, category: str | None = None):
def search_tasks(
cls, q: str, page: int = 1, category: str | None = None, page_size=5
):
from takahe.utils import Takahe
peers = Takahe.get_neodb_peers()
c = category if category != "movietv" else "movie,tv"
return [cls.peer_search_task(host, q, page, c) for host in peers]
return [cls.peer_search_task(host, q, page, c, page_size) for host in peers]

View file

@@ -123,13 +123,12 @@ class Goodreads(AbstractSite):
@classmethod
async def search_task(
cls, q: str, page: int, category: str
cls, q: str, page: int, category: str, page_size: int
) -> list[ExternalSearchResultItem]:
if category not in ["all", "book"]:
return []
SEARCH_PAGE_SIZE = 5
p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
offset = (page - 1) * SEARCH_PAGE_SIZE % 20
p = (page - 1) * page_size // 20 + 1
offset = (page - 1) * page_size % 20
results = []
search_url = f"https://www.goodreads.com/search?page={p}&q={quote_plus(q)}"
async with httpx.AsyncClient() as client:
@@ -195,7 +194,7 @@ class Goodreads(AbstractSite):
logger.error(
"Goodreads search error", extra={"query": q, "exception": e}
)
return results[offset : offset + SEARCH_PAGE_SIZE]
return results[offset : offset + page_size]
@SiteManager.register

View file

@@ -1,4 +1,3 @@
import logging
import re
from urllib.parse import quote_plus
@@ -10,8 +9,6 @@ from catalog.book.utils import isbn_10_to_13
from catalog.common import *
from catalog.models import *
_logger = logging.getLogger(__name__)
@SiteManager.register
class GoogleBooks(AbstractSite):
@@ -122,13 +119,12 @@ class GoogleBooks(AbstractSite):
@classmethod
async def search_task(
cls, q: str, page: int, category: str
cls, q: str, page: int, category: str, page_size: int
) -> list[ExternalSearchResultItem]:
if category not in ["all", "book"]:
return []
SEARCH_PAGE_SIZE = 5
results = []
api_url = f"https://www.googleapis.com/books/v1/volumes?country=us&q={quote_plus(q)}&startIndex={SEARCH_PAGE_SIZE * (page - 1)}&maxResults={SEARCH_PAGE_SIZE}&maxAllowedMaturityRating=MATURE"
api_url = f"https://www.googleapis.com/books/v1/volumes?country=us&q={quote_plus(q)}&startIndex={page_size * (page - 1)}&maxResults={page_size}&maxAllowedMaturityRating=MATURE"
async with httpx.AsyncClient() as client:
try:
response = await client.get(api_url, timeout=2)

View file

@@ -166,12 +166,11 @@ class IGDB(AbstractSite):
@classmethod
async def search_task(
cls, q: str, page: int, category: str
cls, q: str, page: int, category: str, page_size: int
) -> list[ExternalSearchResultItem]:
if category != "game":
return []
SEARCH_PAGE_SIZE = 5 if category == "all" else 10
limit = SEARCH_PAGE_SIZE
limit = page_size
offset = (page - 1) * limit
q = f'fields *, cover.url, genres.name, platforms.name, involved_companies.*, involved_companies.company.name; search "{quote_plus(q)}"; limit {limit}; offset {offset};'
_wrapper = IGDBWrapper(settings.IGDB_CLIENT_ID, _igdb_access_token())

View file

@@ -111,13 +111,12 @@ class Spotify(AbstractSite):
@classmethod
async def search_task(
cls, q: str, page: int, category: str
cls, q: str, page: int, category: str, page_size: int
) -> list[ExternalSearchResultItem]:
if category not in ["music", "all"]:
return []
SEARCH_PAGE_SIZE = 5
results = []
api_url = f"https://api.spotify.com/v1/search?q={q}&type=album&limit={SEARCH_PAGE_SIZE}&offset={page * SEARCH_PAGE_SIZE}"
api_url = f"https://api.spotify.com/v1/search?q={q}&type=album&limit={page_size}&offset={page * page_size}"
async with httpx.AsyncClient() as client:
try:
headers = {"Authorization": f"Bearer {get_spotify_token()}"}

View file

@@ -180,13 +180,12 @@ class TMDB_Movie(AbstractSite):
@classmethod
async def search_task(
cls, q: str, page: int, category: str
cls, q: str, page: int, category: str, page_size: int
) -> list[ExternalSearchResultItem]:
if category not in ["movietv", "all", "movie", "tv"]:
return []
SEARCH_PAGE_SIZE = 5 if category == "all" else 10
p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
offset = (page - 1) * SEARCH_PAGE_SIZE % 20
p = (page - 1) * page_size // 20 + 1
offset = (page - 1) * page_size % 20
results = []
api_url = f"https://api.themoviedb.org/3/search/multi?query={quote_plus(q)}&page={p}&api_key={settings.TMDB_API3_KEY}&language={TMDB_DEFAULT_LANG}&include_adult=true"
async with httpx.AsyncClient() as client:
@@ -225,7 +224,7 @@ class TMDB_Movie(AbstractSite):
logger.warning(f"TMDB search '{q}' no results found.")
except Exception as e:
logger.error("TMDb search error", extra={"query": q, "exception": e})
return results[offset : offset + SEARCH_PAGE_SIZE]
return results[offset : offset + page_size]
@SiteManager.register