refine search; fix dedupe external results

parent 8e506ad20e
commit 0c872e844a

12 changed files with 69 additions and 62 deletions
```diff
@@ -95,7 +95,7 @@ class AbstractSite:
     # add this method to subclass to enable external search
     # @classmethod
     # async def search_task(
-    #     cls, query: str, page: int, category: str
+    #     cls, query: str, page: int, category: str, page_size: int
    # ) -> list[ExternalSearchResultItem]:
     #     return []
 
```
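For reference, a site subclass opting in to external search now implements this coroutine with the new `page_size` parameter. Below is a minimal sketch; `ExampleSite`, the API URL, and the response shape are invented, and the `ExternalSearchResultItem` argument order is assumed from its usages later in this commit. Real implementations follow below.

```python
import httpx
from urllib.parse import quote_plus

from catalog.common import *  # as the site modules below do; brings in AbstractSite etc.
from catalog.search.models import ExternalSearchResultItem


class ExampleSite(AbstractSite):
    @classmethod
    async def search_task(
        cls, query: str, page: int, category: str, page_size: int
    ) -> list[ExternalSearchResultItem]:
        if category not in ["all", "example"]:
            return []  # answer only for categories this site can serve
        results = []
        url = f"https://api.example.org/search?q={quote_plus(query)}&page={page}&per_page={page_size}"
        async with httpx.AsyncClient() as client:
            try:
                r = (await client.get(url, timeout=2)).json()
                for hit in r["hits"]:
                    results.append(
                        ExternalSearchResultItem(
                            ItemCategory.Book,  # category of this hit
                            cls.SITE_NAME,  # SITE_NAME attribute assumed, as on other sites
                            hit["url"],  # source_url, the dedupe key downstream
                            hit["title"],
                            "",
                            "",
                            hit.get("cover", ""),
                        )
                    )
            except Exception:
                pass  # external sources fail soft; a miss just contributes no rows
        return results
```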
```diff
@@ -1,30 +1,46 @@
 import asyncio
-import logging
+
+from django.core.cache import cache
 
 from catalog.common import SiteManager
+from catalog.common.models import ItemCategory
 from catalog.search.models import ExternalSearchResultItem
 from catalog.sites.fedi import FediverseInstance
 
-SEARCH_PAGE_SIZE = 5  # not all apis support page size
-logger = logging.getLogger(__name__)
 
 
 class ExternalSources:
     @classmethod
     def search(
-        cls, query: str, page: int = 1, category: str | None = None
+        cls,
+        query: str,
+        page: int = 1,
+        category: str | None = None,
+        visible_categories: list[ItemCategory] = [],
     ) -> list[ExternalSearchResultItem]:
-        if not query or page < 1 or page > 10:
+        if not query or len(query) > 100 or page < 1 or page > 10:
             return []
         if category in ["", None]:
             category = "all"
-        tasks = FediverseInstance.search_tasks(query, page, category)
-        for site in SiteManager.get_sites_for_search():
-            tasks.append(site.search_task(query, page, category))
-        # loop = asyncio.get_event_loop()
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-        results = []
-        for r in loop.run_until_complete(asyncio.gather(*tasks)):
-            results.extend(r)
+        page_size = 5 if category == "all" else 10
+        match category:
+            case "all":
+                cache_key = f"search_{','.join(visible_categories)}_{query}"
+            case "movie":
+                cache_key = f"search_movie,tv_{query}"
+            case _:
+                cache_key = f"search_{category}_{query}"
+        results = cache.get("ext_" + cache_key, None)
+        if results is None:
+            tasks = FediverseInstance.search_tasks(query, page, category, page_size)
+            for site in SiteManager.get_sites_for_search():
+                tasks.append(site.search_task(query, page, category, page_size))
+            # loop = asyncio.get_event_loop()
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            results = []
+            for r in loop.run_until_complete(asyncio.gather(*tasks)):
+                results.extend(r)
+            cache.set("ext_" + cache_key, results, 300)
+        dedupe_urls = cache.get(cache_key, [])
+        results = [i for i in results if i.source_url not in dedupe_urls]
         return results
```
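The dedupe contract implied by the last two lines: whoever renders the local index results is expected to store their source URLs under the bare `cache_key`, while the freshly fetched external results are memoized for five minutes under `"ext_" + cache_key`. A sketch of the interplay, with illustrative values; the writer of the bare key is not part of this commit:

```python
from django.core.cache import cache

# the local-search side records what it already displayed (assumed behavior)
cache_key = "search_book_dune"  # category "book", query "dune"
cache.set(cache_key, ["https://www.goodreads.com/book/show/234225"], 300)

# ExternalSources.search then reuses its own memo and filters against that list
external = cache.get("ext_" + cache_key, None)
if external is not None:
    dedupe_urls = cache.get(cache_key, [])
    external = [i for i in external if i.source_url not in dedupe_urls]
```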
```diff
@@ -108,6 +108,7 @@ def query_index(keywords, categories=None, tag=None, page=1, prepare_external=Tr
         page < 1
         or page > 99
         or (not tag and isinstance(keywords, str) and len(keywords) < 2)
+        or len(keywords) > 100
     ):
         return [], 0, 0, []
     result = Indexer.search(keywords, page=page, categories=categories, tag=tag)
```
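`keywords` may be a plain string or a list of terms, which is why the existing short-query check guards with `isinstance`; `len()` works for both shapes, so the new upper bound rejects over-long strings and oversized term lists alike. A quick illustration with made-up inputs:

```python
# illustrative values only: the same len() test covers both input shapes
assert len("x" * 101) > 100  # str: more than 100 characters is rejected
assert not len(["dune", "arrakis"]) > 100  # list: counts terms, not characters
```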
```diff
@@ -3,7 +3,6 @@ import re
 import django_rq
 from django.conf import settings
 from django.contrib.auth.decorators import login_required
-from django.core.cache import cache
 from django.core.exceptions import BadRequest
 from django.shortcuts import redirect, render
 from django.utils.translation import gettext as _
@@ -155,15 +154,15 @@ def search(request):
 @login_required
 def external_search(request):
     category = request.GET.get("c", default="all").strip().lower()
-    if category == "all":
-        category = None
     keywords = request.GET.get("q", default="").strip()
     page_number = int_(request.GET.get("page"), 1)
-    items = ExternalSources.search(keywords, page_number, category) if keywords else []
-    cache_key = f"search_{category if category != 'movietv' else 'movie,tv'}_{keywords}"
-    dedupe_urls = cache.get(cache_key, [])
-    items = [i for i in items if i.source_url not in dedupe_urls]
+    items = (
+        ExternalSources.search(
+            keywords, page_number, category, visible_categories(request)
+        )
+        if keywords
+        else []
+    )
     return render(request, "external_search_results.html", {"external_items": items})
 
 
```
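`visible_categories(request)` is called here but not defined or imported in this diff; presumably it returns the list of `ItemCategory` values visible to the current user or site, which `ExternalSources.search` joins into the `all` cache key. A hypothetical stand-in, purely to show the expected shape:

```python
# hypothetical sketch of the helper's contract; the real implementation
# lives elsewhere in the codebase and may differ
from catalog.common.models import ItemCategory


def visible_categories(request) -> list[ItemCategory]:
    hidden = getattr(request.user, "hidden_categories", [])  # assumed attribute
    return [c for c in ItemCategory if c not in hidden]
```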
```diff
@@ -42,18 +42,17 @@ class ApplePodcast(AbstractSite):
 
     @classmethod
     async def search_task(
-        cls, q: str, page: int, category: str
+        cls, q: str, page: int, category: str, page_size: int
     ) -> list[ExternalSearchResultItem]:
         if category != "podcast":
             return []
-        SEARCH_PAGE_SIZE = 5 if category == "all" else 10
         results = []
-        search_url = f"https://itunes.apple.com/search?entity=podcast&limit={page * SEARCH_PAGE_SIZE}&term={quote_plus(q)}"
+        search_url = f"https://itunes.apple.com/search?entity=podcast&limit={page * page_size}&term={quote_plus(q)}"
         async with httpx.AsyncClient() as client:
             try:
                 response = await client.get(search_url, timeout=2)
                 r = response.json()
-                for p in r["results"][(page - 1) * SEARCH_PAGE_SIZE :]:
+                for p in r["results"][(page - 1) * page_size :]:
                     if p.get("feedUrl"):
                         results.append(
                             ExternalSearchResultItem(
```
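The iTunes Search API accepts a `limit` but no offset, so the code asks for everything up to the end of the requested page (`page * page_size` items) and slices off the pages already shown. A worked trace with made-up numbers:

```python
# offset-free pagination: over-fetch up to the requested page, then slice
page, page_size = 3, 10
fetched = list(range(page * page_size))  # stands in for r["results"], 30 hits
current = fetched[(page - 1) * page_size :]  # items 20..29, i.e. page 3 only
assert current == list(range(20, 30))
```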
```diff
@@ -109,13 +109,12 @@ class Bandcamp(AbstractSite):
 
     @classmethod
     async def search_task(
-        cls, q: str, page: int, category: str
+        cls, q: str, page: int, category: str, page_size: int
     ) -> list[ExternalSearchResultItem]:
         if category != "music":
             return []
-        SEARCH_PAGE_SIZE = 5
-        p = (page - 1) * SEARCH_PAGE_SIZE // 18 + 1
-        offset = (page - 1) * SEARCH_PAGE_SIZE % 18
+        p = (page - 1) * page_size // 18 + 1
+        offset = (page - 1) * page_size % 18
         results = []
         search_url = f"https://bandcamp.com/search?from=results&item_type=a&page={p}&q={urllib.parse.quote_plus(q)}"
         async with httpx.AsyncClient() as client:
@@ -147,4 +146,4 @@ class Bandcamp(AbstractSite):
                 logger.error(
                     "Bandcamp search error", extra={"query": q, "exception": e}
                 )
-        return results[offset : offset + SEARCH_PAGE_SIZE]
+        return results[offset : offset + page_size]
```
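Bandcamp serves fixed pages of 18 results, so a local window of `page_size` items is remapped onto them: `p` is the upstream page containing the window's first item and `offset` is that item's position within the page. One caveat falls out of the math: a window that straddles two upstream pages gets truncated, since only page `p` is fetched. A worked trace:

```python
# remapping local (page, page_size) windows onto an 18-per-page upstream feed
UPSTREAM = 18


def remap(page: int, page_size: int) -> tuple[int, int]:
    start = (page - 1) * page_size  # absolute index of the first wanted item
    return start // UPSTREAM + 1, start % UPSTREAM


assert remap(1, 5) == (1, 0)  # items 0..4   -> page 1, offset 0
assert remap(4, 5) == (1, 15)  # items 15..19 -> page 1, offset 15; only 3 of 5 fit
assert remap(5, 5) == (2, 2)  # items 20..24 -> page 2, offset 2
```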
```diff
@@ -125,10 +125,9 @@ class FediverseInstance(AbstractSite):
         return d
 
     @classmethod
-    async def peer_search_task(cls, host, q, page, category=None):
-        SEARCH_PAGE_SIZE = 5
-        p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
-        offset = (page - 1) * SEARCH_PAGE_SIZE % 20
+    async def peer_search_task(cls, host, q, page, category=None, page_size=5):
+        p = (page - 1) * page_size // 20 + 1
+        offset = (page - 1) * page_size % 20
         api_url = f"https://{host}/api/catalog/search?query={quote_plus(q)}&page={p}{'&category=' + category if category and category != 'all' else ''}"
         async with httpx.AsyncClient() as client:
             results = []
@@ -167,12 +166,14 @@ class FediverseInstance(AbstractSite):
                             item["cover_image_url"],
                         )
                     )
-        return results[offset : offset + SEARCH_PAGE_SIZE]
+        return results[offset : offset + page_size]
 
     @classmethod
-    def search_tasks(cls, q: str, page: int = 1, category: str | None = None):
+    def search_tasks(
+        cls, q: str, page: int = 1, category: str | None = None, page_size=5
+    ):
         from takahe.utils import Takahe
 
         peers = Takahe.get_neodb_peers()
         c = category if category != "movietv" else "movie,tv"
-        return [cls.peer_search_task(host, q, page, c) for host in peers]
+        return [cls.peer_search_task(host, q, page, c, page_size) for host in peers]
```
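`search_tasks` returns unawaited coroutines, one per known NeoDB peer; `ExternalSources.search` mixes them with the per-site tasks and drains everything in a single `asyncio.gather`. A minimal sketch of consuming the peer tasks on their own, with an illustrative query:

```python
import asyncio

from catalog.sites.fedi import FediverseInstance


async def peers_only(q: str) -> list:
    tasks = FediverseInstance.search_tasks(q, page=1, category="book", page_size=10)
    batches = await asyncio.gather(*tasks)  # one result list per peer
    return [item for batch in batches for item in batch]


# results = asyncio.run(peers_only("dune"))
```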
```diff
@@ -123,13 +123,12 @@ class Goodreads(AbstractSite):
 
     @classmethod
     async def search_task(
-        cls, q: str, page: int, category: str
+        cls, q: str, page: int, category: str, page_size: int
     ) -> list[ExternalSearchResultItem]:
         if category not in ["all", "book"]:
             return []
-        SEARCH_PAGE_SIZE = 5
-        p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
-        offset = (page - 1) * SEARCH_PAGE_SIZE % 20
+        p = (page - 1) * page_size // 20 + 1
+        offset = (page - 1) * page_size % 20
         results = []
         search_url = f"https://www.goodreads.com/search?page={p}&q={quote_plus(q)}"
         async with httpx.AsyncClient() as client:
@@ -195,7 +194,7 @@ class Goodreads(AbstractSite):
                 logger.error(
                     "Goodreads search error", extra={"query": q, "exception": e}
                 )
-        return results[offset : offset + SEARCH_PAGE_SIZE]
+        return results[offset : offset + page_size]
 
 
 @SiteManager.register
```
```diff
@@ -1,4 +1,3 @@
-import logging
 import re
 from urllib.parse import quote_plus
 
@@ -10,8 +9,6 @@ from catalog.book.utils import isbn_10_to_13
 from catalog.common import *
 from catalog.models import *
 
-_logger = logging.getLogger(__name__)
-
 
 @SiteManager.register
 class GoogleBooks(AbstractSite):
@@ -122,13 +119,12 @@ class GoogleBooks(AbstractSite):
 
     @classmethod
     async def search_task(
-        cls, q: str, page: int, category: str
+        cls, q: str, page: int, category: str, page_size: int
     ) -> list[ExternalSearchResultItem]:
         if category not in ["all", "book"]:
             return []
-        SEARCH_PAGE_SIZE = 5
         results = []
-        api_url = f"https://www.googleapis.com/books/v1/volumes?country=us&q={quote_plus(q)}&startIndex={SEARCH_PAGE_SIZE * (page - 1)}&maxResults={SEARCH_PAGE_SIZE}&maxAllowedMaturityRating=MATURE"
+        api_url = f"https://www.googleapis.com/books/v1/volumes?country=us&q={quote_plus(q)}&startIndex={page_size * (page - 1)}&maxResults={page_size}&maxAllowedMaturityRating=MATURE"
         async with httpx.AsyncClient() as client:
             try:
                 response = await client.get(api_url, timeout=2)
```
```diff
@@ -166,12 +166,11 @@ class IGDB(AbstractSite):
 
     @classmethod
     async def search_task(
-        cls, q: str, page: int, category: str
+        cls, q: str, page: int, category: str, page_size: int
     ) -> list[ExternalSearchResultItem]:
         if category != "game":
             return []
-        SEARCH_PAGE_SIZE = 5 if category == "all" else 10
-        limit = SEARCH_PAGE_SIZE
+        limit = page_size
         offset = (page - 1) * limit
         q = f'fields *, cover.url, genres.name, platforms.name, involved_companies.*, involved_companies.company.name; search "{quote_plus(q)}"; limit {limit}; offset {offset};'
         _wrapper = IGDBWrapper(settings.IGDB_CLIENT_ID, _igdb_access_token())
```
```diff
@@ -111,13 +111,12 @@ class Spotify(AbstractSite):
 
     @classmethod
     async def search_task(
-        cls, q: str, page: int, category: str
+        cls, q: str, page: int, category: str, page_size: int
     ) -> list[ExternalSearchResultItem]:
         if category not in ["music", "all"]:
             return []
-        SEARCH_PAGE_SIZE = 5
         results = []
-        api_url = f"https://api.spotify.com/v1/search?q={q}&type=album&limit={SEARCH_PAGE_SIZE}&offset={page * SEARCH_PAGE_SIZE}"
+        api_url = f"https://api.spotify.com/v1/search?q={q}&type=album&limit={page_size}&offset={page * page_size}"
         async with httpx.AsyncClient() as client:
             try:
                 headers = {"Authorization": f"Bearer {get_spotify_token()}"}
```
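One detail worth tracing in the Spotify URL: the offset is `page * page_size` rather than the usual `(page - 1) * page_size`, so page 1 starts at item `page_size` and items 0 through `page_size - 1` are never requested. Whether that skip is intentional is not clear from this commit; the arithmetic:

```python
# offsets produced by the URL above vs. the zero-based convention
page_size = 10
for page in (1, 2, 3):
    print(page, page * page_size, (page - 1) * page_size)
# 1 10 0   <- page 1 begins at item 10
# 2 20 10
# 3 30 20
```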
```diff
@@ -180,13 +180,12 @@ class TMDB_Movie(AbstractSite):
 
     @classmethod
     async def search_task(
-        cls, q: str, page: int, category: str
+        cls, q: str, page: int, category: str, page_size: int
     ) -> list[ExternalSearchResultItem]:
         if category not in ["movietv", "all", "movie", "tv"]:
             return []
-        SEARCH_PAGE_SIZE = 5 if category == "all" else 10
-        p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
-        offset = (page - 1) * SEARCH_PAGE_SIZE % 20
+        p = (page - 1) * page_size // 20 + 1
+        offset = (page - 1) * page_size % 20
         results = []
         api_url = f"https://api.themoviedb.org/3/search/multi?query={quote_plus(q)}&page={p}&api_key={settings.TMDB_API3_KEY}&language={TMDB_DEFAULT_LANG}&include_adult=true"
         async with httpx.AsyncClient() as client:
@@ -225,7 +224,7 @@ class TMDB_Movie(AbstractSite):
                 logger.warning(f"TMDB search '{q}' no results found.")
             except Exception as e:
                 logger.error("TMDb search error", extra={"query": q, "exception": e})
-        return results[offset : offset + SEARCH_PAGE_SIZE]
+        return results[offset : offset + page_size]
 
 
 @SiteManager.register
```