make external search async and configurable

Your Name 2025-01-18 15:53:06 -05:00 committed by Henri Dickson
parent 2826bc60dc
commit 90386bbf1a
20 changed files with 523 additions and 414 deletions


@@ -92,6 +92,8 @@ env = environ.FileAwareEnv(
    NEODB_DISCOVER_UPDATE_INTERVAL=(int, 60),
    # Disable cron jobs, * for all
    NEODB_DISABLE_CRON_JOBS=(list, []),
    # search sites
    NEODB_SEARCH_SITES=(list, []),
    # federated search peers
    NEODB_SEARCH_PEERS=(list, []),
    # INTEGRATED TAKAHE CONFIGURATION
@@ -282,6 +284,7 @@ DOWNLOADER_RETRIES = env("NEODB_DOWNLOADER_RETRIES")
DISABLE_CRON_JOBS = env("NEODB_DISABLE_CRON_JOBS")
SEARCH_PEERS = env("NEODB_SEARCH_PEERS")
SEARCH_SITES = env("NEODB_SEARCH_SITES")
FANOUT_LIMIT_DAYS = env("NEODB_FANOUT_LIMIT_DAYS")
# ====== USER CONFIGUTRATION END ======
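Because the new variable is declared with a `(list, [])` cast, django-environ splits a comma-separated value into a Python list. A minimal sketch of the expected parsing, using the base `environ.Env` (the settings module uses `FileAwareEnv`, a subclass) and an illustrative value:

```python
import os

import environ

# Illustrative value; any comma-separated list of site names is parsed the same way.
os.environ["NEODB_SEARCH_SITES"] = "goodreads,tmdb,spotify"

env = environ.Env(NEODB_SEARCH_SITES=(list, []))
print(env("NEODB_SEARCH_SITES"))  # -> ['goodreads', 'tmdb', 'spotify']
```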


@@ -43,7 +43,7 @@ class SiteName(models.TextChoices):
    Steam = "steam", _("Steam")  # type:ignore[reportCallIssue]
    Bangumi = "bangumi", _("Bangumi")  # type:ignore[reportCallIssue]
    BGG = "bgg", _("BGG")  # type:ignore[reportCallIssue]
    # ApplePodcast = "apple_podcast", _("Apple Podcast")  # type:ignore[reportCallIssue]
    ApplePodcast = "apple_podcast", _("Apple Podcast")  # type:ignore[reportCallIssue]
    RSS = "rss", _("RSS")  # type:ignore[reportCallIssue]
    Discogs = "discogs", _("Discogs")  # type:ignore[reportCallIssue]
    AppleMusic = "apple_music", _("Apple Music")  # type:ignore[reportCallIssue]


@@ -14,6 +14,7 @@ from typing import Type, TypeVar
import django_rq
import requests
from django.conf import settings
from loguru import logger
from validators import url as url_validate
@@ -91,6 +92,13 @@ class AbstractSite:
            )
        return self.resource
    # add this method to subclass to enable external search
    # @classmethod
    # async def search_task(
    #     cls, query: str, page: int, category: str
    # ) -> list[ExternalSearchResultItem]:
    #     return []

    def scrape(self) -> ResourceContent:
        """subclass should implement this, return ResourceContent object"""
        data = ResourceContent()
@@ -340,6 +348,17 @@ class SiteManager:
    def get_all_sites():
        return SiteManager.registry.values()

    @staticmethod
    def get_sites_for_search():
        if settings.SEARCH_SITES == ["-"]:
            return []
        sites = [
            cls for cls in SiteManager.get_all_sites() if hasattr(cls, "search_task")
        ]
        if settings.SEARCH_SITES == ["*"] or not settings.SEARCH_SITES:
            return sites
        return [s for s in sites if s.SITE_NAME.value in settings.SEARCH_SITES]


def crawl_related_resources_task(resource_pk):
    resource = ExternalResource.objects.filter(pk=resource_pk).first()
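`get_sites_for_search` only picks up registered sites that define an async `search_task` classmethod, then filters them by `SITE_NAME` value against `settings.SEARCH_SITES`. A minimal sketch of what a site module needs in order to opt in (the site name and logic here are hypothetical, not part of this commit):

```python
from catalog.common import AbstractSite
from catalog.models import ExternalSearchResultItem


class ExampleSite(AbstractSite):  # a real site is also decorated with @SiteManager.register
    @classmethod
    async def search_task(
        cls, query: str, page: int, category: str
    ) -> list[ExternalSearchResultItem]:
        if category not in ("all", "book"):
            return []
        # Query the remote API with httpx.AsyncClient here, then map each hit to
        # ExternalSearchResultItem(category, site_name, url, title, subtitle, brief, cover_url).
        return []
```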


@@ -1,16 +1,28 @@
import time

from django.contrib.contenttypes.models import ContentType
from django.core.management.base import BaseCommand
from django.db.models import Count, F
from tqdm import tqdm

from catalog.common.sites import SiteManager
from catalog.models import Edition, Item, Podcast, TVSeason, TVShow
from catalog.search.external import ExternalSources
from common.models import detect_language, uniq
from takahe.utils import Takahe


class Command(BaseCommand):
    help = "catalog app utilities"

    def add_arguments(self, parser):
        parser.add_argument(
            "--extsearch",
        )
        parser.add_argument(
            "--category",
            default="all",
        )
        parser.add_argument(
            "--verbose",
            action="store_true",
@@ -44,8 +56,26 @@ class Command(BaseCommand):
            self.integrity()
        if options["localize"]:
            self.localize()
        if options["extsearch"]:
            self.external_search(options["extsearch"], options["category"])
        self.stdout.write(self.style.SUCCESS("Done."))

    def external_search(self, q, cat):
        sites = SiteManager.get_sites_for_search()
        peers = Takahe.get_neodb_peers()
        self.stdout.write(f"Searching {cat} '{q}' ...")
        self.stdout.write(f"Peers: {peers}")
        self.stdout.write(f"Sites: {sites}")
        start_time = time.time()
        results = ExternalSources.search(q, 1, cat)
        for r in results:
            self.stdout.write(f"{r}")
        self.stdout.write(
            self.style.SUCCESS(
                f"{time.time() - start_time} seconds, {len(results)} items."
            )
        )

    def localize(self):
        c = Item.objects.all().count()
        qs = Item.objects.filter(is_deleted=False, merged_to_item__isnull=True)
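The new flags make the whole async search path easy to exercise from the shell. A usage sketch, assuming the management command is exposed as `catalog` and using an illustrative query:

```python
# Equivalent to: python manage.py catalog --extsearch "the dispossessed" --category book
from django.core.management import call_command

call_command("catalog", extsearch="the dispossessed", category="book")
```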


@@ -38,7 +38,7 @@ from .tv.models import (
    TVShowSchema,
)

from .search.models import Indexer  # isort:skip
from .search.models import Indexer, ExternalSearchResultItem  # isort:skip

# class Exhibition(Item):
@@ -103,6 +103,7 @@ __all__ = [
    "CatalogCollection",
    "AvailableItemCategory",
    "ExternalResource",
    "ExternalSearchResultItem",
    "IdType",
    "Item",
    "ItemCategory",


@@ -1,342 +1,26 @@
The rewritten catalog/search/external.py keeps only a thin dispatcher:

import asyncio
import logging

from catalog.common import SiteManager
from catalog.search.models import ExternalSearchResultItem
from catalog.sites.fedi import FediverseInstance

SEARCH_PAGE_SIZE = 5  # not all apis support page size

logger = logging.getLogger(__name__)


class ExternalSources:
    @classmethod
    def search(
        cls, query: str, page: int = 1, category: str | None = None
    ) -> list[ExternalSearchResultItem]:
        if not query or page < 1 or page > 10:
            return []
        if category in ["", None]:
            category = "all"
        tasks = FediverseInstance.search_tasks(query, page, category)
        for site in SiteManager.get_sites_for_search():
            tasks.append(site.search_task(query, page, category))

The per-site scrapers previously defined in this module are removed; the old code follows:

from urllib.parse import quote_plus, urlparse
import httpx
import requests
from django.conf import settings
from lxml import html
from catalog.common import BasicDownloader, ItemCategory, SiteManager, SiteName
from catalog.sites.igdb import IGDB as IGDB_Site
from catalog.sites.spotify import get_spotify_token
from catalog.sites.tmdb import TMDB_DEFAULT_LANG
class Goodreads:
@classmethod
def search(cls, q: str, page=1):
results = []
search_url = f"https://www.goodreads.com/search?page={page}&q={quote_plus(q)}"
try:
r = requests.get(
search_url,
timeout=3,
headers={
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": BasicDownloader.get_accept_language(),
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"DNT": "1",
"Upgrade-Insecure-Requests": "1",
"Cache-Control": "no-cache",
},
)
if r.url.startswith("https://www.goodreads.com/book/show/"):
# Goodreads will 302 if only one result matches ISBN
site = SiteManager.get_site_by_url(r.url)
if site:
res = site.get_resource_ready()
if res:
subtitle = f"{res.metadata.get('pub_year')} {', '.join(res.metadata.get('author', []))} {', '.join(res.metadata.get('translator', []))}"
results.append(
ExternalSearchResultItem(
ItemCategory.Book,
SiteName.Goodreads,
res.url,
res.metadata["title"],
subtitle,
res.metadata.get("brief", ""),
res.metadata.get("cover_image_url", ""),
)
)
else:
h = html.fromstring(r.content.decode("utf-8"))
books = h.xpath('//tr[@itemtype="http://schema.org/Book"]')
for c in books: # type:ignore
el_cover = c.xpath('.//img[@class="bookCover"]/@src')
cover = el_cover[0] if el_cover else ""
el_title = c.xpath('.//a[@class="bookTitle"]//text()')
title = "".join(el_title).strip() if el_title else "Unkown Title"
el_url = c.xpath('.//a[@class="bookTitle"]/@href')
url = "https://www.goodreads.com" + el_url[0] if el_url else ""
el_authors = c.xpath('.//a[@class="authorName"]//text()')
subtitle = ", ".join(el_authors) if el_authors else ""
results.append(
ExternalSearchResultItem(
ItemCategory.Book,
SiteName.Goodreads,
url,
title,
subtitle,
"",
cover,
)
)
except requests.exceptions.RequestException as e:
logger.warning(f"Search {search_url} error: {e}")
except Exception as e:
logger.error("Goodreads search error", extra={"query": q, "exception": e})
return results
class GoogleBooks:
@classmethod
def search(cls, q, page=1):
results = []
api_url = f"https://www.googleapis.com/books/v1/volumes?country=us&q={quote_plus(q)}&startIndex={SEARCH_PAGE_SIZE * (page - 1)}&maxResults={SEARCH_PAGE_SIZE}&maxAllowedMaturityRating=MATURE"
try:
j = requests.get(api_url, timeout=2).json()
if "items" in j:
for b in j["items"]:
if "title" not in b["volumeInfo"]:
continue
title = b["volumeInfo"]["title"]
subtitle = ""
if "publishedDate" in b["volumeInfo"]:
subtitle += b["volumeInfo"]["publishedDate"] + " "
if "authors" in b["volumeInfo"]:
subtitle += ", ".join(b["volumeInfo"]["authors"])
if "description" in b["volumeInfo"]:
brief = b["volumeInfo"]["description"]
elif "textSnippet" in b["volumeInfo"]:
brief = b["volumeInfo"]["textSnippet"]["searchInfo"]
else:
brief = ""
category = ItemCategory.Book
# b['volumeInfo']['infoLink'].replace('http:', 'https:')
url = "https://books.google.com/books?id=" + b["id"]
cover = (
b["volumeInfo"]["imageLinks"]["thumbnail"]
if "imageLinks" in b["volumeInfo"]
else ""
)
results.append(
ExternalSearchResultItem(
category,
SiteName.GoogleBooks,
url,
title,
subtitle,
brief,
cover,
)
)
except requests.exceptions.RequestException as e:
logger.warning(f"Search {api_url} error: {e}")
except Exception as e:
logger.error("GoogleBooks search error", extra={"query": q, "exception": e})
return results
class TheMovieDatabase:
@classmethod
def search(cls, q, page=1):
results = []
api_url = f"https://api.themoviedb.org/3/search/multi?query={quote_plus(q)}&page={page}&api_key={settings.TMDB_API3_KEY}&language={TMDB_DEFAULT_LANG}&include_adult=true"
try:
j = requests.get(api_url, timeout=2).json()
if j.get("results"):
for m in j["results"]:
if m["media_type"] in ["tv", "movie"]:
url = f"https://www.themoviedb.org/{m['media_type']}/{m['id']}"
if m["media_type"] == "tv":
cat = ItemCategory.TV
title = m["name"]
subtitle = f"{m.get('first_air_date', '')} {m.get('original_name', '')}"
else:
cat = ItemCategory.Movie
title = m["title"]
subtitle = f"{m.get('release_date', '')} {m.get('original_name', '')}"
cover = (
f"https://image.tmdb.org/t/p/w500/{m.get('poster_path')}"
if m.get("poster_path")
else ""
)
results.append(
ExternalSearchResultItem(
cat,
SiteName.TMDB,
url,
title,
subtitle,
m.get("overview"),
cover,
)
)
else:
logger.warning(f"TMDB search '{q}' no results found.")
except requests.exceptions.RequestException as e:
logger.warning(f"Search {api_url} error: {e}")
except Exception as e:
logger.error("TMDb search error", extra={"query": q, "exception": e})
return results
class Spotify:
@classmethod
def search(cls, q, page=1):
results = []
api_url = f"https://api.spotify.com/v1/search?q={q}&type=album&limit={SEARCH_PAGE_SIZE}&offset={page * SEARCH_PAGE_SIZE}"
try:
headers = {"Authorization": f"Bearer {get_spotify_token()}"}
j = requests.get(api_url, headers=headers, timeout=2).json()
if j.get("albums"):
for a in j["albums"]["items"]:
title = a["name"]
subtitle = a.get("release_date", "")
for artist in a.get("artists", []):
subtitle += " " + artist.get("name", "")
url = a["external_urls"]["spotify"]
cover = a["images"][0]["url"] if a.get("images") else ""
results.append(
ExternalSearchResultItem(
ItemCategory.Music,
SiteName.Spotify,
url,
title,
subtitle,
"",
cover,
)
)
else:
logger.warning(f"Spotify search '{q}' no results found.")
except requests.exceptions.RequestException as e:
logger.warning(f"Search {api_url} error: {e}")
except Exception as e:
logger.error("Spotify search error", extra={"query": q, "exception": e})
return results
class Bandcamp:
@classmethod
def search(cls, q, page=1):
results = []
search_url = f"https://bandcamp.com/search?from=results&item_type=a&page={page}&q={quote_plus(q)}"
try:
r = requests.get(search_url, timeout=2)
h = html.fromstring(r.content.decode("utf-8"))
albums = h.xpath('//li[@class="searchresult data-search"]')
for c in albums: # type:ignore
el_cover = c.xpath('.//div[@class="art"]/img/@src')
cover = el_cover[0] if el_cover else ""
el_title = c.xpath('.//div[@class="heading"]//text()')
title = "".join(el_title).strip() if el_title else "Unknown Title"
el_url = c.xpath('..//div[@class="itemurl"]/a/@href')
url = el_url[0] if el_url else ""
el_authors = c.xpath('.//div[@class="subhead"]//text()')
subtitle = ", ".join(el_authors) if el_authors else ""
results.append(
ExternalSearchResultItem(
ItemCategory.Music,
SiteName.Bandcamp,
url,
title,
subtitle,
"",
cover,
)
)
except requests.exceptions.RequestException as e:
logger.warning(f"Search {search_url} error: {e}")
except Exception as e:
logger.error("Bandcamp search error", extra={"query": q, "exception": e})
return results
class ApplePodcast:
@classmethod
def search(cls, q, page=1):
results = []
search_url = f"https://itunes.apple.com/search?entity=podcast&limit={page * SEARCH_PAGE_SIZE}&term={quote_plus(q)}"
try:
r = requests.get(search_url, timeout=2).json()
for p in r["results"][(page - 1) * SEARCH_PAGE_SIZE :]:
if p.get("feedUrl"):
results.append(
ExternalSearchResultItem(
ItemCategory.Podcast,
SiteName.RSS,
p["feedUrl"],
p["trackName"],
p["artistName"],
"",
p["artworkUrl600"],
)
)
except requests.exceptions.RequestException as e:
logger.warning(f"Search {search_url} error: {e}")
except Exception as e:
logger.error(
"ApplePodcast search error", extra={"query": q, "exception": e}
)
return results
class IGDB:
@classmethod
def search(cls, q, page=1):
return IGDB_Site.search(
q, limit=SEARCH_PAGE_SIZE, offset=page * SEARCH_PAGE_SIZE
)
class Fediverse:
@staticmethod
async def search_task(host, q, category=None):
api_url = f"https://{host}/api/catalog/search?query={quote_plus(q)}{'&category=' + category if category else ''}"
async with httpx.AsyncClient() as client:
results = []
try:
response = await client.get(
api_url,
timeout=2,
)
r = response.json()
except Exception as e:
logger.error(
f"Fediverse search {host} error",
extra={"url": api_url, "query": q, "exception": e},
)
return []
if "data" in r:
for item in r["data"]:
if any(
urlparse(res["url"]).hostname in settings.SITE_DOMAINS
for res in item.get("external_resources", [])
):
continue
url = f"https://{host}{item['url']}" # FIXME update API and use abs urls
try:
cat = ItemCategory(item["category"])
except Exception:
cat = None
results.append(
ExternalSearchResultItem(
cat,
host,
url,
item["display_title"],
"",
item["brief"],
item["cover_image_url"],
)
)
return results
@classmethod
def search(cls, q: str, page: int = 1, category: str | None = None):
from takahe.utils import Takahe
peers = Takahe.get_neodb_peers()
c = category if category != "movietv" else "movie,tv"
tasks = [Fediverse.search_task(host, q, c) for host in peers]
# loop = asyncio.get_event_loop()
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
@@ -344,29 +28,3 @@ class Fediverse:
for r in loop.run_until_complete(asyncio.gather(*tasks)):
results.extend(r)
return results
class ExternalSources:
@classmethod
def search(cls, c, q, page=1):
if not q:
return []
results = []
results.extend(
Fediverse.search(q, page, category=c if c and c != "all" else None)
)
if c == "" or c is None:
c = "all"
if c == "all" or c == "movietv":
results.extend(TheMovieDatabase.search(q, page))
if c == "all" or c == "book":
results.extend(GoogleBooks.search(q, page))
results.extend(Goodreads.search(q, page))
if c == "all" or c == "game":
results.extend(IGDB.search(q, page))
if c == "all" or c == "music":
results.extend(Spotify.search(q, page))
results.extend(Bandcamp.search(q, page))
if c == "podcast":
results.extend(ApplePodcast.search(q, page))
return results
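With the per-site scrapers gone, `ExternalSources.search` is the single entry point used by both the web view and the new `--extsearch` command: it collects one coroutine per fediverse peer and per enabled site, runs them together on a fresh event loop, and flattens the results. A small usage sketch (query and category values are illustrative):

```python
from catalog.search.external import ExternalSources

results = ExternalSources.search("the left hand of darkness", page=1, category="book")
for item in results:
    print(item.verbose_category_name, item.display_title, item.source_url)
```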


@@ -79,7 +79,7 @@ class ExternalSearchResultItem:
        self.cover_image_url = cover_url

    def __repr__(self):
        return f"[{self.category}] {self.display_title} {self.url}"
        return f"[{self.category}] {self.display_title} {self.source_url}"

    @property
    def verbose_category_name(self):


@@ -159,7 +159,7 @@ def external_search(request):
        category = None
    keywords = request.GET.get("q", default="").strip()
    page_number = int_(request.GET.get("page"), 1)
    items = ExternalSources.search(category, keywords, page_number) if keywords else []
    items = ExternalSources.search(keywords, page_number, category) if keywords else []
    cache_key = f"search_{category if category != 'movietv' else 'movie,tv'}_{keywords}"
    dedupe_urls = cache.get(cache_key, [])
    items = [i for i in items if i.source_url not in dedupe_urls]


@@ -1,6 +1,7 @@
from ..common.sites import SiteManager
from .ao3 import ArchiveOfOurOwn
from .apple_music import AppleMusic
from .apple_podcast import ApplePodcast
from .bandcamp import Bandcamp
from .bangumi import Bangumi
from .bgg import BoardGameGeek
@@ -24,12 +25,11 @@ from .steam import Steam
from .tmdb import TMDB_Movie
from .ypshuo import Ypshuo

# from .apple_podcast import ApplePodcast

__all__ = [
    "SiteManager",
    "ArchiveOfOurOwn",
    "AppleMusic",
    "ApplePodcast",
    "Bandcamp",
    "Bangumi",
    "BoardGameGeek",


@@ -1,16 +1,17 @@
import logging
from urllib.parse import quote_plus
import httpx
from loguru import logger
from catalog.common import *
from catalog.models import *
from .rss import RSS
_logger = logging.getLogger(__name__)
@SiteManager.register
class ApplePodcast(AbstractSite):
# SITE_NAME = SiteName.ApplePodcast
SITE_NAME = SiteName.ApplePodcast
ID_TYPE = IdType.ApplePodcast
URL_PATTERNS = [r"https://[^.]+.apple.com/\w+/podcast/*[^/?]*/id(\d+)"]
WIKI_PROPERTY_ID = "P5842"
@@ -38,3 +39,35 @@ class ApplePodcast(AbstractSite):
)
pd.lookup_ids[IdType.RSS] = RSS.url_to_id(feed_url)
return pd
@classmethod
async def search_task(
cls, q: str, page: int, category: str
) -> list[ExternalSearchResultItem]:
if category != "podcast":
return []
SEARCH_PAGE_SIZE = 5 if category == "all" else 10
results = []
search_url = f"https://itunes.apple.com/search?entity=podcast&limit={page * SEARCH_PAGE_SIZE}&term={quote_plus(q)}"
async with httpx.AsyncClient() as client:
try:
response = await client.get(search_url, timeout=2)
r = response.json()
for p in r["results"][(page - 1) * SEARCH_PAGE_SIZE :]:
if p.get("feedUrl"):
results.append(
ExternalSearchResultItem(
ItemCategory.Podcast,
SiteName.RSS,
p["feedUrl"],
p["trackName"],
p["artistName"],
"",
p["artworkUrl600"],
)
)
except Exception as e:
logger.error(
"ApplePodcast search error", extra={"query": q, "exception": e}
)
return results


@@ -5,6 +5,9 @@ import urllib.parse
import dateparser
import dns.resolver
import httpx
from loguru import logger
from lxml import html
from catalog.common import *
from catalog.models import *
@@ -103,3 +106,45 @@ class Bandcamp(AbstractSite):
}
pd = ResourceContent(metadata=data)
return pd
@classmethod
async def search_task(
cls, q: str, page: int, category: str
) -> list[ExternalSearchResultItem]:
if category != "music":
return []
SEARCH_PAGE_SIZE = 5
p = (page - 1) * SEARCH_PAGE_SIZE // 18 + 1
offset = (page - 1) * SEARCH_PAGE_SIZE % 18
results = []
search_url = f"https://bandcamp.com/search?from=results&item_type=a&page={p}&q={urllib.parse.quote_plus(q)}"
async with httpx.AsyncClient() as client:
try:
r = await client.get(search_url, timeout=2)
h = html.fromstring(r.content.decode("utf-8"))
albums = h.xpath('//li[@class="searchresult data-search"]')
for c in albums: # type:ignore
el_cover = c.xpath('.//div[@class="art"]/img/@src')
cover = el_cover[0] if el_cover else ""
el_title = c.xpath('.//div[@class="heading"]//text()')
title = "".join(el_title).strip() if el_title else "Unknown Title"
el_url = c.xpath('..//div[@class="itemurl"]/a/@href')
url = el_url[0] if el_url else ""
el_authors = c.xpath('.//div[@class="subhead"]//text()')
subtitle = ", ".join(el_authors) if el_authors else ""
results.append(
ExternalSearchResultItem(
ItemCategory.Music,
SiteName.Bandcamp,
url,
title,
subtitle,
"",
cover,
)
)
except Exception as e:
logger.error(
"Bandcamp search error", extra={"query": q, "exception": e}
)
return results[offset : offset + SEARCH_PAGE_SIZE]


@@ -1,9 +1,33 @@
from urllib.parse import quote_plus, urlparse
import httpx
from django.conf import settings
from django.core.validators import URLValidator
from loguru import logger
from catalog.common import *
from catalog.models import *
from catalog.common import (
AbstractSite,
BasicImageDownloader,
CachedDownloader,
IdType,
ItemCategory,
ResourceContent,
SiteManager,
SiteName,
)
from catalog.models import (
Album,
Edition,
ExternalSearchResultItem,
Game,
Movie,
Performance,
PerformanceProduction,
Podcast,
TVEpisode,
TVSeason,
TVShow,
)
@SiteManager.register
@@ -99,3 +123,56 @@ class FediverseInstance(AbstractSite):
lookup_ids=ids,
)
return d
@classmethod
async def peer_search_task(cls, host, q, page, category=None):
SEARCH_PAGE_SIZE = 5
p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
offset = (page - 1) * SEARCH_PAGE_SIZE % 20
api_url = f"https://{host}/api/catalog/search?query={quote_plus(q)}&page={p}{'&category=' + category if category and category != 'all' else ''}"
async with httpx.AsyncClient() as client:
results = []
try:
response = await client.get(
api_url,
timeout=2,
)
r = response.json()
except Exception as e:
logger.error(
f"Fediverse search {host} error",
extra={"url": api_url, "query": q, "exception": e},
)
return []
if "data" in r:
for item in r["data"]:
if any(
urlparse(res["url"]).hostname in settings.SITE_DOMAINS
for res in item.get("external_resources", [])
):
continue
url = f"https://{host}{item['url']}" # FIXME update API and use abs urls
try:
cat = ItemCategory(item["category"])
except Exception:
cat = None
results.append(
ExternalSearchResultItem(
cat,
host,
url,
item["display_title"],
"",
item["brief"],
item["cover_image_url"],
)
)
return results[offset : offset + SEARCH_PAGE_SIZE]
@classmethod
def search_tasks(cls, q: str, page: int = 1, category: str | None = None):
from takahe.utils import Takahe
peers = Takahe.get_neodb_peers()
c = category if category != "movietv" else "movie,tv"
return [cls.peer_search_task(host, q, page, c) for host in peers]
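Each task slices a local page of `SEARCH_PAGE_SIZE` results out of the larger pages returned by the remote API (20 per page for peers). A quick sketch of the page/offset arithmetic used above:

```python
SEARCH_PAGE_SIZE = 5   # local page size
REMOTE_PAGE_SIZE = 20  # page size of the remote catalog API

for page in range(1, 6):
    p = (page - 1) * SEARCH_PAGE_SIZE // REMOTE_PAGE_SIZE + 1
    offset = (page - 1) * SEARCH_PAGE_SIZE % REMOTE_PAGE_SIZE
    print(page, "-> remote page", p, "offset", offset)
# pages 1-4 map to remote page 1 at offsets 0, 5, 10, 15; page 5 starts remote page 2
```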


@@ -1,18 +1,18 @@
import json
import logging
from datetime import datetime
from urllib.parse import quote_plus
import httpx
from django.utils.timezone import make_aware
from loguru import logger
from lxml import html
from catalog.book.models import Edition, Work
from catalog.book.utils import binding_to_format, detect_isbn_asin
from catalog.common import *
from common.models.lang import detect_language
from catalog.models import Edition, ExternalSearchResultItem, Work
from common.models import detect_language
from journal.models.renderers import html_to_text
_logger = logging.getLogger(__name__)
class GoodreadsDownloader(RetryDownloader):
def validate_response(self, response):
@@ -121,6 +121,82 @@ class Goodreads(AbstractSite):
pd.lookup_ids[IdType.ASIN] = ids.get(IdType.ASIN)
return pd
@classmethod
async def search_task(
cls, q: str, page: int, category: str
) -> list[ExternalSearchResultItem]:
if category not in ["all", "book"]:
return []
SEARCH_PAGE_SIZE = 5
p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
offset = (page - 1) * SEARCH_PAGE_SIZE % 20
results = []
search_url = f"https://www.goodreads.com/search?page={p}&q={quote_plus(q)}"
async with httpx.AsyncClient() as client:
try:
r = await client.get(
search_url,
timeout=3,
headers={
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": BasicDownloader.get_accept_language(),
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"DNT": "1",
"Upgrade-Insecure-Requests": "1",
"Cache-Control": "no-cache",
},
)
if r.url.path.startswith("/book/show/"):
# Goodreads will 302 if only one result matches ISBN
site = SiteManager.get_site_by_url(str(r.url))
if site:
res = site.get_resource_ready()
if res:
subtitle = f"{res.metadata.get('pub_year')} {', '.join(res.metadata.get('author', []))} {', '.join(res.metadata.get('translator', []))}"
results.append(
ExternalSearchResultItem(
ItemCategory.Book,
SiteName.Goodreads,
res.url,
res.metadata["title"],
subtitle,
res.metadata.get("brief", ""),
res.metadata.get("cover_image_url", ""),
)
)
else:
h = html.fromstring(r.content.decode("utf-8"))
books = h.xpath('//tr[@itemtype="http://schema.org/Book"]')
for c in books: # type:ignore
el_cover = c.xpath('.//img[@class="bookCover"]/@src')
cover = el_cover[0] if el_cover else ""
el_title = c.xpath('.//a[@class="bookTitle"]//text()')
title = (
"".join(el_title).strip() if el_title else "Unkown Title"
)
el_url = c.xpath('.//a[@class="bookTitle"]/@href')
url = "https://www.goodreads.com" + el_url[0] if el_url else ""
el_authors = c.xpath('.//a[@class="authorName"]//text()')
subtitle = ", ".join(el_authors) if el_authors else ""
results.append(
ExternalSearchResultItem(
ItemCategory.Book,
SiteName.Goodreads,
url,
title,
subtitle,
"",
cover,
)
)
except Exception as e:
logger.error(
"Goodreads search error", extra={"query": q, "exception": e}
)
return results[offset : offset + SEARCH_PAGE_SIZE]
@SiteManager.register
class Goodreads_Work(AbstractSite):


@@ -1,7 +1,10 @@
import logging
import re
from urllib.parse import quote_plus
import httpx
from django.conf import settings
from loguru import logger
from catalog.book.utils import isbn_10_to_13
from catalog.common import *
@@ -116,3 +119,57 @@ class GoogleBooks(AbstractSite):
cover_image_extention=ext,
lookup_ids={IdType.ISBN: isbn13},
)
@classmethod
async def search_task(
cls, q: str, page: int, category: str
) -> list[ExternalSearchResultItem]:
if category not in ["all", "book"]:
return []
SEARCH_PAGE_SIZE = 5
results = []
api_url = f"https://www.googleapis.com/books/v1/volumes?country=us&q={quote_plus(q)}&startIndex={SEARCH_PAGE_SIZE * (page - 1)}&maxResults={SEARCH_PAGE_SIZE}&maxAllowedMaturityRating=MATURE"
async with httpx.AsyncClient() as client:
try:
response = await client.get(api_url, timeout=2)
j = response.json()
if "items" in j:
for b in j["items"]:
if "title" not in b["volumeInfo"]:
continue
title = b["volumeInfo"]["title"]
subtitle = ""
if "publishedDate" in b["volumeInfo"]:
subtitle += b["volumeInfo"]["publishedDate"] + " "
if "authors" in b["volumeInfo"]:
subtitle += ", ".join(b["volumeInfo"]["authors"])
if "description" in b["volumeInfo"]:
brief = b["volumeInfo"]["description"]
elif "textSnippet" in b["volumeInfo"]:
brief = b["volumeInfo"]["textSnippet"]["searchInfo"]
else:
brief = ""
category = ItemCategory.Book
# b['volumeInfo']['infoLink'].replace('http:', 'https:')
url = "https://books.google.com/books?id=" + b["id"]
cover = (
b["volumeInfo"]["imageLinks"]["thumbnail"]
if "imageLinks" in b["volumeInfo"]
else ""
)
results.append(
ExternalSearchResultItem(
category,
SiteName.GoogleBooks,
url,
title,
subtitle,
brief,
cover,
)
)
except Exception as e:
logger.error(
"GoogleBooks search error", extra={"query": q, "exception": e}
)
return results


@@ -8,6 +8,7 @@ import datetime
import json
from urllib.parse import quote_plus
import httpx
import requests
from django.conf import settings
from django.core.cache import cache
@@ -83,44 +84,6 @@ class IGDB(AbstractSite):
fp.write(json.dumps(r))
return r
@classmethod
def search(cls, q, limit: int, offset: int = 0):
rs = cls.api_query(
"games",
f'fields *, cover.url, genres.name, platforms.name, involved_companies.*, involved_companies.company.name; search "{quote_plus(q)}"; limit {limit}; offset {offset};',
)
result = []
for r in rs:
subtitle = ""
if "first_release_date" in r:
subtitle = datetime.datetime.fromtimestamp(
r["first_release_date"], datetime.timezone.utc
).strftime("%Y-%m-%d ")
if "platforms" in r:
ps = sorted(r["platforms"], key=lambda p: p["id"])
subtitle += ",".join(
[(p["name"] if p["id"] != 6 else "Windows") for p in ps]
)
brief = r["summary"] if "summary" in r else ""
brief += "\n\n" + r["storyline"] if "storyline" in r else ""
cover = (
"https:" + r["cover"]["url"].replace("t_thumb", "t_cover_big")
if r.get("cover")
else ""
)
result.append(
ExternalSearchResultItem(
ItemCategory.Game,
SiteName.IGDB,
r["url"],
r["name"],
subtitle,
brief,
cover,
)
)
return result
def scrape(self):
fields = "*, cover.url, genres.name, platforms.name, involved_companies.*, involved_companies.company.name"
r = self.api_query("games", f'fields {fields}; where url = "{self.url}";')
@@ -200,3 +163,55 @@
IdType.Steam
).url_to_id(steam_url)
return pd
@classmethod
async def search_task(
cls, q: str, page: int, category: str
) -> list[ExternalSearchResultItem]:
if category != "game":
return []
SEARCH_PAGE_SIZE = 5 if category == "all" else 10
limit = SEARCH_PAGE_SIZE
offset = (page - 1) * limit
q = f'fields *, cover.url, genres.name, platforms.name, involved_companies.*, involved_companies.company.name; search "{quote_plus(q)}"; limit {limit}; offset {offset};'
_wrapper = IGDBWrapper(settings.IGDB_CLIENT_ID, _igdb_access_token())
async with httpx.AsyncClient() as client:
try:
url = IGDBWrapper._build_url("games")
params = _wrapper._compose_request(q)
response = await client.post(url, **params)
rs = json.loads(response.content)
except requests.HTTPError as e:
logger.error(f"IGDB API: {e}", extra={"exception": e})
rs = []
result = []
for r in rs:
subtitle = ""
if "first_release_date" in r:
subtitle = datetime.datetime.fromtimestamp(
r["first_release_date"], datetime.timezone.utc
).strftime("%Y-%m-%d ")
if "platforms" in r:
ps = sorted(r["platforms"], key=lambda p: p["id"])
subtitle += ",".join(
[(p["name"] if p["id"] != 6 else "Windows") for p in ps]
)
brief = r["summary"] if "summary" in r else ""
brief += "\n\n" + r["storyline"] if "storyline" in r else ""
cover = (
"https:" + r["cover"]["url"].replace("t_thumb", "t_cover_big")
if r.get("cover")
else ""
)
result.append(
ExternalSearchResultItem(
ItemCategory.Game,
SiteName.IGDB,
r["url"],
r["name"],
subtitle,
brief,
cover,
)
)
return result


@@ -6,8 +6,10 @@ import logging
import time
import dateparser
import httpx
import requests
from django.conf import settings
from loguru import logger
from catalog.common import *
from catalog.models import *
@@ -107,6 +109,45 @@ class Spotify(AbstractSite):
pd.lookup_ids[IdType.ISRC] = isrc
return pd
@classmethod
async def search_task(
cls, q: str, page: int, category: str
) -> list[ExternalSearchResultItem]:
if category not in ["music", "all"]:
return []
SEARCH_PAGE_SIZE = 5
results = []
api_url = f"https://api.spotify.com/v1/search?q={q}&type=album&limit={SEARCH_PAGE_SIZE}&offset={page * SEARCH_PAGE_SIZE}"
async with httpx.AsyncClient() as client:
try:
headers = {"Authorization": f"Bearer {get_spotify_token()}"}
response = await client.get(api_url, headers=headers, timeout=2)
j = response.json()
if j.get("albums"):
for a in j["albums"]["items"]:
title = a["name"]
subtitle = a.get("release_date", "")
for artist in a.get("artists", []):
subtitle += " " + artist.get("name", "")
url = a["external_urls"]["spotify"]
cover = a["images"][0]["url"] if a.get("images") else ""
results.append(
ExternalSearchResultItem(
ItemCategory.Music,
SiteName.Spotify,
url,
title,
subtitle,
"",
cover,
)
)
else:
logger.warning(f"Spotify search '{q}' no results found.")
except Exception as e:
logger.error("Spotify search error", extra={"query": q, "exception": e})
return results
def get_spotify_token():
global spotify_token, spotify_token_expire_time


@@ -12,8 +12,11 @@ these language code from TMDB are not in currently iso-639-1
import logging
import re
from urllib.parse import quote_plus
import httpx
from django.conf import settings
from loguru import logger
from catalog.common import *
from catalog.movie.models import *
@@ -175,6 +178,55 @@ class TMDB_Movie(AbstractSite):
pd.lookup_ids[IdType.IMDB] = imdb_code
return pd
@classmethod
async def search_task(
cls, q: str, page: int, category: str
) -> list[ExternalSearchResultItem]:
if category not in ["movietv", "all", "movie", "tv"]:
return []
SEARCH_PAGE_SIZE = 5 if category == "all" else 10
p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
offset = (page - 1) * SEARCH_PAGE_SIZE % 20
results = []
api_url = f"https://api.themoviedb.org/3/search/multi?query={quote_plus(q)}&page={p}&api_key={settings.TMDB_API3_KEY}&language={TMDB_DEFAULT_LANG}&include_adult=true"
async with httpx.AsyncClient() as client:
try:
response = await client.get(api_url, timeout=2)
j = response.json()
if j.get("results"):
for m in j["results"]:
if m["media_type"] in ["tv", "movie"]:
url = f"https://www.themoviedb.org/{m['media_type']}/{m['id']}"
if m["media_type"] == "tv":
cat = ItemCategory.TV
title = m["name"]
subtitle = f"{m.get('first_air_date', '')} {m.get('original_name', '')}"
else:
cat = ItemCategory.Movie
title = m["title"]
subtitle = f"{m.get('release_date', '')} {m.get('original_name', '')}"
cover = (
f"https://image.tmdb.org/t/p/w500/{m.get('poster_path')}"
if m.get("poster_path")
else ""
)
results.append(
ExternalSearchResultItem(
cat,
SiteName.TMDB,
url,
title,
subtitle,
m.get("overview"),
cover,
)
)
else:
logger.warning(f"TMDB search '{q}' no results found.")
except Exception as e:
logger.error("TMDb search error", extra={"query": q, "exception": e})
return results[offset : offset + SEARCH_PAGE_SIZE]
@SiteManager.register
class TMDB_TV(AbstractSite):


@@ -33,6 +33,7 @@ x-shared:
NEODB_DISABLE_DEFAULT_RELAY:
NEODB_DISABLE_CRON_JOBS:
NEODB_SEARCH_PEERS:
NEODB_SEARCH_SITES:
NEODB_MIN_MARKS_FOR_DISCOVER:
NEODB_DISCOVER_UPDATE_INTERVAL:
NEODB_DISCOVER_FILTER_LANGUAGE:


@@ -57,6 +57,7 @@ if you are doing debug or development:
- `GOOGLE_API_KEY` - API key for [Google Books](https://developers.google.com/books/docs/v1/using)
- `DISCOGS_API_KEY` - personal access token from [Discogs](https://www.discogs.com/settings/developers)
- `IGDB_API_CLIENT_ID`, `IGDB_API_CLIENT_SECRET` - IGDB [keys](https://api-docs.igdb.com/)
- `NEODB_SEARCH_SITES` - comma-separated list of site names (e.g. `goodreads,googlebooks,spotify,tmdb,igdb,bandcamp,apple_podcast`) to limit external search to those sites; leaving it empty (the default) searches all available sources, and setting it to just `-` disables external site search entirely (see the example below)
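For example, `NEODB_SEARCH_SITES=goodreads,tmdb` limits external search to Goodreads and TMDB, while `NEODB_SEARCH_SITES=-` turns external site search off; federated peer search remains controlled separately by `NEODB_SEARCH_PEERS`.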
## Other maintenance tasks


@@ -169,7 +169,7 @@ mkdocs==1.6.1
    # via mkdocs-material
mkdocs-get-deps==0.2.0
    # via mkdocs
mkdocs-material==9.5.49
mkdocs-material==9.5.50
mkdocs-material-extensions==1.3.1
    # via mkdocs-material
multidict==6.1.0
@@ -213,7 +213,7 @@ pygments==2.19.1
    # via mkdocs-material
pymdown-extensions==10.14
    # via mkdocs-material
pyright==1.1.391
pyright==1.1.392.post0
python-dateutil==2.9.0.post0
    # via dateparser
    # via django-auditlog
@@ -251,7 +251,7 @@ rjsmin==1.2.2
    # via django-compressor
rq==2.1.0
    # via django-rq
ruff==0.9.1
ruff==0.9.2
sentry-sdk==2.20.0
setproctitle==1.3.4
six==1.17.0
@@ -292,7 +292,7 @@ urllib3==2.3.0
    # via sentry-sdk
urlman==2.0.2
validators==0.34.0
virtualenv==20.28.1
virtualenv==20.29.1
    # via pre-commit
watchdog==6.0.0
    # via mkdocs