make external search async and configurable

Your Name 2025-01-18 15:53:06 -05:00 committed by Henri Dickson
parent 2826bc60dc
commit 90386bbf1a
20 changed files with 523 additions and 414 deletions

View file

@ -92,6 +92,8 @@ env = environ.FileAwareEnv(
NEODB_DISCOVER_UPDATE_INTERVAL=(int, 60),
# Disable cron jobs, * for all
NEODB_DISABLE_CRON_JOBS=(list, []),
# search sites
NEODB_SEARCH_SITES=(list, []),
# federated search peers
NEODB_SEARCH_PEERS=(list, []),
# INTEGRATED TAKAHE CONFIGURATION
@ -282,6 +284,7 @@ DOWNLOADER_RETRIES = env("NEODB_DOWNLOADER_RETRIES")
DISABLE_CRON_JOBS = env("NEODB_DISABLE_CRON_JOBS")
SEARCH_PEERS = env("NEODB_SEARCH_PEERS")
SEARCH_SITES = env("NEODB_SEARCH_SITES")
FANOUT_LIMIT_DAYS = env("NEODB_FANOUT_LIMIT_DAYS")
# ====== USER CONFIGURATION END ======

View file

@ -43,7 +43,7 @@ class SiteName(models.TextChoices):
Steam = "steam", _("Steam") # type:ignore[reportCallIssue]
Bangumi = "bangumi", _("Bangumi") # type:ignore[reportCallIssue]
BGG = "bgg", _("BGG") # type:ignore[reportCallIssue]
# ApplePodcast = "apple_podcast", _("Apple Podcast") # type:ignore[reportCallIssue]
ApplePodcast = "apple_podcast", _("Apple Podcast") # type:ignore[reportCallIssue]
RSS = "rss", _("RSS") # type:ignore[reportCallIssue]
Discogs = "discogs", _("Discogs") # type:ignore[reportCallIssue]
AppleMusic = "apple_music", _("Apple Music") # type:ignore[reportCallIssue]

View file

@ -14,6 +14,7 @@ from typing import Type, TypeVar
import django_rq
import requests
from django.conf import settings
from loguru import logger
from validators import url as url_validate
@ -91,6 +92,13 @@ class AbstractSite:
)
return self.resource
# add this method to subclass to enable external search
# @classmethod
# async def search_task(
# cls, query: str, page: int, category: str
# ) -> list[ExternalSearchResultItem]:
# return []
def scrape(self) -> ResourceContent:
"""subclass should implement this, return ResourceContent object"""
data = ResourceContent()
@ -340,6 +348,17 @@ class SiteManager:
def get_all_sites():
return SiteManager.registry.values()
@staticmethod
def get_sites_for_search():
if settings.SEARCH_SITES == ["-"]:
return []
sites = [
cls for cls in SiteManager.get_all_sites() if hasattr(cls, "search_task")
]
if settings.SEARCH_SITES == ["*"] or not settings.SEARCH_SITES:
return sites
return [s for s in sites if s.SITE_NAME.value in settings.SEARCH_SITES]
def crawl_related_resources_task(resource_pk):
resource = ExternalResource.objects.filter(pk=resource_pk).first()

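To illustrate how the new `NEODB_SEARCH_SITES` setting interacts with `get_sites_for_search()`, here is a rough sketch (not part of this commit) that could be run in a Django shell; the exact set of classes returned depends on which registered sites define a `search_task` classmethod:

from catalog.common.sites import SiteManager

# Hedged sketch: list the site classes external search would query
# under the current NEODB_SEARCH_SITES value.
for site_cls in SiteManager.get_sites_for_search():
    print(site_cls.SITE_NAME.value)

# Expected behaviour per the code above:
#   NEODB_SEARCH_SITES unset or "*"  -> every registered site defining search_task
#   NEODB_SEARCH_SITES=goodreads     -> only the site whose SITE_NAME is "goodreads"
#   NEODB_SEARCH_SITES=-             -> nothing (external site search disabled)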
View file

@ -1,16 +1,28 @@
import time
from django.contrib.contenttypes.models import ContentType
from django.core.management.base import BaseCommand
from django.db.models import Count, F
from tqdm import tqdm
from catalog.common.sites import SiteManager
from catalog.models import Edition, Item, Podcast, TVSeason, TVShow
from catalog.search.external import ExternalSources
from common.models import detect_language, uniq
from takahe.utils import Takahe
class Command(BaseCommand):
help = "catalog app utilities"
def add_arguments(self, parser):
parser.add_argument(
"--extsearch",
)
parser.add_argument(
"--category",
default="all",
)
parser.add_argument(
"--verbose",
action="store_true",
@ -44,8 +56,26 @@ class Command(BaseCommand):
self.integrity()
if options["localize"]:
self.localize()
if options["extsearch"]:
self.external_search(options["extsearch"], options["category"])
self.stdout.write(self.style.SUCCESS("Done."))
def external_search(self, q, cat):
sites = SiteManager.get_sites_for_search()
peers = Takahe.get_neodb_peers()
self.stdout.write(f"Searching {cat} '{q}' ...")
self.stdout.write(f"Peers: {peers}")
self.stdout.write(f"Sites: {sites}")
start_time = time.time()
results = ExternalSources.search(q, 1, cat)
for r in results:
self.stdout.write(f"{r}")
self.stdout.write(
self.style.SUCCESS(
f"{time.time() - start_time} seconds, {len(results)} items."
)
)
def localize(self):
c = Item.objects.all().count()
qs = Item.objects.filter(is_deleted=False, merged_to_item__isnull=True)

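Assuming the management command shown above is registered as `catalog` (its filename is not visible in this view), the new option can be exercised roughly like this; it prints the configured peers and sites, one line per result, and the elapsed time:

python manage.py catalog --extsearch "the dispossessed" --category book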
View file

@ -38,7 +38,7 @@ from .tv.models import (
TVShowSchema,
)
from .search.models import Indexer # isort:skip
from .search.models import Indexer, ExternalSearchResultItem # isort:skip
# class Exhibition(Item):
@ -103,6 +103,7 @@ __all__ = [
"CatalogCollection",
"AvailableItemCategory",
"ExternalResource",
"ExternalSearchResultItem",
"IdType",
"Item",
"ItemCategory",

View file

@ -1,342 +1,26 @@
import asyncio
import logging
from urllib.parse import quote_plus, urlparse
import httpx
import requests
from django.conf import settings
from lxml import html
from catalog.common import BasicDownloader, ItemCategory, SiteManager, SiteName
from catalog.common import SiteManager
from catalog.search.models import ExternalSearchResultItem
from catalog.sites.igdb import IGDB as IGDB_Site
from catalog.sites.spotify import get_spotify_token
from catalog.sites.tmdb import TMDB_DEFAULT_LANG
from catalog.sites.fedi import FediverseInstance
SEARCH_PAGE_SIZE = 5 # not all apis support page size
logger = logging.getLogger(__name__)
class Goodreads:
class ExternalSources:
@classmethod
def search(cls, q: str, page=1):
results = []
search_url = f"https://www.goodreads.com/search?page={page}&q={quote_plus(q)}"
try:
r = requests.get(
search_url,
timeout=3,
headers={
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": BasicDownloader.get_accept_language(),
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"DNT": "1",
"Upgrade-Insecure-Requests": "1",
"Cache-Control": "no-cache",
},
)
if r.url.startswith("https://www.goodreads.com/book/show/"):
# Goodreads will 302 if only one result matches ISBN
site = SiteManager.get_site_by_url(r.url)
if site:
res = site.get_resource_ready()
if res:
subtitle = f"{res.metadata.get('pub_year')} {', '.join(res.metadata.get('author', []))} {', '.join(res.metadata.get('translator', []))}"
results.append(
ExternalSearchResultItem(
ItemCategory.Book,
SiteName.Goodreads,
res.url,
res.metadata["title"],
subtitle,
res.metadata.get("brief", ""),
res.metadata.get("cover_image_url", ""),
)
)
else:
h = html.fromstring(r.content.decode("utf-8"))
books = h.xpath('//tr[@itemtype="http://schema.org/Book"]')
for c in books: # type:ignore
el_cover = c.xpath('.//img[@class="bookCover"]/@src')
cover = el_cover[0] if el_cover else ""
el_title = c.xpath('.//a[@class="bookTitle"]//text()')
title = "".join(el_title).strip() if el_title else "Unkown Title"
el_url = c.xpath('.//a[@class="bookTitle"]/@href')
url = "https://www.goodreads.com" + el_url[0] if el_url else ""
el_authors = c.xpath('.//a[@class="authorName"]//text()')
subtitle = ", ".join(el_authors) if el_authors else ""
results.append(
ExternalSearchResultItem(
ItemCategory.Book,
SiteName.Goodreads,
url,
title,
subtitle,
"",
cover,
)
)
except requests.exceptions.RequestException as e:
logger.warning(f"Search {search_url} error: {e}")
except Exception as e:
logger.error("Goodreads search error", extra={"query": q, "exception": e})
return results
class GoogleBooks:
@classmethod
def search(cls, q, page=1):
results = []
api_url = f"https://www.googleapis.com/books/v1/volumes?country=us&q={quote_plus(q)}&startIndex={SEARCH_PAGE_SIZE * (page - 1)}&maxResults={SEARCH_PAGE_SIZE}&maxAllowedMaturityRating=MATURE"
try:
j = requests.get(api_url, timeout=2).json()
if "items" in j:
for b in j["items"]:
if "title" not in b["volumeInfo"]:
continue
title = b["volumeInfo"]["title"]
subtitle = ""
if "publishedDate" in b["volumeInfo"]:
subtitle += b["volumeInfo"]["publishedDate"] + " "
if "authors" in b["volumeInfo"]:
subtitle += ", ".join(b["volumeInfo"]["authors"])
if "description" in b["volumeInfo"]:
brief = b["volumeInfo"]["description"]
elif "textSnippet" in b["volumeInfo"]:
brief = b["volumeInfo"]["textSnippet"]["searchInfo"]
else:
brief = ""
category = ItemCategory.Book
# b['volumeInfo']['infoLink'].replace('http:', 'https:')
url = "https://books.google.com/books?id=" + b["id"]
cover = (
b["volumeInfo"]["imageLinks"]["thumbnail"]
if "imageLinks" in b["volumeInfo"]
else ""
)
results.append(
ExternalSearchResultItem(
category,
SiteName.GoogleBooks,
url,
title,
subtitle,
brief,
cover,
)
)
except requests.exceptions.RequestException as e:
logger.warning(f"Search {api_url} error: {e}")
except Exception as e:
logger.error("GoogleBooks search error", extra={"query": q, "exception": e})
return results
class TheMovieDatabase:
@classmethod
def search(cls, q, page=1):
results = []
api_url = f"https://api.themoviedb.org/3/search/multi?query={quote_plus(q)}&page={page}&api_key={settings.TMDB_API3_KEY}&language={TMDB_DEFAULT_LANG}&include_adult=true"
try:
j = requests.get(api_url, timeout=2).json()
if j.get("results"):
for m in j["results"]:
if m["media_type"] in ["tv", "movie"]:
url = f"https://www.themoviedb.org/{m['media_type']}/{m['id']}"
if m["media_type"] == "tv":
cat = ItemCategory.TV
title = m["name"]
subtitle = f"{m.get('first_air_date', '')} {m.get('original_name', '')}"
else:
cat = ItemCategory.Movie
title = m["title"]
subtitle = f"{m.get('release_date', '')} {m.get('original_name', '')}"
cover = (
f"https://image.tmdb.org/t/p/w500/{m.get('poster_path')}"
if m.get("poster_path")
else ""
)
results.append(
ExternalSearchResultItem(
cat,
SiteName.TMDB,
url,
title,
subtitle,
m.get("overview"),
cover,
)
)
else:
logger.warning(f"TMDB search '{q}' no results found.")
except requests.exceptions.RequestException as e:
logger.warning(f"Search {api_url} error: {e}")
except Exception as e:
logger.error("TMDb search error", extra={"query": q, "exception": e})
return results
class Spotify:
@classmethod
def search(cls, q, page=1):
results = []
api_url = f"https://api.spotify.com/v1/search?q={q}&type=album&limit={SEARCH_PAGE_SIZE}&offset={page * SEARCH_PAGE_SIZE}"
try:
headers = {"Authorization": f"Bearer {get_spotify_token()}"}
j = requests.get(api_url, headers=headers, timeout=2).json()
if j.get("albums"):
for a in j["albums"]["items"]:
title = a["name"]
subtitle = a.get("release_date", "")
for artist in a.get("artists", []):
subtitle += " " + artist.get("name", "")
url = a["external_urls"]["spotify"]
cover = a["images"][0]["url"] if a.get("images") else ""
results.append(
ExternalSearchResultItem(
ItemCategory.Music,
SiteName.Spotify,
url,
title,
subtitle,
"",
cover,
)
)
else:
logger.warning(f"Spotify search '{q}' no results found.")
except requests.exceptions.RequestException as e:
logger.warning(f"Search {api_url} error: {e}")
except Exception as e:
logger.error("Spotify search error", extra={"query": q, "exception": e})
return results
class Bandcamp:
@classmethod
def search(cls, q, page=1):
results = []
search_url = f"https://bandcamp.com/search?from=results&item_type=a&page={page}&q={quote_plus(q)}"
try:
r = requests.get(search_url, timeout=2)
h = html.fromstring(r.content.decode("utf-8"))
albums = h.xpath('//li[@class="searchresult data-search"]')
for c in albums: # type:ignore
el_cover = c.xpath('.//div[@class="art"]/img/@src')
cover = el_cover[0] if el_cover else ""
el_title = c.xpath('.//div[@class="heading"]//text()')
title = "".join(el_title).strip() if el_title else "Unknown Title"
el_url = c.xpath('..//div[@class="itemurl"]/a/@href')
url = el_url[0] if el_url else ""
el_authors = c.xpath('.//div[@class="subhead"]//text()')
subtitle = ", ".join(el_authors) if el_authors else ""
results.append(
ExternalSearchResultItem(
ItemCategory.Music,
SiteName.Bandcamp,
url,
title,
subtitle,
"",
cover,
)
)
except requests.exceptions.RequestException as e:
logger.warning(f"Search {search_url} error: {e}")
except Exception as e:
logger.error("Bandcamp search error", extra={"query": q, "exception": e})
return results
class ApplePodcast:
@classmethod
def search(cls, q, page=1):
results = []
search_url = f"https://itunes.apple.com/search?entity=podcast&limit={page * SEARCH_PAGE_SIZE}&term={quote_plus(q)}"
try:
r = requests.get(search_url, timeout=2).json()
for p in r["results"][(page - 1) * SEARCH_PAGE_SIZE :]:
if p.get("feedUrl"):
results.append(
ExternalSearchResultItem(
ItemCategory.Podcast,
SiteName.RSS,
p["feedUrl"],
p["trackName"],
p["artistName"],
"",
p["artworkUrl600"],
)
)
except requests.exceptions.RequestException as e:
logger.warning(f"Search {search_url} error: {e}")
except Exception as e:
logger.error(
"ApplePodcast search error", extra={"query": q, "exception": e}
)
return results
class IGDB:
@classmethod
def search(cls, q, page=1):
return IGDB_Site.search(
q, limit=SEARCH_PAGE_SIZE, offset=page * SEARCH_PAGE_SIZE
)
class Fediverse:
@staticmethod
async def search_task(host, q, category=None):
api_url = f"https://{host}/api/catalog/search?query={quote_plus(q)}{'&category=' + category if category else ''}"
async with httpx.AsyncClient() as client:
results = []
try:
response = await client.get(
api_url,
timeout=2,
)
r = response.json()
except Exception as e:
logger.error(
f"Fediverse search {host} error",
extra={"url": api_url, "query": q, "exception": e},
)
return []
if "data" in r:
for item in r["data"]:
if any(
urlparse(res["url"]).hostname in settings.SITE_DOMAINS
for res in item.get("external_resources", [])
):
continue
url = f"https://{host}{item['url']}" # FIXME update API and use abs urls
try:
cat = ItemCategory(item["category"])
except Exception:
cat = None
results.append(
ExternalSearchResultItem(
cat,
host,
url,
item["display_title"],
"",
item["brief"],
item["cover_image_url"],
)
)
return results
@classmethod
def search(cls, q: str, page: int = 1, category: str | None = None):
from takahe.utils import Takahe
peers = Takahe.get_neodb_peers()
c = category if category != "movietv" else "movie,tv"
tasks = [Fediverse.search_task(host, q, c) for host in peers]
def search(
cls, query: str, page: int = 1, category: str | None = None
) -> list[ExternalSearchResultItem]:
if not query or page < 1 or page > 10:
return []
if category in ["", None]:
category = "all"
tasks = FediverseInstance.search_tasks(query, page, category)
for site in SiteManager.get_sites_for_search():
tasks.append(site.search_task(query, page, category))
# loop = asyncio.get_event_loop()
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
@ -344,29 +28,3 @@ class Fediverse:
for r in loop.run_until_complete(asyncio.gather(*tasks)):
results.extend(r)
return results
class ExternalSources:
@classmethod
def search(cls, c, q, page=1):
if not q:
return []
results = []
results.extend(
Fediverse.search(q, page, category=c if c and c != "all" else None)
)
if c == "" or c is None:
c = "all"
if c == "all" or c == "movietv":
results.extend(TheMovieDatabase.search(q, page))
if c == "all" or c == "book":
results.extend(GoogleBooks.search(q, page))
results.extend(Goodreads.search(q, page))
if c == "all" or c == "game":
results.extend(IGDB.search(q, page))
if c == "all" or c == "music":
results.extend(Spotify.search(q, page))
results.extend(Bandcamp.search(q, page))
if c == "podcast":
results.extend(ApplePodcast.search(q, page))
return results

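For reference, a hedged usage sketch of the rewritten entry point: `ExternalSources.search()` stays synchronous for callers, but internally builds one coroutine per fediverse peer plus one per configured site and gathers them on a fresh event loop.

from catalog.search.external import ExternalSources

# Hedged sketch, assuming the relevant API credentials are configured:
results = ExternalSources.search("the left hand of darkness", page=1, category="book")
for r in results:
    # each ExternalSearchResultItem carries a category, source, url and title
    print(r.category, r.display_title, r.source_url)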
View file

@ -79,7 +79,7 @@ class ExternalSearchResultItem:
self.cover_image_url = cover_url
def __repr__(self):
return f"[{self.category}] {self.display_title} {self.url}"
return f"[{self.category}] {self.display_title} {self.source_url}"
@property
def verbose_category_name(self):

View file

@ -159,7 +159,7 @@ def external_search(request):
category = None
keywords = request.GET.get("q", default="").strip()
page_number = int_(request.GET.get("page"), 1)
items = ExternalSources.search(category, keywords, page_number) if keywords else []
items = ExternalSources.search(keywords, page_number, category) if keywords else []
cache_key = f"search_{category if category != 'movietv' else 'movie,tv'}_{keywords}"
dedupe_urls = cache.get(cache_key, [])
items = [i for i in items if i.source_url not in dedupe_urls]

View file

@ -1,6 +1,7 @@
from ..common.sites import SiteManager
from .ao3 import ArchiveOfOurOwn
from .apple_music import AppleMusic
from .apple_podcast import ApplePodcast
from .bandcamp import Bandcamp
from .bangumi import Bangumi
from .bgg import BoardGameGeek
@ -24,12 +25,11 @@ from .steam import Steam
from .tmdb import TMDB_Movie
from .ypshuo import Ypshuo
# from .apple_podcast import ApplePodcast
__all__ = [
"SiteManager",
"ArchiveOfOurOwn",
"AppleMusic",
"ApplePodcast",
"Bandcamp",
"Bangumi",
"BoardGameGeek",

View file

@ -1,16 +1,17 @@
import logging
from urllib.parse import quote_plus
import httpx
from loguru import logger
from catalog.common import *
from catalog.models import *
from .rss import RSS
_logger = logging.getLogger(__name__)
@SiteManager.register
class ApplePodcast(AbstractSite):
# SITE_NAME = SiteName.ApplePodcast
SITE_NAME = SiteName.ApplePodcast
ID_TYPE = IdType.ApplePodcast
URL_PATTERNS = [r"https://[^.]+.apple.com/\w+/podcast/*[^/?]*/id(\d+)"]
WIKI_PROPERTY_ID = "P5842"
@ -38,3 +39,35 @@ class ApplePodcast(AbstractSite):
)
pd.lookup_ids[IdType.RSS] = RSS.url_to_id(feed_url)
return pd
@classmethod
async def search_task(
cls, q: str, page: int, category: str
) -> list[ExternalSearchResultItem]:
if category != "podcast":
return []
SEARCH_PAGE_SIZE = 5 if category == "all" else 10
results = []
search_url = f"https://itunes.apple.com/search?entity=podcast&limit={page * SEARCH_PAGE_SIZE}&term={quote_plus(q)}"
async with httpx.AsyncClient() as client:
try:
response = await client.get(search_url, timeout=2)
r = response.json()
for p in r["results"][(page - 1) * SEARCH_PAGE_SIZE :]:
if p.get("feedUrl"):
results.append(
ExternalSearchResultItem(
ItemCategory.Podcast,
SiteName.RSS,
p["feedUrl"],
p["trackName"],
p["artistName"],
"",
p["artworkUrl600"],
)
)
except Exception as e:
logger.error(
"ApplePodcast search error", extra={"query": q, "exception": e}
)
return results

View file

@ -5,6 +5,9 @@ import urllib.parse
import dateparser
import dns.resolver
import httpx
from loguru import logger
from lxml import html
from catalog.common import *
from catalog.models import *
@ -103,3 +106,45 @@ class Bandcamp(AbstractSite):
}
pd = ResourceContent(metadata=data)
return pd
@classmethod
async def search_task(
cls, q: str, page: int, category: str
) -> list[ExternalSearchResultItem]:
if category != "music":
return []
SEARCH_PAGE_SIZE = 5
p = (page - 1) * SEARCH_PAGE_SIZE // 18 + 1
offset = (page - 1) * SEARCH_PAGE_SIZE % 18
results = []
search_url = f"https://bandcamp.com/search?from=results&item_type=a&page={p}&q={urllib.parse.quote_plus(q)}"
async with httpx.AsyncClient() as client:
try:
r = await client.get(search_url, timeout=2)
h = html.fromstring(r.content.decode("utf-8"))
albums = h.xpath('//li[@class="searchresult data-search"]')
for c in albums: # type:ignore
el_cover = c.xpath('.//div[@class="art"]/img/@src')
cover = el_cover[0] if el_cover else ""
el_title = c.xpath('.//div[@class="heading"]//text()')
title = "".join(el_title).strip() if el_title else "Unknown Title"
el_url = c.xpath('..//div[@class="itemurl"]/a/@href')
url = el_url[0] if el_url else ""
el_authors = c.xpath('.//div[@class="subhead"]//text()')
subtitle = ", ".join(el_authors) if el_authors else ""
results.append(
ExternalSearchResultItem(
ItemCategory.Music,
SiteName.Bandcamp,
url,
title,
subtitle,
"",
cover,
)
)
except Exception as e:
logger.error(
"Bandcamp search error", extra={"query": q, "exception": e}
)
return results[offset : offset + SEARCH_PAGE_SIZE]

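The paging arithmetic above (and the equivalent blocks in the Goodreads, TMDB and Fediverse tasks, which use an upstream page size of 20) maps NeoDB's small result pages onto the provider's larger pages; a worked example:

# Bandcamp serves 18 results per page while NeoDB asks for 5 per page.
SEARCH_PAGE_SIZE = 5
for page in (1, 2, 3, 4):
    p = (page - 1) * SEARCH_PAGE_SIZE // 18 + 1   # upstream page to request
    offset = (page - 1) * SEARCH_PAGE_SIZE % 18   # slice start within that page
    print(page, p, offset)
# page 1 -> upstream page 1, items 0-4
# page 2 -> upstream page 1, items 5-9
# page 3 -> upstream page 1, items 10-14
# page 4 -> upstream page 1, items 15-17 (the slice is clipped at the page end)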
View file

@ -1,9 +1,33 @@
from urllib.parse import quote_plus, urlparse
import httpx
from django.conf import settings
from django.core.validators import URLValidator
from loguru import logger
from catalog.common import *
from catalog.models import *
from catalog.common import (
AbstractSite,
BasicImageDownloader,
CachedDownloader,
IdType,
ItemCategory,
ResourceContent,
SiteManager,
SiteName,
)
from catalog.models import (
Album,
Edition,
ExternalSearchResultItem,
Game,
Movie,
Performance,
PerformanceProduction,
Podcast,
TVEpisode,
TVSeason,
TVShow,
)
@SiteManager.register
@ -99,3 +123,56 @@ class FediverseInstance(AbstractSite):
lookup_ids=ids,
)
return d
@classmethod
async def peer_search_task(cls, host, q, page, category=None):
SEARCH_PAGE_SIZE = 5
p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
offset = (page - 1) * SEARCH_PAGE_SIZE % 20
api_url = f"https://{host}/api/catalog/search?query={quote_plus(q)}&page={p}{'&category=' + category if category and category != 'all' else ''}"
async with httpx.AsyncClient() as client:
results = []
try:
response = await client.get(
api_url,
timeout=2,
)
r = response.json()
except Exception as e:
logger.error(
f"Fediverse search {host} error",
extra={"url": api_url, "query": q, "exception": e},
)
return []
if "data" in r:
for item in r["data"]:
if any(
urlparse(res["url"]).hostname in settings.SITE_DOMAINS
for res in item.get("external_resources", [])
):
continue
url = f"https://{host}{item['url']}" # FIXME update API and use abs urls
try:
cat = ItemCategory(item["category"])
except Exception:
cat = None
results.append(
ExternalSearchResultItem(
cat,
host,
url,
item["display_title"],
"",
item["brief"],
item["cover_image_url"],
)
)
return results[offset : offset + SEARCH_PAGE_SIZE]
@classmethod
def search_tasks(cls, q: str, page: int = 1, category: str | None = None):
from takahe.utils import Takahe
peers = Takahe.get_neodb_peers()
c = category if category != "movietv" else "movie,tv"
return [cls.peer_search_task(host, q, page, c) for host in peers]

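A hedged sketch of how the peer tasks are consumed: `search_tasks()` returns un-awaited coroutines, one per known peer, which `ExternalSources.search()` gathers alongside the per-site `search_task` coroutines.

import asyncio

from catalog.sites.fedi import FediverseInstance

async def demo(query: str) -> None:
    # one coroutine per peer returned by Takahe.get_neodb_peers()
    tasks = FediverseInstance.search_tasks(query, page=1, category="book")
    for results in await asyncio.gather(*tasks):
        for item in results:
            print(item)

# asyncio.run(demo("hyperion"))  # requires a running instance with known peers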
View file

@ -1,18 +1,18 @@
import json
import logging
from datetime import datetime
from urllib.parse import quote_plus
import httpx
from django.utils.timezone import make_aware
from loguru import logger
from lxml import html
from catalog.book.models import Edition, Work
from catalog.book.utils import binding_to_format, detect_isbn_asin
from catalog.common import *
from common.models.lang import detect_language
from catalog.models import Edition, ExternalSearchResultItem, Work
from common.models import detect_language
from journal.models.renderers import html_to_text
_logger = logging.getLogger(__name__)
class GoodreadsDownloader(RetryDownloader):
def validate_response(self, response):
@ -121,6 +121,82 @@ class Goodreads(AbstractSite):
pd.lookup_ids[IdType.ASIN] = ids.get(IdType.ASIN)
return pd
@classmethod
async def search_task(
cls, q: str, page: int, category: str
) -> list[ExternalSearchResultItem]:
if category not in ["all", "book"]:
return []
SEARCH_PAGE_SIZE = 5
p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
offset = (page - 1) * SEARCH_PAGE_SIZE % 20
results = []
search_url = f"https://www.goodreads.com/search?page={p}&q={quote_plus(q)}"
async with httpx.AsyncClient() as client:
try:
r = await client.get(
search_url,
timeout=3,
headers={
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": BasicDownloader.get_accept_language(),
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"DNT": "1",
"Upgrade-Insecure-Requests": "1",
"Cache-Control": "no-cache",
},
)
if r.url.path.startswith("/book/show/"):
# Goodreads will 302 if only one result matches ISBN
site = SiteManager.get_site_by_url(str(r.url))
if site:
res = site.get_resource_ready()
if res:
subtitle = f"{res.metadata.get('pub_year')} {', '.join(res.metadata.get('author', []))} {', '.join(res.metadata.get('translator', []))}"
results.append(
ExternalSearchResultItem(
ItemCategory.Book,
SiteName.Goodreads,
res.url,
res.metadata["title"],
subtitle,
res.metadata.get("brief", ""),
res.metadata.get("cover_image_url", ""),
)
)
else:
h = html.fromstring(r.content.decode("utf-8"))
books = h.xpath('//tr[@itemtype="http://schema.org/Book"]')
for c in books: # type:ignore
el_cover = c.xpath('.//img[@class="bookCover"]/@src')
cover = el_cover[0] if el_cover else ""
el_title = c.xpath('.//a[@class="bookTitle"]//text()')
title = (
"".join(el_title).strip() if el_title else "Unkown Title"
)
el_url = c.xpath('.//a[@class="bookTitle"]/@href')
url = "https://www.goodreads.com" + el_url[0] if el_url else ""
el_authors = c.xpath('.//a[@class="authorName"]//text()')
subtitle = ", ".join(el_authors) if el_authors else ""
results.append(
ExternalSearchResultItem(
ItemCategory.Book,
SiteName.Goodreads,
url,
title,
subtitle,
"",
cover,
)
)
except Exception as e:
logger.error(
"Goodreads search error", extra={"query": q, "exception": e}
)
return results[offset : offset + SEARCH_PAGE_SIZE]
@SiteManager.register
class Goodreads_Work(AbstractSite):

View file

@ -1,7 +1,10 @@
import logging
import re
from urllib.parse import quote_plus
import httpx
from django.conf import settings
from loguru import logger
from catalog.book.utils import isbn_10_to_13
from catalog.common import *
@ -116,3 +119,57 @@ class GoogleBooks(AbstractSite):
cover_image_extention=ext,
lookup_ids={IdType.ISBN: isbn13},
)
@classmethod
async def search_task(
cls, q: str, page: int, category: str
) -> list[ExternalSearchResultItem]:
if category not in ["all", "book"]:
return []
SEARCH_PAGE_SIZE = 5
results = []
api_url = f"https://www.googleapis.com/books/v1/volumes?country=us&q={quote_plus(q)}&startIndex={SEARCH_PAGE_SIZE * (page - 1)}&maxResults={SEARCH_PAGE_SIZE}&maxAllowedMaturityRating=MATURE"
async with httpx.AsyncClient() as client:
try:
response = await client.get(api_url, timeout=2)
j = response.json()
if "items" in j:
for b in j["items"]:
if "title" not in b["volumeInfo"]:
continue
title = b["volumeInfo"]["title"]
subtitle = ""
if "publishedDate" in b["volumeInfo"]:
subtitle += b["volumeInfo"]["publishedDate"] + " "
if "authors" in b["volumeInfo"]:
subtitle += ", ".join(b["volumeInfo"]["authors"])
if "description" in b["volumeInfo"]:
brief = b["volumeInfo"]["description"]
elif "textSnippet" in b["volumeInfo"]:
brief = b["volumeInfo"]["textSnippet"]["searchInfo"]
else:
brief = ""
category = ItemCategory.Book
# b['volumeInfo']['infoLink'].replace('http:', 'https:')
url = "https://books.google.com/books?id=" + b["id"]
cover = (
b["volumeInfo"]["imageLinks"]["thumbnail"]
if "imageLinks" in b["volumeInfo"]
else ""
)
results.append(
ExternalSearchResultItem(
category,
SiteName.GoogleBooks,
url,
title,
subtitle,
brief,
cover,
)
)
except Exception as e:
logger.error(
"GoogleBooks search error", extra={"query": q, "exception": e}
)
return results

View file

@ -8,6 +8,7 @@ import datetime
import json
from urllib.parse import quote_plus
import httpx
import requests
from django.conf import settings
from django.core.cache import cache
@ -83,44 +84,6 @@ class IGDB(AbstractSite):
fp.write(json.dumps(r))
return r
@classmethod
def search(cls, q, limit: int, offset: int = 0):
rs = cls.api_query(
"games",
f'fields *, cover.url, genres.name, platforms.name, involved_companies.*, involved_companies.company.name; search "{quote_plus(q)}"; limit {limit}; offset {offset};',
)
result = []
for r in rs:
subtitle = ""
if "first_release_date" in r:
subtitle = datetime.datetime.fromtimestamp(
r["first_release_date"], datetime.timezone.utc
).strftime("%Y-%m-%d ")
if "platforms" in r:
ps = sorted(r["platforms"], key=lambda p: p["id"])
subtitle += ",".join(
[(p["name"] if p["id"] != 6 else "Windows") for p in ps]
)
brief = r["summary"] if "summary" in r else ""
brief += "\n\n" + r["storyline"] if "storyline" in r else ""
cover = (
"https:" + r["cover"]["url"].replace("t_thumb", "t_cover_big")
if r.get("cover")
else ""
)
result.append(
ExternalSearchResultItem(
ItemCategory.Game,
SiteName.IGDB,
r["url"],
r["name"],
subtitle,
brief,
cover,
)
)
return result
def scrape(self):
fields = "*, cover.url, genres.name, platforms.name, involved_companies.*, involved_companies.company.name"
r = self.api_query("games", f'fields {fields}; where url = "{self.url}";')
@ -200,3 +163,55 @@ class IGDB(AbstractSite):
IdType.Steam
).url_to_id(steam_url)
return pd
@classmethod
async def search_task(
cls, q: str, page: int, category: str
) -> list[ExternalSearchResultItem]:
if category != "game":
return []
SEARCH_PAGE_SIZE = 5 if category == "all" else 10
limit = SEARCH_PAGE_SIZE
offset = (page - 1) * limit
q = f'fields *, cover.url, genres.name, platforms.name, involved_companies.*, involved_companies.company.name; search "{quote_plus(q)}"; limit {limit}; offset {offset};'
_wrapper = IGDBWrapper(settings.IGDB_CLIENT_ID, _igdb_access_token())
async with httpx.AsyncClient() as client:
try:
url = IGDBWrapper._build_url("games")
params = _wrapper._compose_request(q)
response = await client.post(url, **params)
rs = json.loads(response.content)
except requests.HTTPError as e:
logger.error(f"IGDB API: {e}", extra={"exception": e})
rs = []
result = []
for r in rs:
subtitle = ""
if "first_release_date" in r:
subtitle = datetime.datetime.fromtimestamp(
r["first_release_date"], datetime.timezone.utc
).strftime("%Y-%m-%d ")
if "platforms" in r:
ps = sorted(r["platforms"], key=lambda p: p["id"])
subtitle += ",".join(
[(p["name"] if p["id"] != 6 else "Windows") for p in ps]
)
brief = r["summary"] if "summary" in r else ""
brief += "\n\n" + r["storyline"] if "storyline" in r else ""
cover = (
"https:" + r["cover"]["url"].replace("t_thumb", "t_cover_big")
if r.get("cover")
else ""
)
result.append(
ExternalSearchResultItem(
ItemCategory.Game,
SiteName.IGDB,
r["url"],
r["name"],
subtitle,
brief,
cover,
)
)
return result

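Individual site tasks can also be awaited directly, which may be handy when debugging a single source; a hedged sketch (requires IGDB API credentials in settings):

import asyncio

from catalog.sites.igdb import IGDB

# Hedged sketch: run one source's search_task outside ExternalSources.
games = asyncio.run(IGDB.search_task("hades", page=1, category="game"))
for g in games:
    print(g.display_title, g.source_url)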
View file

@ -6,8 +6,10 @@ import logging
import time
import dateparser
import httpx
import requests
from django.conf import settings
from loguru import logger
from catalog.common import *
from catalog.models import *
@ -107,6 +109,45 @@ class Spotify(AbstractSite):
pd.lookup_ids[IdType.ISRC] = isrc
return pd
@classmethod
async def search_task(
cls, q: str, page: int, category: str
) -> list[ExternalSearchResultItem]:
if category not in ["music", "all"]:
return []
SEARCH_PAGE_SIZE = 5
results = []
api_url = f"https://api.spotify.com/v1/search?q={q}&type=album&limit={SEARCH_PAGE_SIZE}&offset={page * SEARCH_PAGE_SIZE}"
async with httpx.AsyncClient() as client:
try:
headers = {"Authorization": f"Bearer {get_spotify_token()}"}
response = await client.get(api_url, headers=headers, timeout=2)
j = response.json()
if j.get("albums"):
for a in j["albums"]["items"]:
title = a["name"]
subtitle = a.get("release_date", "")
for artist in a.get("artists", []):
subtitle += " " + artist.get("name", "")
url = a["external_urls"]["spotify"]
cover = a["images"][0]["url"] if a.get("images") else ""
results.append(
ExternalSearchResultItem(
ItemCategory.Music,
SiteName.Spotify,
url,
title,
subtitle,
"",
cover,
)
)
else:
logger.warning(f"Spotify search '{q}' no results found.")
except Exception as e:
logger.error("Spotify search error", extra={"query": q, "exception": e})
return results
def get_spotify_token():
global spotify_token, spotify_token_expire_time

View file

@ -12,8 +12,11 @@ these language code from TMDB are not in currently iso-639-1
import logging
import re
from urllib.parse import quote_plus
import httpx
from django.conf import settings
from loguru import logger
from catalog.common import *
from catalog.movie.models import *
@ -175,6 +178,55 @@ class TMDB_Movie(AbstractSite):
pd.lookup_ids[IdType.IMDB] = imdb_code
return pd
@classmethod
async def search_task(
cls, q: str, page: int, category: str
) -> list[ExternalSearchResultItem]:
if category not in ["movietv", "all", "movie", "tv"]:
return []
SEARCH_PAGE_SIZE = 5 if category == "all" else 10
p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
offset = (page - 1) * SEARCH_PAGE_SIZE % 20
results = []
api_url = f"https://api.themoviedb.org/3/search/multi?query={quote_plus(q)}&page={p}&api_key={settings.TMDB_API3_KEY}&language={TMDB_DEFAULT_LANG}&include_adult=true"
async with httpx.AsyncClient() as client:
try:
response = await client.get(api_url, timeout=2)
j = response.json()
if j.get("results"):
for m in j["results"]:
if m["media_type"] in ["tv", "movie"]:
url = f"https://www.themoviedb.org/{m['media_type']}/{m['id']}"
if m["media_type"] == "tv":
cat = ItemCategory.TV
title = m["name"]
subtitle = f"{m.get('first_air_date', '')} {m.get('original_name', '')}"
else:
cat = ItemCategory.Movie
title = m["title"]
subtitle = f"{m.get('release_date', '')} {m.get('original_name', '')}"
cover = (
f"https://image.tmdb.org/t/p/w500/{m.get('poster_path')}"
if m.get("poster_path")
else ""
)
results.append(
ExternalSearchResultItem(
cat,
SiteName.TMDB,
url,
title,
subtitle,
m.get("overview"),
cover,
)
)
else:
logger.warning(f"TMDB search '{q}' no results found.")
except Exception as e:
logger.error("TMDb search error", extra={"query": q, "exception": e})
return results[offset : offset + SEARCH_PAGE_SIZE]
@SiteManager.register
class TMDB_TV(AbstractSite):

View file

@ -33,6 +33,7 @@ x-shared:
NEODB_DISABLE_DEFAULT_RELAY:
NEODB_DISABLE_CRON_JOBS:
NEODB_SEARCH_PEERS:
NEODB_SEARCH_SITES:
NEODB_MIN_MARKS_FOR_DISCOVER:
NEODB_DISCOVER_UPDATE_INTERVAL:
NEODB_DISCOVER_FILTER_LANGUAGE:

View file

@ -57,6 +57,7 @@ if you are doing debug or development:
- `GOOGLE_API_KEY` - API key for [Google Books](https://developers.google.com/books/docs/v1/using)
- `DISCOGS_API_KEY` - personal access token from [Discogs](https://www.discogs.com/settings/developers)
- `IGDB_API_CLIENT_ID`, `IGDB_API_CLIENT_SECRET` - IGDB [keys](https://api-docs.igdb.com/)
- `NEODB_SEARCH_SITES` is empty by default, which means NeoDB searches all available sources. It can be set to a comma-separated list of site names (e.g. `goodreads,googlebooks,spotify,tmdb,igdb,bandcamp,apple_podcast`) so that NeoDB only searches those sites, or to just `-` to skip external site search entirely, for example:
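    NEODB_SEARCH_SITES=                      # default: query every source that supports search
    NEODB_SEARCH_SITES=goodreads,tmdb,igdb   # only query these sites
    NEODB_SEARCH_SITES=-                     # skip external site search

  (Values above are illustrative.) Federated peers are controlled separately via `NEODB_SEARCH_PEERS` and are not affected by this setting.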
## Other maintenance tasks

View file

@ -169,7 +169,7 @@ mkdocs==1.6.1
# via mkdocs-material
mkdocs-get-deps==0.2.0
# via mkdocs
mkdocs-material==9.5.49
mkdocs-material==9.5.50
mkdocs-material-extensions==1.3.1
# via mkdocs-material
multidict==6.1.0
@ -213,7 +213,7 @@ pygments==2.19.1
# via mkdocs-material
pymdown-extensions==10.14
# via mkdocs-material
pyright==1.1.391
pyright==1.1.392.post0
python-dateutil==2.9.0.post0
# via dateparser
# via django-auditlog
@ -251,7 +251,7 @@ rjsmin==1.2.2
# via django-compressor
rq==2.1.0
# via django-rq
ruff==0.9.1
ruff==0.9.2
sentry-sdk==2.20.0
setproctitle==1.3.4
six==1.17.0
@ -292,7 +292,7 @@ urllib3==2.3.0
# via sentry-sdk
urlman==2.0.2
validators==0.34.0
virtualenv==20.28.1
virtualenv==20.29.1
# via pre-commit
watchdog==6.0.0
# via mkdocs