make external search async and configurable

Your Name 2025-01-18 15:53:06 -05:00 committed by Henri Dickson
parent 2826bc60dc
commit 90386bbf1a
20 changed files with 523 additions and 414 deletions

View file

@ -92,6 +92,8 @@ env = environ.FileAwareEnv(
NEODB_DISCOVER_UPDATE_INTERVAL=(int, 60),
# Disable cron jobs, * for all
NEODB_DISABLE_CRON_JOBS=(list, []),
# search sites
NEODB_SEARCH_SITES=(list, []),
# federated search peers
NEODB_SEARCH_PEERS=(list, []),
# INTEGRATED TAKAHE CONFIGURATION
@ -282,6 +284,7 @@ DOWNLOADER_RETRIES = env("NEODB_DOWNLOADER_RETRIES")
DISABLE_CRON_JOBS = env("NEODB_DISABLE_CRON_JOBS")
SEARCH_PEERS = env("NEODB_SEARCH_PEERS")
SEARCH_SITES = env("NEODB_SEARCH_SITES")
FANOUT_LIMIT_DAYS = env("NEODB_FANOUT_LIMIT_DAYS")
# ====== USER CONFIGURATION END ======

View file

@ -43,7 +43,7 @@ class SiteName(models.TextChoices):
Steam = "steam", _("Steam") # type:ignore[reportCallIssue]
Bangumi = "bangumi", _("Bangumi") # type:ignore[reportCallIssue]
BGG = "bgg", _("BGG") # type:ignore[reportCallIssue]
# ApplePodcast = "apple_podcast", _("Apple Podcast") # type:ignore[reportCallIssue]
ApplePodcast = "apple_podcast", _("Apple Podcast") # type:ignore[reportCallIssue]
RSS = "rss", _("RSS") # type:ignore[reportCallIssue]
Discogs = "discogs", _("Discogs") # type:ignore[reportCallIssue]
AppleMusic = "apple_music", _("Apple Music") # type:ignore[reportCallIssue]

View file

@ -14,6 +14,7 @@ from typing import Type, TypeVar
import django_rq
import requests
from django.conf import settings
from loguru import logger
from validators import url as url_validate
@ -91,6 +92,13 @@ class AbstractSite:
)
return self.resource
# add this method to subclass to enable external search
# @classmethod
# async def search_task(
# cls, query: str, page: int, category: str
# ) -> list[ExternalSearchResultItem]:
# return []
def scrape(self) -> ResourceContent:
"""subclass should implement this, return ResourceContent object"""
data = ResourceContent()
@ -340,6 +348,17 @@ class SiteManager:
def get_all_sites():
return SiteManager.registry.values()
@staticmethod
def get_sites_for_search():
if settings.SEARCH_SITES == ["-"]:
return []
sites = [
cls for cls in SiteManager.get_all_sites() if hasattr(cls, "search_task")
]
if settings.SEARCH_SITES == ["*"] or not settings.SEARCH_SITES:
return sites
return [s for s in sites if s.SITE_NAME.value in settings.SEARCH_SITES]
def crawl_related_resources_task(resource_pk):
resource = ExternalResource.objects.filter(pk=resource_pk).first()

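To illustrate how the new `NEODB_SEARCH_SITES` setting interacts with `get_sites_for_search()`, here is a rough sketch (not part of this commit) that could be run in a Django shell; the exact set of classes returned depends on which registered sites define a `search_task` classmethod:

from catalog.common.sites import SiteManager

# Hedged sketch: list the site classes external search would query
# under the current NEODB_SEARCH_SITES value.
for site_cls in SiteManager.get_sites_for_search():
    print(site_cls.SITE_NAME.value)

# Expected behaviour per the code above:
#   NEODB_SEARCH_SITES unset or "*"  -> every registered site defining search_task
#   NEODB_SEARCH_SITES=goodreads     -> only the site whose SITE_NAME is "goodreads"
#   NEODB_SEARCH_SITES=-             -> nothing (external site search disabled)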
View file

@ -1,16 +1,28 @@
import time
from django.contrib.contenttypes.models import ContentType
from django.core.management.base import BaseCommand
from django.db.models import Count, F
from tqdm import tqdm
from catalog.common.sites import SiteManager
from catalog.models import Edition, Item, Podcast, TVSeason, TVShow
from catalog.search.external import ExternalSources
from common.models import detect_language, uniq
from takahe.utils import Takahe
class Command(BaseCommand):
help = "catalog app utilities"
def add_arguments(self, parser):
parser.add_argument(
"--extsearch",
)
parser.add_argument(
"--category",
default="all",
)
parser.add_argument(
"--verbose",
action="store_true",
@ -44,8 +56,26 @@ class Command(BaseCommand):
self.integrity()
if options["localize"]:
self.localize()
if options["extsearch"]:
self.external_search(options["extsearch"], options["category"])
self.stdout.write(self.style.SUCCESS("Done."))
def external_search(self, q, cat):
sites = SiteManager.get_sites_for_search()
peers = Takahe.get_neodb_peers()
self.stdout.write(f"Searching {cat} '{q}' ...")
self.stdout.write(f"Peers: {peers}")
self.stdout.write(f"Sites: {sites}")
start_time = time.time()
results = ExternalSources.search(q, 1, cat)
for r in results:
self.stdout.write(f"{r}")
self.stdout.write(
self.style.SUCCESS(
f"{time.time() - start_time} seconds, {len(results)} items."
)
)
def localize(self):
c = Item.objects.all().count()
qs = Item.objects.filter(is_deleted=False, merged_to_item__isnull=True)

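Assuming the management command shown above is registered as `catalog` (its filename is not visible in this view), the new option can be exercised roughly like this; it prints the configured peers and sites, one line per result, and the elapsed time:

python manage.py catalog --extsearch "the dispossessed" --category book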
View file

@ -38,7 +38,7 @@ from .tv.models import (
TVShowSchema,
)
from .search.models import Indexer # isort:skip
from .search.models import Indexer, ExternalSearchResultItem # isort:skip
# class Exhibition(Item):
@ -103,6 +103,7 @@ __all__ = [
"CatalogCollection",
"AvailableItemCategory",
"ExternalResource",
"ExternalSearchResultItem",
"IdType",
"Item",
"ItemCategory",

View file

@ -1,342 +1,26 @@
import asyncio
import logging
from urllib.parse import quote_plus, urlparse
import httpx
import requests
from django.conf import settings
from lxml import html
from catalog.common import BasicDownloader, ItemCategory, SiteManager, SiteName
from catalog.common import SiteManager
from catalog.search.models import ExternalSearchResultItem
from catalog.sites.igdb import IGDB as IGDB_Site
from catalog.sites.spotify import get_spotify_token
from catalog.sites.tmdb import TMDB_DEFAULT_LANG
from catalog.sites.fedi import FediverseInstance
SEARCH_PAGE_SIZE = 5 # not all apis support page size
logger = logging.getLogger(__name__)
class Goodreads:
class ExternalSources:
@classmethod
def search(cls, q: str, page=1):
results = []
search_url = f"https://www.goodreads.com/search?page={page}&q={quote_plus(q)}"
try:
r = requests.get(
search_url,
timeout=3,
headers={
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": BasicDownloader.get_accept_language(),
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"DNT": "1",
"Upgrade-Insecure-Requests": "1",
"Cache-Control": "no-cache",
},
)
if r.url.startswith("https://www.goodreads.com/book/show/"):
# Goodreads will 302 if only one result matches ISBN
site = SiteManager.get_site_by_url(r.url)
if site:
res = site.get_resource_ready()
if res:
subtitle = f"{res.metadata.get('pub_year')} {', '.join(res.metadata.get('author', []))} {', '.join(res.metadata.get('translator', []))}"
results.append(
ExternalSearchResultItem(
ItemCategory.Book,
SiteName.Goodreads,
res.url,
res.metadata["title"],
subtitle,
res.metadata.get("brief", ""),
res.metadata.get("cover_image_url", ""),
)
)
else:
h = html.fromstring(r.content.decode("utf-8"))
books = h.xpath('//tr[@itemtype="http://schema.org/Book"]')
for c in books: # type:ignore
el_cover = c.xpath('.//img[@class="bookCover"]/@src')
cover = el_cover[0] if el_cover else ""
el_title = c.xpath('.//a[@class="bookTitle"]//text()')
title = "".join(el_title).strip() if el_title else "Unkown Title"
el_url = c.xpath('.//a[@class="bookTitle"]/@href')
url = "https://www.goodreads.com" + el_url[0] if el_url else ""
el_authors = c.xpath('.//a[@class="authorName"]//text()')
subtitle = ", ".join(el_authors) if el_authors else ""
results.append(
ExternalSearchResultItem(
ItemCategory.Book,
SiteName.Goodreads,
url,
title,
subtitle,
"",
cover,
)
)
except requests.exceptions.RequestException as e:
logger.warning(f"Search {search_url} error: {e}")
except Exception as e:
logger.error("Goodreads search error", extra={"query": q, "exception": e})
return results
class GoogleBooks:
@classmethod
def search(cls, q, page=1):
results = []
api_url = f"https://www.googleapis.com/books/v1/volumes?country=us&q={quote_plus(q)}&startIndex={SEARCH_PAGE_SIZE * (page - 1)}&maxResults={SEARCH_PAGE_SIZE}&maxAllowedMaturityRating=MATURE"
try:
j = requests.get(api_url, timeout=2).json()
if "items" in j:
for b in j["items"]:
if "title" not in b["volumeInfo"]:
continue
title = b["volumeInfo"]["title"]
subtitle = ""
if "publishedDate" in b["volumeInfo"]:
subtitle += b["volumeInfo"]["publishedDate"] + " "
if "authors" in b["volumeInfo"]:
subtitle += ", ".join(b["volumeInfo"]["authors"])
if "description" in b["volumeInfo"]:
brief = b["volumeInfo"]["description"]
elif "textSnippet" in b["volumeInfo"]:
brief = b["volumeInfo"]["textSnippet"]["searchInfo"]
else:
brief = ""
category = ItemCategory.Book
# b['volumeInfo']['infoLink'].replace('http:', 'https:')
url = "https://books.google.com/books?id=" + b["id"]
cover = (
b["volumeInfo"]["imageLinks"]["thumbnail"]
if "imageLinks" in b["volumeInfo"]
else ""
)
results.append(
ExternalSearchResultItem(
category,
SiteName.GoogleBooks,
url,
title,
subtitle,
brief,
cover,
)
)
except requests.exceptions.RequestException as e:
logger.warning(f"Search {api_url} error: {e}")
except Exception as e:
logger.error("GoogleBooks search error", extra={"query": q, "exception": e})
return results
class TheMovieDatabase:
@classmethod
def search(cls, q, page=1):
results = []
api_url = f"https://api.themoviedb.org/3/search/multi?query={quote_plus(q)}&page={page}&api_key={settings.TMDB_API3_KEY}&language={TMDB_DEFAULT_LANG}&include_adult=true"
try:
j = requests.get(api_url, timeout=2).json()
if j.get("results"):
for m in j["results"]:
if m["media_type"] in ["tv", "movie"]:
url = f"https://www.themoviedb.org/{m['media_type']}/{m['id']}"
if m["media_type"] == "tv":
cat = ItemCategory.TV
title = m["name"]
subtitle = f"{m.get('first_air_date', '')} {m.get('original_name', '')}"
else:
cat = ItemCategory.Movie
title = m["title"]
subtitle = f"{m.get('release_date', '')} {m.get('original_name', '')}"
cover = (
f"https://image.tmdb.org/t/p/w500/{m.get('poster_path')}"
if m.get("poster_path")
else ""
)
results.append(
ExternalSearchResultItem(
cat,
SiteName.TMDB,
url,
title,
subtitle,
m.get("overview"),
cover,
)
)
else:
logger.warning(f"TMDB search '{q}' no results found.")
except requests.exceptions.RequestException as e:
logger.warning(f"Search {api_url} error: {e}")
except Exception as e:
logger.error("TMDb search error", extra={"query": q, "exception": e})
return results
class Spotify:
@classmethod
def search(cls, q, page=1):
results = []
api_url = f"https://api.spotify.com/v1/search?q={q}&type=album&limit={SEARCH_PAGE_SIZE}&offset={page * SEARCH_PAGE_SIZE}"
try:
headers = {"Authorization": f"Bearer {get_spotify_token()}"}
j = requests.get(api_url, headers=headers, timeout=2).json()
if j.get("albums"):
for a in j["albums"]["items"]:
title = a["name"]
subtitle = a.get("release_date", "")
for artist in a.get("artists", []):
subtitle += " " + artist.get("name", "")
url = a["external_urls"]["spotify"]
cover = a["images"][0]["url"] if a.get("images") else ""
results.append(
ExternalSearchResultItem(
ItemCategory.Music,
SiteName.Spotify,
url,
title,
subtitle,
"",
cover,
)
)
else:
logger.warning(f"Spotify search '{q}' no results found.")
except requests.exceptions.RequestException as e:
logger.warning(f"Search {api_url} error: {e}")
except Exception as e:
logger.error("Spotify search error", extra={"query": q, "exception": e})
return results
class Bandcamp:
@classmethod
def search(cls, q, page=1):
results = []
search_url = f"https://bandcamp.com/search?from=results&item_type=a&page={page}&q={quote_plus(q)}"
try:
r = requests.get(search_url, timeout=2)
h = html.fromstring(r.content.decode("utf-8"))
albums = h.xpath('//li[@class="searchresult data-search"]')
for c in albums: # type:ignore
el_cover = c.xpath('.//div[@class="art"]/img/@src')
cover = el_cover[0] if el_cover else ""
el_title = c.xpath('.//div[@class="heading"]//text()')
title = "".join(el_title).strip() if el_title else "Unknown Title"
el_url = c.xpath('..//div[@class="itemurl"]/a/@href')
url = el_url[0] if el_url else ""
el_authors = c.xpath('.//div[@class="subhead"]//text()')
subtitle = ", ".join(el_authors) if el_authors else ""
results.append(
ExternalSearchResultItem(
ItemCategory.Music,
SiteName.Bandcamp,
url,
title,
subtitle,
"",
cover,
)
)
except requests.exceptions.RequestException as e:
logger.warning(f"Search {search_url} error: {e}")
except Exception as e:
logger.error("Bandcamp search error", extra={"query": q, "exception": e})
return results
class ApplePodcast:
@classmethod
def search(cls, q, page=1):
results = []
search_url = f"https://itunes.apple.com/search?entity=podcast&limit={page * SEARCH_PAGE_SIZE}&term={quote_plus(q)}"
try:
r = requests.get(search_url, timeout=2).json()
for p in r["results"][(page - 1) * SEARCH_PAGE_SIZE :]:
if p.get("feedUrl"):
results.append(
ExternalSearchResultItem(
ItemCategory.Podcast,
SiteName.RSS,
p["feedUrl"],
p["trackName"],
p["artistName"],
"",
p["artworkUrl600"],
)
)
except requests.exceptions.RequestException as e:
logger.warning(f"Search {search_url} error: {e}")
except Exception as e:
logger.error(
"ApplePodcast search error", extra={"query": q, "exception": e}
)
return results
class IGDB:
@classmethod
def search(cls, q, page=1):
return IGDB_Site.search(
q, limit=SEARCH_PAGE_SIZE, offset=page * SEARCH_PAGE_SIZE
)
class Fediverse:
@staticmethod
async def search_task(host, q, category=None):
api_url = f"https://{host}/api/catalog/search?query={quote_plus(q)}{'&category=' + category if category else ''}"
async with httpx.AsyncClient() as client:
results = []
try:
response = await client.get(
api_url,
timeout=2,
)
r = response.json()
except Exception as e:
logger.error(
f"Fediverse search {host} error",
extra={"url": api_url, "query": q, "exception": e},
)
return []
if "data" in r:
for item in r["data"]:
if any(
urlparse(res["url"]).hostname in settings.SITE_DOMAINS
for res in item.get("external_resources", [])
):
continue
url = f"https://{host}{item['url']}" # FIXME update API and use abs urls
try:
cat = ItemCategory(item["category"])
except Exception:
cat = None
results.append(
ExternalSearchResultItem(
cat,
host,
url,
item["display_title"],
"",
item["brief"],
item["cover_image_url"],
)
)
return results
@classmethod
def search(cls, q: str, page: int = 1, category: str | None = None):
from takahe.utils import Takahe
peers = Takahe.get_neodb_peers()
c = category if category != "movietv" else "movie,tv"
tasks = [Fediverse.search_task(host, q, c) for host in peers]
def search(
cls, query: str, page: int = 1, category: str | None = None
) -> list[ExternalSearchResultItem]:
if not query or page < 1 or page > 10:
return []
if category in ["", None]:
category = "all"
tasks = FediverseInstance.search_tasks(query, page, category)
for site in SiteManager.get_sites_for_search():
tasks.append(site.search_task(query, page, category))
# loop = asyncio.get_event_loop()
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
@ -344,29 +28,3 @@ class Fediverse:
for r in loop.run_until_complete(asyncio.gather(*tasks)):
results.extend(r)
return results
class ExternalSources:
@classmethod
def search(cls, c, q, page=1):
if not q:
return []
results = []
results.extend(
Fediverse.search(q, page, category=c if c and c != "all" else None)
)
if c == "" or c is None:
c = "all"
if c == "all" or c == "movietv":
results.extend(TheMovieDatabase.search(q, page))
if c == "all" or c == "book":
results.extend(GoogleBooks.search(q, page))
results.extend(Goodreads.search(q, page))
if c == "all" or c == "game":
results.extend(IGDB.search(q, page))
if c == "all" or c == "music":
results.extend(Spotify.search(q, page))
results.extend(Bandcamp.search(q, page))
if c == "podcast":
results.extend(ApplePodcast.search(q, page))
return results

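For reference, a hedged usage sketch of the rewritten entry point: `ExternalSources.search()` stays synchronous for callers, but internally builds one coroutine per fediverse peer plus one per configured site and gathers them on a fresh event loop.

from catalog.search.external import ExternalSources

# Hedged sketch, assuming the relevant API credentials are configured:
results = ExternalSources.search("the left hand of darkness", page=1, category="book")
for r in results:
    # each ExternalSearchResultItem carries a category, source, url and title
    print(r.category, r.display_title, r.source_url)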
View file

@ -79,7 +79,7 @@ class ExternalSearchResultItem:
self.cover_image_url = cover_url
def __repr__(self):
return f"[{self.category}] {self.display_title} {self.url}"
return f"[{self.category}] {self.display_title} {self.source_url}"
@property
def verbose_category_name(self):

View file

@ -159,7 +159,7 @@ def external_search(request):
category = None
keywords = request.GET.get("q", default="").strip()
page_number = int_(request.GET.get("page"), 1)
items = ExternalSources.search(category, keywords, page_number) if keywords else []
items = ExternalSources.search(keywords, page_number, category) if keywords else []
cache_key = f"search_{category if category != 'movietv' else 'movie,tv'}_{keywords}"
dedupe_urls = cache.get(cache_key, [])
items = [i for i in items if i.source_url not in dedupe_urls]

View file

@ -1,6 +1,7 @@
from ..common.sites import SiteManager
from .ao3 import ArchiveOfOurOwn
from .apple_music import AppleMusic
from .apple_podcast import ApplePodcast
from .bandcamp import Bandcamp
from .bangumi import Bangumi
from .bgg import BoardGameGeek
@ -24,12 +25,11 @@ from .steam import Steam
from .tmdb import TMDB_Movie
from .ypshuo import Ypshuo
# from .apple_podcast import ApplePodcast
__all__ = [
"SiteManager",
"ArchiveOfOurOwn",
"AppleMusic",
"ApplePodcast",
"Bandcamp",
"Bangumi",
"BoardGameGeek",

View file

@ -1,16 +1,17 @@
import logging
from urllib.parse import quote_plus
import httpx
from loguru import logger
from catalog.common import *
from catalog.models import *
from .rss import RSS
_logger = logging.getLogger(__name__)
@SiteManager.register
class ApplePodcast(AbstractSite):
# SITE_NAME = SiteName.ApplePodcast
SITE_NAME = SiteName.ApplePodcast
ID_TYPE = IdType.ApplePodcast
URL_PATTERNS = [r"https://[^.]+.apple.com/\w+/podcast/*[^/?]*/id(\d+)"]
WIKI_PROPERTY_ID = "P5842"
@ -38,3 +39,35 @@ class ApplePodcast(AbstractSite):
)
pd.lookup_ids[IdType.RSS] = RSS.url_to_id(feed_url)
return pd
@classmethod
async def search_task(
cls, q: str, page: int, category: str
) -> list[ExternalSearchResultItem]:
if category != "podcast":
return []
SEARCH_PAGE_SIZE = 5 if category == "all" else 10
results = []
search_url = f"https://itunes.apple.com/search?entity=podcast&limit={page * SEARCH_PAGE_SIZE}&term={quote_plus(q)}"
async with httpx.AsyncClient() as client:
try:
response = await client.get(search_url, timeout=2)
r = response.json()
for p in r["results"][(page - 1) * SEARCH_PAGE_SIZE :]:
if p.get("feedUrl"):
results.append(
ExternalSearchResultItem(
ItemCategory.Podcast,
SiteName.RSS,
p["feedUrl"],
p["trackName"],
p["artistName"],
"",
p["artworkUrl600"],
)
)
except Exception as e:
logger.error(
"ApplePodcast search error", extra={"query": q, "exception": e}
)
return results

View file

@ -5,6 +5,9 @@ import urllib.parse
import dateparser
import dns.resolver
import httpx
from loguru import logger
from lxml import html
from catalog.common import *
from catalog.models import *
@ -103,3 +106,45 @@ class Bandcamp(AbstractSite):
}
pd = ResourceContent(metadata=data)
return pd
@classmethod
async def search_task(
cls, q: str, page: int, category: str
) -> list[ExternalSearchResultItem]:
if category != "music":
return []
SEARCH_PAGE_SIZE = 5
p = (page - 1) * SEARCH_PAGE_SIZE // 18 + 1
offset = (page - 1) * SEARCH_PAGE_SIZE % 18
results = []
search_url = f"https://bandcamp.com/search?from=results&item_type=a&page={p}&q={urllib.parse.quote_plus(q)}"
async with httpx.AsyncClient() as client:
try:
r = await client.get(search_url, timeout=2)
h = html.fromstring(r.content.decode("utf-8"))
albums = h.xpath('//li[@class="searchresult data-search"]')
for c in albums: # type:ignore
el_cover = c.xpath('.//div[@class="art"]/img/@src')
cover = el_cover[0] if el_cover else ""
el_title = c.xpath('.//div[@class="heading"]//text()')
title = "".join(el_title).strip() if el_title else "Unknown Title"
el_url = c.xpath('..//div[@class="itemurl"]/a/@href')
url = el_url[0] if el_url else ""
el_authors = c.xpath('.//div[@class="subhead"]//text()')
subtitle = ", ".join(el_authors) if el_authors else ""
results.append(
ExternalSearchResultItem(
ItemCategory.Music,
SiteName.Bandcamp,
url,
title,
subtitle,
"",
cover,
)
)
except Exception as e:
logger.error(
"Bandcamp search error", extra={"query": q, "exception": e}
)
return results[offset : offset + SEARCH_PAGE_SIZE]

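The paging arithmetic above (and the equivalent blocks in the Goodreads, TMDB and Fediverse tasks, which use an upstream page size of 20) maps NeoDB's small result pages onto the provider's larger pages; a worked example:

# Bandcamp serves 18 results per page while NeoDB asks for 5 per page.
SEARCH_PAGE_SIZE = 5
for page in (1, 2, 3, 4):
    p = (page - 1) * SEARCH_PAGE_SIZE // 18 + 1   # upstream page to request
    offset = (page - 1) * SEARCH_PAGE_SIZE % 18   # slice start within that page
    print(page, p, offset)
# page 1 -> upstream page 1, items 0-4
# page 2 -> upstream page 1, items 5-9
# page 3 -> upstream page 1, items 10-14
# page 4 -> upstream page 1, items 15-17 (the slice is clipped at the page end)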
View file

@ -1,9 +1,33 @@
from urllib.parse import quote_plus, urlparse
import httpx
from django.conf import settings
from django.core.validators import URLValidator
from loguru import logger
from catalog.common import *
from catalog.models import *
from catalog.common import (
AbstractSite,
BasicImageDownloader,
CachedDownloader,
IdType,
ItemCategory,
ResourceContent,
SiteManager,
SiteName,
)
from catalog.models import (
Album,
Edition,
ExternalSearchResultItem,
Game,
Movie,
Performance,
PerformanceProduction,
Podcast,
TVEpisode,
TVSeason,
TVShow,
)
@SiteManager.register
@ -99,3 +123,56 @@ class FediverseInstance(AbstractSite):
lookup_ids=ids,
)
return d
@classmethod
async def peer_search_task(cls, host, q, page, category=None):
SEARCH_PAGE_SIZE = 5
p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
offset = (page - 1) * SEARCH_PAGE_SIZE % 20
api_url = f"https://{host}/api/catalog/search?query={quote_plus(q)}&page={p}{'&category=' + category if category and category != 'all' else ''}"
async with httpx.AsyncClient() as client:
results = []
try:
response = await client.get(
api_url,
timeout=2,
)
r = response.json()
except Exception as e:
logger.error(
f"Fediverse search {host} error",
extra={"url": api_url, "query": q, "exception": e},
)
return []
if "data" in r:
for item in r["data"]:
if any(
urlparse(res["url"]).hostname in settings.SITE_DOMAINS
for res in item.get("external_resources", [])
):
continue
url = f"https://{host}{item['url']}" # FIXME update API and use abs urls
try:
cat = ItemCategory(item["category"])
except Exception:
cat = None
results.append(
ExternalSearchResultItem(
cat,
host,
url,
item["display_title"],
"",
item["brief"],
item["cover_image_url"],
)
)
return results[offset : offset + SEARCH_PAGE_SIZE]
@classmethod
def search_tasks(cls, q: str, page: int = 1, category: str | None = None):
from takahe.utils import Takahe
peers = Takahe.get_neodb_peers()
c = category if category != "movietv" else "movie,tv"
return [cls.peer_search_task(host, q, page, c) for host in peers]

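A hedged sketch of how the peer tasks are consumed: `search_tasks()` returns un-awaited coroutines, one per known peer, which `ExternalSources.search()` gathers alongside the per-site `search_task` coroutines.

import asyncio

from catalog.sites.fedi import FediverseInstance

async def demo(query: str) -> None:
    # one coroutine per peer returned by Takahe.get_neodb_peers()
    tasks = FediverseInstance.search_tasks(query, page=1, category="book")
    for results in await asyncio.gather(*tasks):
        for item in results:
            print(item)

# asyncio.run(demo("hyperion"))  # requires a running instance with known peers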
View file

@ -1,18 +1,18 @@
import json
import logging
from datetime import datetime
from urllib.parse import quote_plus
import httpx
from django.utils.timezone import make_aware
from loguru import logger
from lxml import html
from catalog.book.models import Edition, Work
from catalog.book.utils import binding_to_format, detect_isbn_asin
from catalog.common import *
from common.models.lang import detect_language
from catalog.models import Edition, ExternalSearchResultItem, Work
from common.models import detect_language
from journal.models.renderers import html_to_text
_logger = logging.getLogger(__name__)
class GoodreadsDownloader(RetryDownloader):
def validate_response(self, response):
@ -121,6 +121,82 @@ class Goodreads(AbstractSite):
pd.lookup_ids[IdType.ASIN] = ids.get(IdType.ASIN)
return pd
@classmethod
async def search_task(
cls, q: str, page: int, category: str
) -> list[ExternalSearchResultItem]:
if category not in ["all", "book"]:
return []
SEARCH_PAGE_SIZE = 5
p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
offset = (page - 1) * SEARCH_PAGE_SIZE % 20
results = []
search_url = f"https://www.goodreads.com/search?page={p}&q={quote_plus(q)}"
async with httpx.AsyncClient() as client:
try:
r = await client.get(
search_url,
timeout=3,
headers={
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": BasicDownloader.get_accept_language(),
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"DNT": "1",
"Upgrade-Insecure-Requests": "1",
"Cache-Control": "no-cache",
},
)
if r.url.path.startswith("/book/show/"):
# Goodreads will 302 if only one result matches ISBN
site = SiteManager.get_site_by_url(str(r.url))
if site:
res = site.get_resource_ready()
if res:
subtitle = f"{res.metadata.get('pub_year')} {', '.join(res.metadata.get('author', []))} {', '.join(res.metadata.get('translator', []))}"
results.append(
ExternalSearchResultItem(
ItemCategory.Book,
SiteName.Goodreads,
res.url,
res.metadata["title"],
subtitle,
res.metadata.get("brief", ""),
res.metadata.get("cover_image_url", ""),
)
)
else:
h = html.fromstring(r.content.decode("utf-8"))
books = h.xpath('//tr[@itemtype="http://schema.org/Book"]')
for c in books: # type:ignore
el_cover = c.xpath('.//img[@class="bookCover"]/@src')
cover = el_cover[0] if el_cover else ""
el_title = c.xpath('.//a[@class="bookTitle"]//text()')
title = (
"".join(el_title).strip() if el_title else "Unkown Title"
)
el_url = c.xpath('.//a[@class="bookTitle"]/@href')
url = "https://www.goodreads.com" + el_url[0] if el_url else ""
el_authors = c.xpath('.//a[@class="authorName"]//text()')
subtitle = ", ".join(el_authors) if el_authors else ""
results.append(
ExternalSearchResultItem(
ItemCategory.Book,
SiteName.Goodreads,
url,
title,
subtitle,
"",
cover,
)
)
except Exception as e:
logger.error(
"Goodreads search error", extra={"query": q, "exception": e}
)
return results[offset : offset + SEARCH_PAGE_SIZE]
@SiteManager.register
class Goodreads_Work(AbstractSite):

View file

@ -1,7 +1,10 @@
import logging
import re
from urllib.parse import quote_plus
import httpx
from django.conf import settings
from loguru import logger
from catalog.book.utils import isbn_10_to_13
from catalog.common import *
@ -116,3 +119,57 @@ class GoogleBooks(AbstractSite):
cover_image_extention=ext,
lookup_ids={IdType.ISBN: isbn13},
)
@classmethod
async def search_task(
cls, q: str, page: int, category: str
) -> list[ExternalSearchResultItem]:
if category not in ["all", "book"]:
return []
SEARCH_PAGE_SIZE = 5
results = []
api_url = f"https://www.googleapis.com/books/v1/volumes?country=us&q={quote_plus(q)}&startIndex={SEARCH_PAGE_SIZE * (page - 1)}&maxResults={SEARCH_PAGE_SIZE}&maxAllowedMaturityRating=MATURE"
async with httpx.AsyncClient() as client:
try:
response = await client.get(api_url, timeout=2)
j = response.json()
if "items" in j:
for b in j["items"]:
if "title" not in b["volumeInfo"]:
continue
title = b["volumeInfo"]["title"]
subtitle = ""
if "publishedDate" in b["volumeInfo"]:
subtitle += b["volumeInfo"]["publishedDate"] + " "
if "authors" in b["volumeInfo"]:
subtitle += ", ".join(b["volumeInfo"]["authors"])
if "description" in b["volumeInfo"]:
brief = b["volumeInfo"]["description"]
elif "textSnippet" in b["volumeInfo"]:
brief = b["volumeInfo"]["textSnippet"]["searchInfo"]
else:
brief = ""
category = ItemCategory.Book
# b['volumeInfo']['infoLink'].replace('http:', 'https:')
url = "https://books.google.com/books?id=" + b["id"]
cover = (
b["volumeInfo"]["imageLinks"]["thumbnail"]
if "imageLinks" in b["volumeInfo"]
else ""
)
results.append(
ExternalSearchResultItem(
category,
SiteName.GoogleBooks,
url,
title,
subtitle,
brief,
cover,
)
)
except Exception as e:
logger.error(
"GoogleBooks search error", extra={"query": q, "exception": e}
)
return results

View file

@ -8,6 +8,7 @@ import datetime
import json
from urllib.parse import quote_plus
import httpx
import requests
from django.conf import settings
from django.core.cache import cache
@ -83,44 +84,6 @@ class IGDB(AbstractSite):
fp.write(json.dumps(r))
return r
@classmethod
def search(cls, q, limit: int, offset: int = 0):
rs = cls.api_query(
"games",
f'fields *, cover.url, genres.name, platforms.name, involved_companies.*, involved_companies.company.name; search "{quote_plus(q)}"; limit {limit}; offset {offset};',
)
result = []
for r in rs:
subtitle = ""
if "first_release_date" in r:
subtitle = datetime.datetime.fromtimestamp(
r["first_release_date"], datetime.timezone.utc
).strftime("%Y-%m-%d ")
if "platforms" in r:
ps = sorted(r["platforms"], key=lambda p: p["id"])
subtitle += ",".join(
[(p["name"] if p["id"] != 6 else "Windows") for p in ps]
)
brief = r["summary"] if "summary" in r else ""
brief += "\n\n" + r["storyline"] if "storyline" in r else ""
cover = (
"https:" + r["cover"]["url"].replace("t_thumb", "t_cover_big")
if r.get("cover")
else ""
)
result.append(
ExternalSearchResultItem(
ItemCategory.Game,
SiteName.IGDB,
r["url"],
r["name"],
subtitle,
brief,
cover,
)
)
return result
def scrape(self):
fields = "*, cover.url, genres.name, platforms.name, involved_companies.*, involved_companies.company.name"
r = self.api_query("games", f'fields {fields}; where url = "{self.url}";')
@ -200,3 +163,55 @@ class IGDB(AbstractSite):
IdType.Steam
).url_to_id(steam_url)
return pd
@classmethod
async def search_task(
cls, q: str, page: int, category: str
) -> list[ExternalSearchResultItem]:
if category != "game":
return []
SEARCH_PAGE_SIZE = 5 if category == "all" else 10
limit = SEARCH_PAGE_SIZE
offset = (page - 1) * limit
q = f'fields *, cover.url, genres.name, platforms.name, involved_companies.*, involved_companies.company.name; search "{quote_plus(q)}"; limit {limit}; offset {offset};'
_wrapper = IGDBWrapper(settings.IGDB_CLIENT_ID, _igdb_access_token())
async with httpx.AsyncClient() as client:
try:
url = IGDBWrapper._build_url("games")
params = _wrapper._compose_request(q)
response = await client.post(url, **params)
rs = json.loads(response.content)
except requests.HTTPError as e:
logger.error(f"IGDB API: {e}", extra={"exception": e})
rs = []
result = []
for r in rs:
subtitle = ""
if "first_release_date" in r:
subtitle = datetime.datetime.fromtimestamp(
r["first_release_date"], datetime.timezone.utc
).strftime("%Y-%m-%d ")
if "platforms" in r:
ps = sorted(r["platforms"], key=lambda p: p["id"])
subtitle += ",".join(
[(p["name"] if p["id"] != 6 else "Windows") for p in ps]
)
brief = r["summary"] if "summary" in r else ""
brief += "\n\n" + r["storyline"] if "storyline" in r else ""
cover = (
"https:" + r["cover"]["url"].replace("t_thumb", "t_cover_big")
if r.get("cover")
else ""
)
result.append(
ExternalSearchResultItem(
ItemCategory.Game,
SiteName.IGDB,
r["url"],
r["name"],
subtitle,
brief,
cover,
)
)
return result

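Individual site tasks can also be awaited directly, which may be handy when debugging a single source; a hedged sketch (requires IGDB API credentials in settings):

import asyncio

from catalog.sites.igdb import IGDB

# Hedged sketch: run one source's search_task outside ExternalSources.
games = asyncio.run(IGDB.search_task("hades", page=1, category="game"))
for g in games:
    print(g.display_title, g.source_url)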
View file

@ -6,8 +6,10 @@ import logging
import time
import dateparser
import httpx
import requests
from django.conf import settings
from loguru import logger
from catalog.common import *
from catalog.models import *
@ -107,6 +109,45 @@ class Spotify(AbstractSite):
pd.lookup_ids[IdType.ISRC] = isrc
return pd
@classmethod
async def search_task(
cls, q: str, page: int, category: str
) -> list[ExternalSearchResultItem]:
if category not in ["music", "all"]:
return []
SEARCH_PAGE_SIZE = 5
results = []
api_url = f"https://api.spotify.com/v1/search?q={q}&type=album&limit={SEARCH_PAGE_SIZE}&offset={page * SEARCH_PAGE_SIZE}"
async with httpx.AsyncClient() as client:
try:
headers = {"Authorization": f"Bearer {get_spotify_token()}"}
response = await client.get(api_url, headers=headers, timeout=2)
j = response.json()
if j.get("albums"):
for a in j["albums"]["items"]:
title = a["name"]
subtitle = a.get("release_date", "")
for artist in a.get("artists", []):
subtitle += " " + artist.get("name", "")
url = a["external_urls"]["spotify"]
cover = a["images"][0]["url"] if a.get("images") else ""
results.append(
ExternalSearchResultItem(
ItemCategory.Music,
SiteName.Spotify,
url,
title,
subtitle,
"",
cover,
)
)
else:
logger.warning(f"Spotify search '{q}' no results found.")
except Exception as e:
logger.error("Spotify search error", extra={"query": q, "exception": e})
return results
def get_spotify_token():
global spotify_token, spotify_token_expire_time

View file

@ -12,8 +12,11 @@ these language code from TMDB are not in currently iso-639-1
import logging
import re
from urllib.parse import quote_plus
import httpx
from django.conf import settings
from loguru import logger
from catalog.common import *
from catalog.movie.models import *
@ -175,6 +178,55 @@ class TMDB_Movie(AbstractSite):
pd.lookup_ids[IdType.IMDB] = imdb_code
return pd
@classmethod
async def search_task(
cls, q: str, page: int, category: str
) -> list[ExternalSearchResultItem]:
if category not in ["movietv", "all", "movie", "tv"]:
return []
SEARCH_PAGE_SIZE = 5 if category == "all" else 10
p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
offset = (page - 1) * SEARCH_PAGE_SIZE % 20
results = []
api_url = f"https://api.themoviedb.org/3/search/multi?query={quote_plus(q)}&page={p}&api_key={settings.TMDB_API3_KEY}&language={TMDB_DEFAULT_LANG}&include_adult=true"
async with httpx.AsyncClient() as client:
try:
response = await client.get(api_url, timeout=2)
j = response.json()
if j.get("results"):
for m in j["results"]:
if m["media_type"] in ["tv", "movie"]:
url = f"https://www.themoviedb.org/{m['media_type']}/{m['id']}"
if m["media_type"] == "tv":
cat = ItemCategory.TV
title = m["name"]
subtitle = f"{m.get('first_air_date', '')} {m.get('original_name', '')}"
else:
cat = ItemCategory.Movie
title = m["title"]
subtitle = f"{m.get('release_date', '')} {m.get('original_name', '')}"
cover = (
f"https://image.tmdb.org/t/p/w500/{m.get('poster_path')}"
if m.get("poster_path")
else ""
)
results.append(
ExternalSearchResultItem(
cat,
SiteName.TMDB,
url,
title,
subtitle,
m.get("overview"),
cover,
)
)
else:
logger.warning(f"TMDB search '{q}' no results found.")
except Exception as e:
logger.error("TMDb search error", extra={"query": q, "exception": e})
return results[offset : offset + SEARCH_PAGE_SIZE]
@SiteManager.register
class TMDB_TV(AbstractSite):

View file

@ -33,6 +33,7 @@ x-shared:
NEODB_DISABLE_DEFAULT_RELAY:
NEODB_DISABLE_CRON_JOBS:
NEODB_SEARCH_PEERS:
NEODB_SEARCH_SITES:
NEODB_MIN_MARKS_FOR_DISCOVER:
NEODB_DISCOVER_UPDATE_INTERVAL:
NEODB_DISCOVER_FILTER_LANGUAGE:

View file

@ -57,6 +57,7 @@ if you are doing debug or development:
- `GOOGLE_API_KEY` - API key for [Google Books](https://developers.google.com/books/docs/v1/using)
- `DISCOGS_API_KEY` - personal access token from [Discogs](https://www.discogs.com/settings/developers)
- `IGDB_API_CLIENT_ID`, `IGDB_API_CLIENT_SECRET` - IGDB [keys](https://api-docs.igdb.com/)
- `NEODB_SEARCH_SITES` is empty by default, which means NeoDB searches all available sources. It can be set to a comma-separated list of site names (e.g. `goodreads,googlebooks,spotify,tmdb,igdb,bandcamp,apple_podcast`) so that NeoDB only searches those sites, or to just `-` to skip external site search entirely, for example:
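    NEODB_SEARCH_SITES=                      # default: query every source that supports search
    NEODB_SEARCH_SITES=goodreads,tmdb,igdb   # only query these sites
    NEODB_SEARCH_SITES=-                     # skip external site search

  (Values above are illustrative.) Federated peers are controlled separately via `NEODB_SEARCH_PEERS` and are not affected by this setting.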
## Other maintenance tasks

View file

@ -169,7 +169,7 @@ mkdocs==1.6.1
# via mkdocs-material
mkdocs-get-deps==0.2.0
# via mkdocs
mkdocs-material==9.5.49
mkdocs-material==9.5.50
mkdocs-material-extensions==1.3.1
# via mkdocs-material
multidict==6.1.0
@ -213,7 +213,7 @@ pygments==2.19.1
# via mkdocs-material
pymdown-extensions==10.14
# via mkdocs-material
pyright==1.1.391
pyright==1.1.392.post0
python-dateutil==2.9.0.post0
# via dateparser
# via django-auditlog
@ -251,7 +251,7 @@ rjsmin==1.2.2
# via django-compressor
rq==2.1.0
# via django-rq
ruff==0.9.1
ruff==0.9.2
sentry-sdk==2.20.0
setproctitle==1.3.4
six==1.17.0
@ -292,7 +292,7 @@ urllib3==2.3.0
# via sentry-sdk
urlman==2.0.2
validators==0.34.0
virtualenv==20.28.1
virtualenv==20.29.1
# via pre-commit
watchdog==6.0.0
# via mkdocs