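"""
Goodreads adapters for the catalog app: scrape book Editions and Works from
goodreads.com, and provide external search for books.
"""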

import json
from datetime import datetime
from urllib.parse import quote_plus

import httpx
from django.utils.timezone import make_aware
from loguru import logger
from lxml import html

from catalog.book.utils import binding_to_format, detect_isbn_asin
from catalog.common import *
from catalog.models import Edition, ExternalSearchResultItem, Work
from common.models import detect_language
from journal.models.renderers import html_to_text


class GoodreadsDownloader(RetryDownloader):
    def validate_response(self, response):
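        """
        Accept only the Next.js page (the one embedding __NEXT_DATA__ JSON);
        any other 200 response is treated as retryable.
        """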
        if response is None:
            return RESPONSE_NETWORK_ERROR
        elif response.status_code == 200:
            if (
                response.text.find("__NEXT_DATA__") != -1
                and response.text.find('"title"') != -1
            ):
                return RESPONSE_OK
            # Goodreads may return the legacy version for A/B testing;
            # retry if so
            return RESPONSE_NETWORK_ERROR
        else:
            return RESPONSE_INVALID_CONTENT


@SiteManager.register
class Goodreads(AbstractSite):
    SITE_NAME = SiteName.Goodreads
    ID_TYPE = IdType.Goodreads
    WIKI_PROPERTY_ID = "P2968"
    DEFAULT_MODEL = Edition
    URL_PATTERNS = [
        r".+goodreads\.com/.*book/show/(\d+)",
        r".+goodreads\.com/.*book/(\d+)",
    ]

    @classmethod
    def id_to_url(cls, id_value):
        return "https://www.goodreads.com/book/show/" + id_value

    def scrape(self, response=None):
        data = {}
        if response is not None:
            h = html.fromstring(response.text.strip())
        else:
            dl = GoodreadsDownloader(self.url)
            h = dl.download().html()
        # Next.js version of Goodreads embeds all page data as JSON:
        # JSON.parse(document.getElementById('__NEXT_DATA__').innerHTML)['props']['pageProps']['apolloState']
        src = self.query_str(h, '//script[@id="__NEXT_DATA__"]/text()')
        if not src:
            raise ParseError(self, "__NEXT_DATA__ element")
        d = json.loads(src)["props"]["pageProps"]["apolloState"]
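        # bucket apolloState entries by GraphQL __typename so the relevant
        # Book/Work/Contributor records can be picked out below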
o = {"Book": [], "Work": [], "Series": [], "Contributor": []}
        for v in d.values():
            t = v.get("__typename")
            if t and t in o:
                o[t].append(v)
        b = next(filter(lambda x: x.get("title"), o["Book"]), None)
        if not b:
            # Goodreads may return an empty page template when its internal
            # service times out
            raise ParseError(self, "Book in __NEXT_DATA__ json")
        data["title"] = b["title"]
        data["brief"] = html_to_text(b["description"] or "").strip()
        lang = detect_language(b["title"] + " " + data["brief"])
        data["localized_title"] = [{"lang": lang, "text": b["title"]}]
        data["localized_subtitle"] = []  # Goodreads does not support subtitle
        data["localized_description"] = (
            [{"lang": lang, "text": data["brief"]}] if data["brief"] else []
        )
data["author"] = [c["name"] for c in o["Contributor"] if c.get("name")]
2022-12-16 07:58:34 -05:00
ids = {}
2022-12-29 23:57:02 -05:00
t, n = detect_isbn_asin(b["details"].get("asin"))
2022-12-16 07:58:34 -05:00
if t:
ids[t] = n
2023-02-13 00:47:46 -05:00
# amazon has a known problem to use another book's isbn as asin
# so we alway overwrite asin-converted isbn with real isbn
2022-12-29 23:57:02 -05:00
t, n = detect_isbn_asin(b["details"].get("isbn13"))
2022-12-16 07:58:34 -05:00
if t:
ids[t] = n
2023-02-13 00:47:46 -05:00
else:
t, n = detect_isbn_asin(b["details"].get("isbn"))
if t:
ids[t] = n
2022-12-29 23:57:02 -05:00
data["pages"] = b["details"].get("numPages")
2023-02-13 00:47:46 -05:00
data["binding"] = b["details"].get("format")
2024-07-28 16:08:36 -04:00
data["format"] = binding_to_format(b["details"].get("format"))
2023-02-13 00:47:46 -05:00
data["pub_house"] = b["details"].get("publisher")
if b["details"].get("publicationTime"):
            dt = make_aware(
                datetime.fromtimestamp(b["details"].get("publicationTime") / 1000)
            )
            data["pub_year"] = dt.year
            data["pub_month"] = dt.month
        if b["details"].get("language", {}).get("name"):
            data["language"] = [b["details"].get("language").get("name")]
        data["cover_image_url"] = b["imageUrl"]
        w = next(filter(lambda x: x.get("details"), o["Work"]), None)
        if w:
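            # declare the parent Work as a required resource; the site
            # framework is expected to fetch and link it to this Edition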
            data["required_resources"] = [
                {
                    "model": "Work",
                    "id_type": IdType.Goodreads_Work,
                    "id_value": str(w["legacyId"]),
                    "title": w["details"]["originalTitle"],
                    "url": w["editions"]["webUrl"],
                }
            ]
        pd = ResourceContent(metadata=data)
        pd.lookup_ids[IdType.ISBN] = ids.get(IdType.ISBN)
        pd.lookup_ids[IdType.ASIN] = ids.get(IdType.ASIN)
        return pd

    @classmethod
    async def search_task(
        cls, q: str, page: int, category: str
    ) -> list[ExternalSearchResultItem]:
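        """
        Search Goodreads for books matching `q`; returns up to
        SEARCH_PAGE_SIZE results for the requested page.
        """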
        if category not in ["all", "book"]:
            return []
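        # Goodreads serves 20 results per page, so map our SEARCH_PAGE_SIZE
        # window onto the Goodreads page number (p) and the offset within it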
        SEARCH_PAGE_SIZE = 5
        p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
        offset = (page - 1) * SEARCH_PAGE_SIZE % 20
        results = []
        search_url = f"https://www.goodreads.com/search?page={p}&q={quote_plus(q)}"
        async with httpx.AsyncClient() as client:
            try:
                r = await client.get(
                    search_url,
                    timeout=3,
                    headers={
                        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0",
                        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                        "Accept-Language": BasicDownloader.get_accept_language(),
                        "Accept-Encoding": "gzip, deflate",
                        "Connection": "keep-alive",
                        "DNT": "1",
                        "Upgrade-Insecure-Requests": "1",
                        "Cache-Control": "no-cache",
                    },
                )
                if r.url.path.startswith("/book/show/"):
                    # Goodreads will 302 if only one result matches ISBN
                    site = SiteManager.get_site_by_url(str(r.url))
                    if site:
                        res = site.get_resource_ready()
                        if res:
                            subtitle = f"{res.metadata.get('pub_year')} {', '.join(res.metadata.get('author', []))} {', '.join(res.metadata.get('translator', []))}"
                            results.append(
                                ExternalSearchResultItem(
                                    ItemCategory.Book,
                                    SiteName.Goodreads,
                                    res.url,
                                    res.metadata["title"],
                                    subtitle,
                                    res.metadata.get("brief", ""),
                                    res.metadata.get("cover_image_url", ""),
                                )
                            )
                else:
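                    # otherwise this is the classic HTML results page; rows
                    # are marked up with schema.org/Book microdata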
                    h = html.fromstring(r.content.decode("utf-8"))
                    books = h.xpath('//tr[@itemtype="http://schema.org/Book"]')
                    for c in books:  # type:ignore
                        el_cover = c.xpath('.//img[@class="bookCover"]/@src')
                        cover = el_cover[0] if el_cover else ""
                        el_title = c.xpath('.//a[@class="bookTitle"]//text()')
                        title = (
                            "".join(el_title).strip() if el_title else "Unknown Title"
                        )
                        el_url = c.xpath('.//a[@class="bookTitle"]/@href')
                        url = "https://www.goodreads.com" + el_url[0] if el_url else ""
                        el_authors = c.xpath('.//a[@class="authorName"]//text()')
                        subtitle = ", ".join(el_authors) if el_authors else ""
                        results.append(
                            ExternalSearchResultItem(
                                ItemCategory.Book,
                                SiteName.Goodreads,
                                url,
                                title,
                                subtitle,
                                "",
                                cover,
                            )
                        )
            except Exception as e:
                logger.error(
                    "Goodreads search error", extra={"query": q, "exception": e}
                )
        return results[offset : offset + SEARCH_PAGE_SIZE]


@SiteManager.register
class Goodreads_Work(AbstractSite):
    SITE_NAME = SiteName.Goodreads
    ID_TYPE = IdType.Goodreads_Work
    WIKI_PROPERTY_ID = ""
    DEFAULT_MODEL = Work
    URL_PATTERNS = [r".+goodreads\.com/work/editions/(\d+)"]

    @classmethod
    def id_to_url(cls, id_value):
        return "https://www.goodreads.com/work/editions/" + id_value

    def scrape(self, response=None):
        content = BasicDownloader(self.url).download().html()
        title = self.query_str(content, "//h1/a/text()")
        if not title:
            raise ParseError(self, "title")
        author = self.query_str(content, "//h2/a/text()")
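        # the <span> following the author link typically holds text like
        # "First published January 1, 1970"; it may be absent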
        try:
            first_published = self.query_str(content, "//h2/span/text()")
        except Exception:
            first_published = None
        pd = ResourceContent(
            metadata={
                "title": title,
                "localized_title": [{"lang": "en", "text": title}],
                "author": [author] if author else [],
                "first_published": first_published,
            }
        )
        return pd