# lib.itmens/catalog/sites/goodreads.py

from catalog.book.models import Edition, Work
from catalog.common import *
from catalog.book.utils import detect_isbn_asin
from lxml import html
import json
import logging

_logger = logging.getLogger(__name__)


class GoodreadsDownloader(RetryDownloader):
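    """Downloader that retries until Goodreads serves the Next.js (__NEXT_DATA__) page version."""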
def validate_response(self, response):
if response is None:
return RESPONSE_NETWORK_ERROR
elif response.status_code == 200:
if response.text.find("__NEXT_DATA__") != -1:
return RESPONSE_OK
else:
                # Goodreads may return the legacy page version for A/B testing;
                # report a network error so the downloader retries
return RESPONSE_NETWORK_ERROR
else:
return RESPONSE_INVALID_CONTENT


@SiteManager.register
class Goodreads(AbstractSite):
SITE_NAME = SiteName.Goodreads
ID_TYPE = IdType.Goodreads
WIKI_PROPERTY_ID = "P2968"
DEFAULT_MODEL = Edition
URL_PATTERNS = [
r".+goodreads.com/.*book/show/(\d+)",
r".+goodreads.com/.*book/(\d+)",
]

    @classmethod
    def id_to_url(cls, id_value):
return "https://www.goodreads.com/book/show/" + id_value

    def scrape(self, response=None):
data = {}
if response is not None:
h = html.fromstring(response.text.strip())
else:
dl = GoodreadsDownloader(self.url)
h = dl.download().html()
        # Next.js version of Goodreads embeds the page data as JSON:
# JSON.parse(document.getElementById('__NEXT_DATA__').innerHTML)['props']['pageProps']['apolloState']
elem = h.xpath('//script[@id="__NEXT_DATA__"]/text()')
src = elem[0].strip() if elem else None
if not src:
raise ParseError(self, "__NEXT_DATA__ element")
d = json.loads(src)["props"]["pageProps"]["apolloState"]
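        # apolloState is Apollo's normalized GraphQL cache; group its entries by __typename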
o = {"Book": [], "Work": [], "Series": [], "Contributor": []}
for v in d.values():
t = v.get("__typename")
if t and t in o:
o[t].append(v)
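        # pick the first Book record that has a title; the cache may also hold partial Book stubs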
b = next(filter(lambda x: x.get("title"), o["Book"]), None)
if not b:
            # Goodreads may return an empty page template when an internal service times out
raise ParseError(self, "Book in __NEXT_DATA__ json")
data["title"] = b["title"]
data["brief"] = b["description"]
ids = {}
t, n = detect_isbn_asin(b["details"].get("asin"))
if t:
ids[t] = n
t, n = detect_isbn_asin(b["details"].get("isbn13"))
if t:
ids[t] = n
        # Amazon has a known problem of using another book's ISBN as an ASIN,
        # so we always overwrite the ASIN-converted ISBN with the real one
        # (hence isbn13 is parsed after asin above)
data["pages"] = b["details"].get("numPages")
data["cover_image_url"] = b["imageUrl"]
w = next(filter(lambda x: x.get("details"), o["Work"]), None)
if w:
data["required_resources"] = [
{
"model": "Work",
"id_type": IdType.Goodreads_Work,
"id_value": str(w["legacyId"]),
"title": w["details"]["originalTitle"],
"url": w["editions"]["webUrl"],
}
]
pd = ResourceContent(metadata=data)
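        # ISBN/ASIN go into lookup_ids (rather than metadata) so items can be matched by these ids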
pd.lookup_ids[IdType.ISBN] = ids.get(IdType.ISBN)
pd.lookup_ids[IdType.ASIN] = ids.get(IdType.ASIN)
if data["cover_image_url"]:
imgdl = BasicImageDownloader(data["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
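                # "extention" spelling matches the attribute names used by ResourceContent and the downloader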
except Exception:
_logger.debug(
f'failed to download cover for {self.url} from {data["cover_image_url"]}'
)
return pd


@SiteManager.register
class Goodreads_Work(AbstractSite):
SITE_NAME = SiteName.Goodreads
ID_TYPE = IdType.Goodreads_Work
WIKI_PROPERTY_ID = ""
DEFAULT_MODEL = Work
URL_PATTERNS = [r".+goodreads.com/work/editions/(\d+)"]

    @classmethod
    def id_to_url(cls, id_value):
return "https://www.goodreads.com/work/editions/" + id_value

    def scrape(self, response=None):
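        # the work editions page is plain server-rendered HTML, so no __NEXT_DATA__ handling is needed here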
content = BasicDownloader(self.url).download().html()
title_elem = content.xpath("//h1/a/text()")
title = title_elem[0].strip() if title_elem else None
if not title:
raise ParseError(self, "title")
author_elem = content.xpath("//h2/a/text()")
author = author_elem[0].strip() if author_elem else None
first_published_elem = content.xpath("//h2/span/text()")
first_published = (
first_published_elem[0].strip() if first_published_elem else None
)
pd = ResourceContent(
metadata={
"title": title,
"author": author,
"first_published": first_published,
}
)
return pd
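

# Usage sketch (assuming SiteManager.get_site_by_url resolves a URL to a
# registered site instance, as elsewhere in this codebase):
#
#   site = SiteManager.get_site_by_url("https://www.goodreads.com/book/show/77566")
#   resource = site.scrape()
#   resource.metadata["title"], resource.lookup_ids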