# lib.itmens/catalog/sites/goodreads.py

from catalog.book.models import Edition, Work
from catalog.common import *
from catalog.book.utils import detect_isbn_asin
from lxml import html
import json
import logging

_logger = logging.getLogger(__name__)


class GoodreadsDownloader(RetryDownloader):
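    """Downloader that retries until Goodreads serves the Next.js (__NEXT_DATA__) page version."""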
def validate_response(self, response):
if response is None:
return RESPONSE_NETWORK_ERROR
elif response.status_code == 200:
if response.text.find("__NEXT_DATA__") != -1:
return RESPONSE_OK
else:
                # Goodreads may return the legacy page version for A/B testing;
                # report a network error so the downloader retries
return RESPONSE_NETWORK_ERROR
else:
return RESPONSE_INVALID_CONTENT


@SiteManager.register
class Goodreads(AbstractSite):
SITE_NAME = SiteName.Goodreads
ID_TYPE = IdType.Goodreads
WIKI_PROPERTY_ID = "P2968"
DEFAULT_MODEL = Edition
URL_PATTERNS = [
r".+goodreads.com/.*book/show/(\d+)",
r".+goodreads.com/.*book/(\d+)",
]

    @classmethod
    def id_to_url(cls, id_value):
return "https://www.goodreads.com/book/show/" + id_value

    def scrape(self, response=None):
data = {}
if response is not None:
h = html.fromstring(response.text.strip())
else:
dl = GoodreadsDownloader(self.url)
h = dl.download().html()
        # Next.js version of Goodreads embeds the page data as JSON:
# JSON.parse(document.getElementById('__NEXT_DATA__').innerHTML)['props']['pageProps']['apolloState']
elem = h.xpath('//script[@id="__NEXT_DATA__"]/text()')
src = elem[0].strip() if elem else None
if not src:
raise ParseError(self, "__NEXT_DATA__ element")
d = json.loads(src)["props"]["pageProps"]["apolloState"]
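        # apolloState is Apollo's normalized GraphQL cache; group its entries by __typename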
o = {"Book": [], "Work": [], "Series": [], "Contributor": []}
for v in d.values():
t = v.get("__typename")
if t and t in o:
o[t].append(v)
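        # pick the first Book record that has a title; the cache may also hold partial Book stubs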
b = next(filter(lambda x: x.get("title"), o["Book"]), None)
if not b:
            # Goodreads may return an empty page template when an internal service times out
raise ParseError(self, "Book in __NEXT_DATA__ json")
data["title"] = b["title"]
data["brief"] = b["description"]
ids = {}
t, n = detect_isbn_asin(b["details"].get("asin"))
if t:
ids[t] = n
t, n = detect_isbn_asin(b["details"].get("isbn13"))
if t:
ids[t] = n
        # Amazon has a known problem of using another book's ISBN as an ASIN,
        # so we always overwrite the ASIN-converted ISBN with the real one
        # (hence isbn13 is parsed after asin above)
data["pages"] = b["details"].get("numPages")
data["cover_image_url"] = b["imageUrl"]
w = next(filter(lambda x: x.get("details"), o["Work"]), None)
if w:
data["required_resources"] = [
{
"model": "Work",
"id_type": IdType.Goodreads_Work,
"id_value": str(w["legacyId"]),
"title": w["details"]["originalTitle"],
"url": w["editions"]["webUrl"],
}
]
pd = ResourceContent(metadata=data)
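        # ISBN/ASIN go into lookup_ids (rather than metadata) so items can be matched by these ids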
pd.lookup_ids[IdType.ISBN] = ids.get(IdType.ISBN)
pd.lookup_ids[IdType.ASIN] = ids.get(IdType.ASIN)
if data["cover_image_url"]:
imgdl = BasicImageDownloader(data["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
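                # "extention" spelling matches the attribute names used by ResourceContent and the downloader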
except Exception:
_logger.debug(
f'failed to download cover for {self.url} from {data["cover_image_url"]}'
)
return pd


@SiteManager.register
class Goodreads_Work(AbstractSite):
SITE_NAME = SiteName.Goodreads
ID_TYPE = IdType.Goodreads_Work
WIKI_PROPERTY_ID = ""
DEFAULT_MODEL = Work
URL_PATTERNS = [r".+goodreads.com/work/editions/(\d+)"]

    @classmethod
    def id_to_url(cls, id_value):
return "https://www.goodreads.com/work/editions/" + id_value

    def scrape(self, response=None):
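        # the work editions page is plain server-rendered HTML, so no __NEXT_DATA__ handling is needed here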
content = BasicDownloader(self.url).download().html()
title_elem = content.xpath("//h1/a/text()")
title = title_elem[0].strip() if title_elem else None
if not title:
raise ParseError(self, "title")
author_elem = content.xpath("//h2/a/text()")
author = author_elem[0].strip() if author_elem else None
first_published_elem = content.xpath("//h2/span/text()")
first_published = (
first_published_elem[0].strip() if first_published_elem else None
)
pd = ResourceContent(
metadata={
"title": title,
"author": author,
"first_published": first_published,
}
)
return pd
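

# Usage sketch (assuming SiteManager.get_site_by_url resolves a URL to a
# registered site instance, as elsewhere in this codebase):
#
#   site = SiteManager.get_site_by_url("https://www.goodreads.com/book/show/77566")
#   resource = site.scrape()
#   resource.metadata["title"], resource.lookup_ids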