import json
import logging
from datetime import datetime

from django.utils.timezone import make_aware
from lxml import html

from catalog.book.models import Edition, Work
from catalog.book.utils import binding_to_format, detect_isbn_asin
from catalog.common import *
from common.models.lang import detect_language
from journal.models.renderers import html_to_text

_logger = logging.getLogger(__name__)

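# Downloader that retries until Goodreads serves the Next.js page (the one embedding
# the __NEXT_DATA__ JSON payload this scraper parses); the legacy template is rejected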
class GoodreadsDownloader(RetryDownloader):
    def validate_response(self, response):
        if response is None:
            return RESPONSE_NETWORK_ERROR
        elif response.status_code == 200:
            if (
                response.text.find("__NEXT_DATA__") != -1
                and response.text.find('"title"') != -1
            ):
                return RESPONSE_OK
            # Goodreads may return the legacy page for A/B testing; retry if so
            return RESPONSE_NETWORK_ERROR
        else:
            return RESPONSE_INVALID_CONTENT


@SiteManager.register
class Goodreads(AbstractSite):
    SITE_NAME = SiteName.Goodreads
    ID_TYPE = IdType.Goodreads
    WIKI_PROPERTY_ID = "P2968"
    DEFAULT_MODEL = Edition
    URL_PATTERNS = [
        r".+goodreads\.com/.*book/show/(\d+)",
        r".+goodreads\.com/.*book/(\d+)",
    ]

    @classmethod
    def id_to_url(cls, id_value):
        return "https://www.goodreads.com/book/show/" + id_value

    def scrape(self, response=None):
        data = {}
        if response is not None:
            h = html.fromstring(response.text.strip())
        else:
            dl = GoodreadsDownloader(self.url)
            h = dl.download().html()
        # Next.JS version of GoodReads
        # JSON.parse(document.getElementById('__NEXT_DATA__').innerHTML)['props']['pageProps']['apolloState']
        src = self.query_str(h, '//script[@id="__NEXT_DATA__"]/text()')
        if not src:
            raise ParseError(self, "__NEXT_DATA__ element")
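        # apolloState is a flat GraphQL cache keyed by opaque ids; every entry carries a
        # "__typename". Illustrative shape (only the fields used below, not a full schema):
        #   {"<key>": {"__typename": "Book", "title": ..., "description": ..., "details": {...}},
        #    "<key>": {"__typename": "Contributor", "name": ...}, ...}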
        d = json.loads(src)["props"]["pageProps"]["apolloState"]
        o = {"Book": [], "Work": [], "Series": [], "Contributor": []}
        for v in d.values():
            t = v.get("__typename")
            if t and t in o:
                o[t].append(v)
        b = next(filter(lambda x: x.get("title"), o["Book"]), None)
        if not b:
            # Goodreads may return an empty page template when an internal service times out
            raise ParseError(self, "Book in __NEXT_DATA__ json")
        data["title"] = b["title"]
        data["brief"] = html_to_text(b["description"] or "").strip()
        lang = detect_language(b["title"] + " " + data["brief"])
        data["localized_title"] = [{"lang": lang, "text": b["title"]}]
        data["localized_subtitle"] = []  # Goodreads does not support subtitle
        if data["brief"]:
            data["brief"] = html_to_text(data["brief"])
        data["localized_description"] = (
            [{"lang": lang, "text": data["brief"]}] if data["brief"] else []
        )
        data["author"] = [c["name"] for c in o["Contributor"] if c.get("name")]
        ids = {}
        t, n = detect_isbn_asin(b["details"].get("asin"))
        if t:
            ids[t] = n
        # Amazon has a known problem of using another book's ISBN as the ASIN,
        # so we always overwrite the ASIN-derived ISBN with the real ISBN
        t, n = detect_isbn_asin(b["details"].get("isbn13"))
        if t:
            ids[t] = n
        else:
            t, n = detect_isbn_asin(b["details"].get("isbn"))
            if t:
                ids[t] = n
        data["pages"] = b["details"].get("numPages")
        data["binding"] = b["details"].get("format")
        data["format"] = binding_to_format(b["details"].get("format"))
        data["pub_house"] = b["details"].get("publisher")
        if b["details"].get("publicationTime"):
            dt = make_aware(
                datetime.fromtimestamp(b["details"].get("publicationTime") / 1000)
            )
            data["pub_year"] = dt.year
            data["pub_month"] = dt.month
        if b["details"].get("language", {}).get("name"):
            data["language"] = [b["details"].get("language").get("name")]
        data["cover_image_url"] = b["imageUrl"]
        w = next(filter(lambda x: x.get("details"), o["Work"]), None)
        if w:
            data["required_resources"] = [
                {
                    "model": "Work",
                    "id_type": IdType.Goodreads_Work,
                    "id_value": str(w["legacyId"]),
                    "title": w["details"]["originalTitle"],
                    "url": w["editions"]["webUrl"],
                }
            ]
        pd = ResourceContent(metadata=data)
        pd.lookup_ids[IdType.ISBN] = ids.get(IdType.ISBN)
        pd.lookup_ids[IdType.ASIN] = ids.get(IdType.ASIN)
        return pd


@SiteManager.register
class Goodreads_Work(AbstractSite):
    SITE_NAME = SiteName.Goodreads
    ID_TYPE = IdType.Goodreads_Work
    WIKI_PROPERTY_ID = ""
    DEFAULT_MODEL = Work
    URL_PATTERNS = [r".+goodreads\.com/work/editions/(\d+)"]

    @classmethod
    def id_to_url(cls, id_value):
        return "https://www.goodreads.com/work/editions/" + id_value

    def scrape(self, response=None):
        content = BasicDownloader(self.url).download().html()
        title = self.query_str(content, "//h1/a/text()")
        if not title:
            raise ParseError(self, "title")
        author = self.query_str(content, "//h2/a/text()")
        try:
            first_published = self.query_str(content, "//h2/span/text()")
        except Exception:
            first_published = None
        pd = ResourceContent(
            metadata={
                "title": title,
                "localized_title": [{"lang": "en", "text": title}],
                "author": [author] if author else [],
                "first_published": first_published,
            }
        )
        return pd
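
# Illustrative usage, as a sketch only (assumes the SiteManager lookup helper used
# elsewhere in this project; the URL and its id "77566" are placeholders):
#
#   site = SiteManager.get_site_by_url("https://www.goodreads.com/book/show/77566")
#   resource = site.scrape()           # ResourceContent with metadata + lookup_ids
#   resource.metadata["title"]         # edition title parsed from __NEXT_DATA__
#   resource.lookup_ids                # ISBN/ASIN collected in Goodreads.scrape()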