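"""
Goodreads adapters for the catalog app: scrape book Editions and Works from
goodreads.com, and provide external search for books.
"""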

import json
from datetime import datetime
from urllib.parse import quote_plus

import httpx
from django.utils.timezone import make_aware
from loguru import logger
from lxml import html

from catalog.book.utils import binding_to_format, detect_isbn_asin
from catalog.common import *
from catalog.models import Edition, ExternalSearchResultItem, Work
from common.models import detect_language
from journal.models.renderers import html_to_text


class GoodreadsDownloader(RetryDownloader):
    def validate_response(self, response):
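        """
        Accept only the Next.js page (the one embedding __NEXT_DATA__ JSON);
        any other 200 response is treated as retryable.
        """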
        if response is None:
            return RESPONSE_NETWORK_ERROR
        elif response.status_code == 200:
            if (
                response.text.find("__NEXT_DATA__") != -1
                and response.text.find('"title"') != -1
            ):
                return RESPONSE_OK
            # Goodreads may return the legacy version for A/B testing;
            # retry if so
            return RESPONSE_NETWORK_ERROR
        else:
            return RESPONSE_INVALID_CONTENT


@SiteManager.register
class Goodreads(AbstractSite):
    SITE_NAME = SiteName.Goodreads
    ID_TYPE = IdType.Goodreads
    WIKI_PROPERTY_ID = "P2968"
    DEFAULT_MODEL = Edition
    URL_PATTERNS = [
        r".+goodreads\.com/.*book/show/(\d+)",
        r".+goodreads\.com/.*book/(\d+)",
    ]

    @classmethod
    def id_to_url(cls, id_value):
        return "https://www.goodreads.com/book/show/" + id_value

    def scrape(self, response=None):
        data = {}
        if response is not None:
            h = html.fromstring(response.text.strip())
        else:
            dl = GoodreadsDownloader(self.url)
            h = dl.download().html()
        # Next.js version of Goodreads embeds all page data as JSON:
        # JSON.parse(document.getElementById('__NEXT_DATA__').innerHTML)['props']['pageProps']['apolloState']
        src = self.query_str(h, '//script[@id="__NEXT_DATA__"]/text()')
        if not src:
            raise ParseError(self, "__NEXT_DATA__ element")
        d = json.loads(src)["props"]["pageProps"]["apolloState"]
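        # bucket apolloState entries by GraphQL __typename so the relevant
        # Book/Work/Contributor records can be picked out below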
o = {"Book": [], "Work": [], "Series": [], "Contributor": []}
        for v in d.values():
            t = v.get("__typename")
            if t and t in o:
                o[t].append(v)
        b = next(filter(lambda x: x.get("title"), o["Book"]), None)
        if not b:
            # Goodreads may return an empty page template when its internal
            # service times out
            raise ParseError(self, "Book in __NEXT_DATA__ json")
        data["title"] = b["title"]
        data["brief"] = html_to_text(b["description"] or "").strip()
        lang = detect_language(b["title"] + " " + data["brief"])
        data["localized_title"] = [{"lang": lang, "text": b["title"]}]
        data["localized_subtitle"] = []  # Goodreads does not support subtitle
        data["localized_description"] = (
            [{"lang": lang, "text": data["brief"]}] if data["brief"] else []
        )
data["author"] = [c["name"] for c in o["Contributor"] if c.get("name")]
2022-12-16 07:58:34 -05:00
ids = {}
2022-12-29 23:57:02 -05:00
t, n = detect_isbn_asin(b["details"].get("asin"))
2022-12-16 07:58:34 -05:00
if t:
ids[t] = n
2023-02-13 00:47:46 -05:00
# amazon has a known problem to use another book's isbn as asin
# so we alway overwrite asin-converted isbn with real isbn
2022-12-29 23:57:02 -05:00
t, n = detect_isbn_asin(b["details"].get("isbn13"))
2022-12-16 07:58:34 -05:00
if t:
ids[t] = n
2023-02-13 00:47:46 -05:00
else:
t, n = detect_isbn_asin(b["details"].get("isbn"))
if t:
ids[t] = n
2022-12-29 23:57:02 -05:00
data["pages"] = b["details"].get("numPages")
2023-02-13 00:47:46 -05:00
data["binding"] = b["details"].get("format")
2024-07-28 16:08:36 -04:00
data["format"] = binding_to_format(b["details"].get("format"))
2023-02-13 00:47:46 -05:00
data["pub_house"] = b["details"].get("publisher")
if b["details"].get("publicationTime"):
            dt = make_aware(
                datetime.fromtimestamp(b["details"].get("publicationTime") / 1000)
            )
            data["pub_year"] = dt.year
            data["pub_month"] = dt.month
        if b["details"].get("language", {}).get("name"):
            data["language"] = [b["details"].get("language").get("name")]
        data["cover_image_url"] = b["imageUrl"]
        w = next(filter(lambda x: x.get("details"), o["Work"]), None)
        if w:
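            # declare the parent Work as a required resource; the site
            # framework is expected to fetch and link it to this Edition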
            data["required_resources"] = [
                {
                    "model": "Work",
                    "id_type": IdType.Goodreads_Work,
                    "id_value": str(w["legacyId"]),
                    "title": w["details"]["originalTitle"],
                    "url": w["editions"]["webUrl"],
                }
            ]
        pd = ResourceContent(metadata=data)
        pd.lookup_ids[IdType.ISBN] = ids.get(IdType.ISBN)
        pd.lookup_ids[IdType.ASIN] = ids.get(IdType.ASIN)
        return pd

    @classmethod
    async def search_task(
        cls, q: str, page: int, category: str
    ) -> list[ExternalSearchResultItem]:
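        """
        Search Goodreads for books matching `q`; returns up to
        SEARCH_PAGE_SIZE results for the requested page.
        """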
        if category not in ["all", "book"]:
            return []
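        # Goodreads serves 20 results per page, so map our SEARCH_PAGE_SIZE
        # window onto the Goodreads page number (p) and the offset within it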
        SEARCH_PAGE_SIZE = 5
        p = (page - 1) * SEARCH_PAGE_SIZE // 20 + 1
        offset = (page - 1) * SEARCH_PAGE_SIZE % 20
        results = []
        search_url = f"https://www.goodreads.com/search?page={p}&q={quote_plus(q)}"
        async with httpx.AsyncClient() as client:
            try:
                r = await client.get(
                    search_url,
                    timeout=3,
                    headers={
                        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0",
                        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                        "Accept-Language": BasicDownloader.get_accept_language(),
                        "Accept-Encoding": "gzip, deflate",
                        "Connection": "keep-alive",
                        "DNT": "1",
                        "Upgrade-Insecure-Requests": "1",
                        "Cache-Control": "no-cache",
                    },
                )
                if r.url.path.startswith("/book/show/"):
                    # Goodreads will 302 if only one result matches ISBN
                    site = SiteManager.get_site_by_url(str(r.url))
                    if site:
                        res = site.get_resource_ready()
                        if res:
                            subtitle = f"{res.metadata.get('pub_year')} {', '.join(res.metadata.get('author', []))} {', '.join(res.metadata.get('translator', []))}"
                            results.append(
                                ExternalSearchResultItem(
                                    ItemCategory.Book,
                                    SiteName.Goodreads,
                                    res.url,
                                    res.metadata["title"],
                                    subtitle,
                                    res.metadata.get("brief", ""),
                                    res.metadata.get("cover_image_url", ""),
                                )
                            )
                else:
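                    # otherwise this is the classic HTML results page; rows
                    # are marked up with schema.org/Book microdata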
                    h = html.fromstring(r.content.decode("utf-8"))
                    books = h.xpath('//tr[@itemtype="http://schema.org/Book"]')
                    for c in books:  # type:ignore
                        el_cover = c.xpath('.//img[@class="bookCover"]/@src')
                        cover = el_cover[0] if el_cover else ""
                        el_title = c.xpath('.//a[@class="bookTitle"]//text()')
                        title = (
                            "".join(el_title).strip() if el_title else "Unknown Title"
                        )
                        el_url = c.xpath('.//a[@class="bookTitle"]/@href')
                        url = "https://www.goodreads.com" + el_url[0] if el_url else ""
                        el_authors = c.xpath('.//a[@class="authorName"]//text()')
                        subtitle = ", ".join(el_authors) if el_authors else ""
                        results.append(
                            ExternalSearchResultItem(
                                ItemCategory.Book,
                                SiteName.Goodreads,
                                url,
                                title,
                                subtitle,
                                "",
                                cover,
                            )
                        )
            except Exception as e:
                logger.error(
                    "Goodreads search error", extra={"query": q, "exception": e}
                )
        return results[offset : offset + SEARCH_PAGE_SIZE]


@SiteManager.register
class Goodreads_Work(AbstractSite):
    SITE_NAME = SiteName.Goodreads
    ID_TYPE = IdType.Goodreads_Work
    WIKI_PROPERTY_ID = ""
    DEFAULT_MODEL = Work
    URL_PATTERNS = [r".+goodreads\.com/work/editions/(\d+)"]

    @classmethod
    def id_to_url(cls, id_value):
        return "https://www.goodreads.com/work/editions/" + id_value

    def scrape(self, response=None):
        content = BasicDownloader(self.url).download().html()
        title = self.query_str(content, "//h1/a/text()")
        if not title:
            raise ParseError(self, "title")
        author = self.query_str(content, "//h2/a/text()")
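        # the <span> following the author link typically holds text like
        # "First published January 1, 1970"; it may be absent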
        try:
            first_published = self.query_str(content, "//h2/span/text()")
        except Exception:
            first_published = None
        pd = ResourceContent(
            metadata={
                "title": title,
                "localized_title": [{"lang": "en", "text": title}],
                "author": [author] if author else [],
                "first_published": first_published,
            }
        )
        return pd