improve goodreads parser

This commit is contained in:
Your Name 2024-07-16 13:51:55 -04:00 committed by Henri Dickson
parent 9cc5d4744a
commit b511892485
3 changed files with 11 additions and 6 deletions
catalog
journal/models

View file

@ -116,7 +116,7 @@ class DoubanGameTestCase(TestCase):
class BangumiGameTestCase(TestCase):
databases = "__all__"
# @use_local_response
@use_local_response
def test_parse(self):
t_id_type = IdType.Bangumi
t_id_value = "15912"

View file

@ -10,6 +10,7 @@ from catalog.book.models import Edition, Work
from catalog.book.utils import detect_isbn_asin
from catalog.common import *
from common.models.lang import detect_language
from journal.models.renderers import html_to_text
_logger = logging.getLogger(__name__)
@ -69,12 +70,12 @@ class Goodreads(AbstractSite):
# Goodreads may return empty page template when internal service timeouts
raise ParseError(self, "Book in __NEXT_DATA__ json")
data["title"] = b["title"]
data["brief"] = b["description"]
lang = detect_language(b["title"] + " " + (b["description"] or ""))
data["brief"] = html_to_text(b["description"] or "").strip()
lang = detect_language(b["title"] + " " + data["brief"])
data["localized_title"] = [{"lang": lang, "text": b["title"]}]
data["localized_subtitle"] = [] # Goodreads does not support subtitle
data["localized_description"] = (
[{"lang": lang, "text": b["description"]}] if b["description"] else []
[{"lang": lang, "text": data["brief"]}] if data["brief"] else []
)
if data["brief"]:
@ -103,7 +104,7 @@ class Goodreads(AbstractSite):
)
data["pub_year"] = dt.year
data["pub_month"] = dt.month
if b["details"].get("language"):
if b["details"].get("language", {}).get("name"):
data["language"] = [b["details"].get("language").get("name")]
data["cover_image_url"] = b["imageUrl"]
w = next(filter(lambda x: x.get("details"), o["Work"]), None)

View file

@ -43,7 +43,11 @@ _RE_HTML_TAG = re.compile(r"<[^>]*>")
def html_to_text(h: str) -> str:
    """Convert an HTML fragment to plain text, keeping line breaks.

    Carriage returns are removed, a newline is inserted in front of every
    ``<br`` and ``</p`` so those boundaries survive as line breaks, each
    match of ``_RE_HTML_TAG`` is replaced by a single space, and the result
    is passed through ``unescape``.

    Args:
        h: The HTML source string.

    Returns:
        The text with tags replaced by spaces and ``unescape`` applied.
    """
    # NOTE(review): `unescape` is presumably html.unescape imported at the
    # top of the module -- confirm against the file's imports.
    # The "\n" is added *before* the tag so it is still present once the
    # tag itself has been substituted with a space.
    # NOTE(review): the literal replacements are case-sensitive, so
    # upper-case "<BR" / "</P" do not produce a line break.
    prepared = (
        h.replace("\r", "")
        .replace("<br", "\n<br")
        .replace("</p", "\n</p")
    )
    return unescape(_RE_HTML_TAG.sub(" ", prepared))
def _spolier(s: str) -> str: