improve goodreads parser
This commit is contained in:
parent
9cc5d4744a
commit
b511892485
3 changed files with 11 additions and 6 deletions
|
@ -116,7 +116,7 @@ class DoubanGameTestCase(TestCase):
|
|||
class BangumiGameTestCase(TestCase):
|
||||
databases = "__all__"
|
||||
|
||||
# @use_local_response
|
||||
@use_local_response
|
||||
def test_parse(self):
|
||||
t_id_type = IdType.Bangumi
|
||||
t_id_value = "15912"
|
||||
|
|
|
@ -10,6 +10,7 @@ from catalog.book.models import Edition, Work
|
|||
from catalog.book.utils import detect_isbn_asin
|
||||
from catalog.common import *
|
||||
from common.models.lang import detect_language
|
||||
from journal.models.renderers import html_to_text
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
@ -69,12 +70,12 @@ class Goodreads(AbstractSite):
|
|||
# Goodreads may return empty page template when internal service timeouts
|
||||
raise ParseError(self, "Book in __NEXT_DATA__ json")
|
||||
data["title"] = b["title"]
|
||||
data["brief"] = b["description"]
|
||||
lang = detect_language(b["title"] + " " + (b["description"] or ""))
|
||||
data["brief"] = html_to_text(b["description"] or "").strip()
|
||||
lang = detect_language(b["title"] + " " + data["brief"])
|
||||
data["localized_title"] = [{"lang": lang, "text": b["title"]}]
|
||||
data["localized_subtitle"] = [] # Goodreads does not support subtitle
|
||||
data["localized_description"] = (
|
||||
[{"lang": lang, "text": b["description"]}] if b["description"] else []
|
||||
[{"lang": lang, "text": data["brief"]}] if data["brief"] else []
|
||||
)
|
||||
|
||||
if data["brief"]:
|
||||
|
@ -103,7 +104,7 @@ class Goodreads(AbstractSite):
|
|||
)
|
||||
data["pub_year"] = dt.year
|
||||
data["pub_month"] = dt.month
|
||||
if b["details"].get("language"):
|
||||
if b["details"].get("language", {}).get("name"):
|
||||
data["language"] = [b["details"].get("language").get("name")]
|
||||
data["cover_image_url"] = b["imageUrl"]
|
||||
w = next(filter(lambda x: x.get("details"), o["Work"]), None)
|
||||
|
|
|
@ -43,7 +43,11 @@ _RE_HTML_TAG = re.compile(r"<[^>]*>")
|
|||
|
||||
|
||||
def html_to_text(h: str) -> str:
|
||||
return unescape(_RE_HTML_TAG.sub(" ", h.replace("\r", "")))
|
||||
return unescape(
|
||||
_RE_HTML_TAG.sub(
|
||||
" ", h.replace("\r", "").replace("<br", "\n<br").replace("</p", "\n</p")
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def _spolier(s: str) -> str:
|
||||
|
|
Loading…
Add table
Reference in a new issue