lib.itmens/catalog/sites/bookstw.py
2023-04-25 19:07:42 -04:00

137 lines
4.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from catalog.common import *
from catalog.book.models import *
from catalog.book.utils import *
from .douban import *
import logging
# Module-level logger, named after this module via __name__.
_logger = logging.getLogger(__name__)
@SiteManager.register
class BooksTW(AbstractSite):
    """Scraper for book product pages on www.books.com.tw.

    ``scrape`` downloads the page at ``self.url``, extracts the bibliographic
    fields from the HTML with XPath and returns them in a ``ResourceContent``.
    """

    SITE_NAME = SiteName.BooksTW
    ID_TYPE = IdType.BooksTW
    URL_PATTERNS = [
        r"\w+://www\.books\.com\.tw/products/(\w+)",
    ]
    WIKI_PROPERTY_ID = "?"
    DEFAULT_MODEL = Edition

    @classmethod
    def id_to_url(cls, id_value):
        """Return the product page URL for ``id_value``."""
        return "https://www.books.com.tw/products/" + id_value

    @staticmethod
    def _after_colon(text):
        """Return the stripped text following the first colon in ``text``.

        Both the ASCII colon and the full-width colon (U+FF1A) are accepted.
        The full-width form is written as an escape so the literal cannot be
        lost or confused with its ASCII look-alike. Returns ``""`` when
        ``text`` is empty or contains no colon, instead of raising.
        """
        if not text:
            return ""
        parts = re.split("[:\uff1a]", text.strip(), maxsplit=1)
        return parts[1].strip() if len(parts) > 1 else ""

    @classmethod
    def _labelled_value(cls, content, label):
        """Return the value of the first ``<li>`` whose text contains ``label``.

        ``label`` is given without its trailing colon; the XPath matches the
        label followed by either an ASCII or a full-width colon. The result
        is the text after that colon, or ``""`` when nothing matches.
        """
        text = content.xpath(
            "string(//div/ul/li["
            f"contains(text(),'{label}:') or contains(text(),'{label}\uff1a')"
            "])"
        )
        return cls._after_colon(text)

    def scrape(self):
        """Download ``self.url`` and build a ``ResourceContent`` from the page.

        Fields that cannot be found or parsed are left as ``None`` (or an
        empty list for authors/translators) rather than raising. If
        ``detect_isbn_asin`` recognises the ISBN, it is added to
        ``lookup_ids``. The cover image is downloaded with
        ``BasicImageDownloader``.
        """
        content = BasicDownloader(self.url).download().html()

        # NOTE: the page may also carry EISBN and an original-author
        # ('原文作者') entry; those fields are not collected here.
        isbn_elem = content.xpath(
            "//div[@class='bd']/ul/li[starts-with(text(),'ISBN')]/text()"
        )
        isbn = (self._after_colon(isbn_elem[0]) or None) if isbn_elem else None

        title = content.xpath("string(//h1)") or f"Unknown Title {self.id_value}"
        subtitle = None
        orig_title = content.xpath("string(//h1/following-sibling::h2)")

        authors = [
            s.strip()
            for s in self._labelled_value(content, "作者").split(",")
            if s.strip()
        ]
        translators = [
            s.strip()
            for s in self._labelled_value(content, "譯者").split(",")
            if s.strip()
        ]

        language_elem = content.xpath(
            "//div/ul/li[starts-with(text(),'語言:')"
            " or starts-with(text(),'語言\uff1a')]/text()"
        )
        language = (
            (self._after_colon(language_elem[0]) or None) if language_elem else None
        )

        # Only the first space-delimited token of the value is kept.
        pub_house = self._labelled_value(content, "出版社").split(" ", 1)[0] or None

        pub_date = re.match(
            r"(\d+)/(\d+)/(\d+)\s*$",
            self._labelled_value(content, "出版日期").split(" ", 1)[0],
        )
        if pub_date:
            pub_year = int(pub_date[1])
            pub_month = int(pub_date[2])
        else:
            pub_year = None
            pub_month = None

        # The spec value is a "/"-separated list: binding first, pages second.
        spec = self._labelled_value(content, "規格")
        spec = spec.split("/") if spec else []
        binding = None
        pages = None
        if len(spec) > 1:
            binding = spec[0].strip()
            # NOTE(review): the unit separator was an empty string in the
            # reviewed source (str.split("") raises ValueError). It is
            # assumed to be the page unit U+9801 -- confirm against the site.
            pages_match = re.match("\\s*(\\d+)\\s*\u9801", spec[1])
            if pages_match:
                pages = int(pages_match[1])
                if not 1 <= pages <= 999999:
                    pages = None

        # NOTE(review): as above, the currency-unit separator was empty in
        # the reviewed source; U+5143 is assumed -- confirm against the site.
        price = self._labelled_value(content, "定價").split("\u5143", 1)[0].strip()
        price = price + " NTD" if price else None

        brief = content.xpath("string(//h3[text()='內容簡介']/following-sibling::div)")
        contents = content.xpath("string(//h3[text()='目錄']/following-sibling::div)")
        series = None
        imprint = None

        img_url = content.xpath(
            "string(//div[contains(@class,'cover_img')]//img[contains(@class,'cover')]/@src)"
        )
        # Drop the width/height query parameters from the image URL.
        img_url = re.sub(r"&[wh]=\d+", "", img_url) if img_url else None

        data = {
            "title": title,
            "subtitle": subtitle,
            "orig_title": orig_title,
            "author": authors,
            "translator": translators,
            "language": language,
            "pub_house": pub_house,
            "pub_year": pub_year,
            "pub_month": pub_month,
            "binding": binding,
            "price": price,
            "pages": pages,
            "isbn": isbn,
            "brief": brief,
            "contents": contents,
            "series": series,
            "imprint": imprint,
            "cover_image_url": img_url,
        }
        pd = ResourceContent(metadata=data)
        t, n = detect_isbn_asin(isbn)
        if t:
            pd.lookup_ids[t] = n
        pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(
            img_url, self.url
        )
        return pd