import logging
import re

from catalog.book.models import *
from catalog.book.utils import *
from catalog.common import *

from .douban import *
|
2023-04-25 19:04:57 -04:00
|
|
|
|
|
|
|
|
|
# Module-level logger named after this module so log output is filterable.
_logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@SiteManager.register
class BooksTW(AbstractSite):
    """Scraper for books.com.tw (博客來), a Taiwanese online bookstore.

    Parses a product page into book metadata (title, authors, ISBN, …)
    and downloads the cover image when one is found.
    """

    SITE_NAME = SiteName.BooksTW
    ID_TYPE = IdType.BooksTW
    URL_PATTERNS = [
        r"\w+://www\.books\.com\.tw/products/(\w+)",
    ]
    WIKI_PROPERTY_ID = "?"  # no known Wikidata property for BooksTW ids
    DEFAULT_MODEL = Edition

    @classmethod
    def id_to_url(cls, id_value):
        """Return the canonical product-page URL for a BooksTW product id."""
        return "https://www.books.com.tw/products/" + id_value

    def scrape(self):
        """Download the product page and parse it into a ResourceContent.

        Returns:
            ResourceContent with book metadata, ISBN lookup id (when
            detected) and the downloaded cover image (when available).

        Raises:
            ParseError: if the page has no <h1> title.
        """
        content = BasicDownloader(self.url).download().html()

        # ISBN appears as an "ISBN:978…" list item (ASCII colon here).
        isbn_elem = content.xpath(
            "//div[@class='bd']/ul/li[starts-with(text(),'ISBN:')]/text()"
        )
        isbn = isbn_elem[0].strip().split(":", 1)[1].strip() if isbn_elem else None  # type: ignore
        # NOTE: some pages carry an EISBN item instead; it is intentionally
        # not parsed here.

        title = content.xpath("string(//h1)")
        if not title:
            raise ParseError(self, "title")
        subtitle = None
        # Original-language title, when present, sits in the <h2> right after <h1>.
        orig_title = str(content.xpath("string(//h1/following-sibling::h2)"))

        # The labels below use the full-width colon ":" as the separator.
        authors = content.xpath("string(//div/ul/li[contains(text(),'作者:')])")
        authors = authors.strip().split(":", 1)[1].split(",") if authors else []  # type: ignore
        if not authors:
            # Fallback: on some pages the author name is inside a nested <a>.
            authors = [content.xpath("string(//div/ul/li[contains(.,'作者:')]/a)")]
        authors = [s.strip() for s in authors]  # type: ignore
        # The original-language author (原文作者) field is not parsed.

        translators = content.xpath("string(//div/ul/li[contains(text(),'譯者:')])")
        translators = (
            translators.strip().split(":", 1)[1].split(",") if translators else []  # type: ignore
        )
        translators = [s.strip() for s in translators]

        language_elem = content.xpath(
            "//div/ul/li[starts-with(text(),'語言:')]/text()"
        )
        # maxsplit=1 so a colon inside the value does not truncate it
        # (consistent with every other field in this method).
        language = (
            [language_elem[0].strip().split(":", 1)[1].strip()] if language_elem else []  # type: ignore
        )

        pub_house = content.xpath("string(//div/ul/li[contains(text(),'出版社:')])")
        pub_house = (
            pub_house.strip().split(":", 1)[1].strip().split(" ", 1)[0]  # type: ignore
            if pub_house
            else None
        )

        # Date is formatted like "2023/04/25"; re.match anchors at the start
        # and \s*$ rejects trailing garbage.
        pub_date = content.xpath("string(//div/ul/li[contains(text(),'出版日期:')])")
        pub_date = re.match(
            r"(\d+)/(\d+)/(\d+)\s*$",
            (
                pub_date.strip().split(":", 1)[1].strip().split(" ", 1)[0]  # type: ignore
                if pub_date
                else ""
            ),
        )
        if pub_date:
            pub_year = int(pub_date[1])
            pub_month = int(pub_date[2])
        else:
            pub_year = None
            pub_month = None

        # "規格:平裝 / 320頁 / …" → binding, then page count before "頁".
        spec = content.xpath("string(//div/ul/li[contains(text(),'規格:')])")
        spec = spec.strip().split(":", 1)[1].strip().split("/") if spec else []  # type: ignore
        if len(spec) > 1:
            binding = spec[0].strip()
            pages = str(spec[1].strip()).split("頁")
            pages = int(pages[0]) if len(pages) > 1 else None
            if pages and (pages > 999999 or pages < 1):
                # Discard implausible page counts.
                pages = None
        else:
            binding = None
            pages = None

        price = content.xpath("string(//div/ul/li[contains(text(),'定價:')])")
        price = (
            price.strip().split(":", 1)[1].split("元")[0].strip() + " NTD"  # type: ignore
            if price
            else None
        )

        series = content.xpath("string(//div/ul/li[contains(text(),'叢書系列:')]/a)")

        imprint = None

        brief = content.xpath("string(//h3[text()='內容簡介']/following-sibling::div)")

        contents = content.xpath("string(//h3[text()='目錄']/following-sibling::div)")

        img_url = content.xpath(
            "string(//div[contains(@class,'cover_img')]//img[contains(@class,'cover')]/@src)"
        )
        # Drop resizing query parameters (&w=…/&h=…) to get the full-size cover.
        img_url = re.sub(r"&[wh]=\d+", "", img_url) if img_url else None  # type: ignore

        data = {
            "title": title,
            "subtitle": subtitle,
            "localized_title": [{"lang": "zh-tw", "text": title}],
            "localized_subtitle": [{"lang": "zh-tw", "text": subtitle}],
            "localized_description": [{"lang": "zh-tw", "text": brief}],
            "orig_title": orig_title,
            "author": authors,
            "translator": translators,
            "language": language,
            "pub_house": pub_house,
            "pub_year": pub_year,
            "pub_month": pub_month,
            "binding": binding,
            "format": binding_to_format(str(binding)) if binding else None,
            "price": price,
            "pages": pages,
            "isbn": isbn,
            "brief": brief,
            "contents": contents,
            "series": series,
            "imprint": imprint,
            "cover_image_url": img_url,
        }

        pd = ResourceContent(metadata=data)
        t, n = detect_isbn_asin(str(isbn))
        if t:
            pd.lookup_ids[t] = n
        # Only attempt the cover download when a URL was actually found.
        if img_url:
            (
                pd.cover_image,
                pd.cover_image_extention,
            ) = BasicImageDownloader.download_image(img_url, self.url)
        return pd
|