add support for books.com.tw

This commit is contained in:
Your Name 2023-04-25 19:04:57 -04:00 committed by Henri Dickson
parent a6787b186d
commit 8e3b96ef70
9 changed files with 1385 additions and 3 deletions

View file

@@ -169,6 +169,54 @@ class GoogleBooksTestCase(TestCase):
self.assertEqual(site.resource.item.title, "1984 Nineteen Eighty-Four") self.assertEqual(site.resource.item.title, "1984 Nineteen Eighty-Four")
class BooksTWTestCase(TestCase):
    """Tests for the books.com.tw (BooksTW) site adapter."""

    def test_parse(self):
        """URL parsing drops tracking query params and extracts the product id."""
        expected_type = IdType.BooksTW
        expected_id = "0010947886"
        tracked_url = "https://www.books.com.tw/products/0010947886?loc=P_br_60nq68yhb_D_2aabdc_B_1"
        canonical_url = "https://www.books.com.tw/products/0010947886"

        site_a = SiteManager.get_site_by_url(tracked_url)
        site_b = SiteManager.get_site_by_url(canonical_url)

        self.assertIsNotNone(site_a)
        self.assertEqual(site_a.url, canonical_url)
        self.assertEqual(site_a.ID_TYPE, expected_type)
        self.assertEqual(site_a.id_value, expected_id)
        self.assertEqual(site_b.url, canonical_url)

    @use_local_response
    def test_scrape(self):
        """Scraping a product page fills resource metadata and the linked item."""
        canonical_url = "https://www.books.com.tw/products/0010947886"
        site = SiteManager.get_site_by_url(canonical_url)
        self.assertIsNotNone(site)
        self.assertEqual(site.ready, False)
        site.get_resource_ready()
        self.assertEqual(site.ready, True)

        # Table-driven check of every scraped metadata field.
        expected_metadata = {
            "title": "阿拉伯人三千年:從民族、部落、語言、文化、宗教到帝國,綜覽阿拉伯世界的崛起、衰落與再興",
            "orig_title": "Arabs: A 3000-Year History of Peoples, Tribes and Empires",
            "isbn": "9786263152236",
            "author": ["Tim Mackintosh-Smith"],
            "translator": ["吳莉君"],
            "language": "繁體中文",
            "pub_house": "臉譜",
            "pub_year": 2023,
            "pub_month": 2,
            "binding": "平裝",
            "pages": 792,
            "price": "1050 NTD",
        }
        for field, value in expected_metadata.items():
            self.assertEqual(site.resource.metadata.get(field), value)

        self.assertEqual(site.resource.id_type, IdType.BooksTW)
        self.assertEqual(site.resource.id_value, "0010947886")
        self.assertEqual(site.resource.item.isbn, "9786263152236")
        self.assertEqual(site.resource.item.title, expected_metadata["title"])
class DoubanBookTestCase(TestCase): class DoubanBookTestCase(TestCase):
def setUp(self): def setUp(self):
pass pass

View file

@ -25,6 +25,7 @@ class SiteName(models.TextChoices):
Douban = "douban", _("豆瓣") Douban = "douban", _("豆瓣")
Goodreads = "goodreads", _("Goodreads") Goodreads = "goodreads", _("Goodreads")
GoogleBooks = "googlebooks", _("谷歌图书") GoogleBooks = "googlebooks", _("谷歌图书")
BooksTW = "bookstw", _("博客来")
IMDB = "imdb", _("IMDB") IMDB = "imdb", _("IMDB")
TMDB = "tmdb", _("The Movie Database") TMDB = "tmdb", _("The Movie Database")
Bandcamp = "bandcamp", _("Bandcamp") Bandcamp = "bandcamp", _("Bandcamp")
@ -61,6 +62,7 @@ class IdType(models.TextChoices):
DoubanMusic = "doubanmusic", _("豆瓣音乐") DoubanMusic = "doubanmusic", _("豆瓣音乐")
DoubanGame = "doubangame", _("豆瓣游戏") DoubanGame = "doubangame", _("豆瓣游戏")
DoubanDrama = "doubandrama", _("豆瓣舞台剧") DoubanDrama = "doubandrama", _("豆瓣舞台剧")
BooksTW = "bookstw", _("博客来图书")
Bandcamp = "bandcamp", _("Bandcamp") Bandcamp = "bandcamp", _("Bandcamp")
Spotify_Album = "spotify_album", _("Spotify专辑") Spotify_Album = "spotify_album", _("Spotify专辑")
Spotify_Show = "spotify_show", _("Spotify播客") Spotify_Show = "spotify_show", _("Spotify播客")

View file

@@ -0,0 +1,111 @@
# Generated by Django 3.2.18 on 2023-04-25 23:02

from django.db import migrations, models

# Both AlterField operations below carry the same id_type choice list;
# declaring it once keeps the two copies from drifting apart.
ID_TYPE_CHOICES = [
    ("wikidata", "维基数据"),
    ("isbn10", "ISBN10"),
    ("isbn", "ISBN"),
    ("asin", "ASIN"),
    ("issn", "ISSN"),
    ("cubn", "统一书号"),
    ("isrc", "ISRC"),
    ("gtin", "GTIN UPC EAN码"),
    ("rss", "RSS Feed URL"),
    ("imdb", "IMDb"),
    ("tmdb_tv", "TMDB剧集"),
    ("tmdb_tvseason", "TMDB剧集"),
    ("tmdb_tvepisode", "TMDB剧集"),
    ("tmdb_movie", "TMDB电影"),
    ("goodreads", "Goodreads"),
    ("goodreads_work", "Goodreads著作"),
    ("googlebooks", "谷歌图书"),
    ("doubanbook", "豆瓣读书"),
    ("doubanbook_work", "豆瓣读书著作"),
    ("doubanmovie", "豆瓣电影"),
    ("doubanmusic", "豆瓣音乐"),
    ("doubangame", "豆瓣游戏"),
    ("doubandrama", "豆瓣舞台剧"),
    ("bookstw", "博客来图书"),
    ("bandcamp", "Bandcamp"),
    ("spotify_album", "Spotify专辑"),
    ("spotify_show", "Spotify播客"),
    ("discogs_release", "Discogs Release"),
    ("discogs_master", "Discogs Master"),
    ("musicbrainz", "MusicBrainz ID"),
    ("doubanbook_author", "豆瓣读书作者"),
    ("doubanmovie_celebrity", "豆瓣电影影人"),
    ("goodreads_author", "Goodreads作者"),
    ("spotify_artist", "Spotify艺术家"),
    ("tmdb_person", "TMDB影人"),
    ("igdb", "IGDB游戏"),
    ("steam", "Steam游戏"),
    ("bangumi", "Bangumi"),
    ("apple_podcast", "苹果播客"),
]


class Migration(migrations.Migration):
    """Re-declare id_type choices after adding the 'bookstw' IdType."""

    dependencies = [
        ("catalog", "0004_podcast_no_real_change"),
    ]

    operations = [
        migrations.AlterField(
            model_name="externalresource",
            name="id_type",
            field=models.CharField(
                choices=ID_TYPE_CHOICES,
                max_length=50,
                verbose_name="IdType of the source site",
            ),
        ),
        migrations.AlterField(
            model_name="itemlookupid",
            name="id_type",
            field=models.CharField(
                blank=True,
                choices=ID_TYPE_CHOICES,
                max_length=50,
                verbose_name="源网站",
            ),
        ),
    ]

View file

@ -18,3 +18,4 @@ from .bandcamp import Bandcamp
from .bangumi import Bangumi from .bangumi import Bangumi
from .discogs import DiscogsRelease from .discogs import DiscogsRelease
from .discogs import DiscogsMaster from .discogs import DiscogsMaster
from .bookstw import BooksTW

137
catalog/sites/bookstw.py Normal file
View file

@@ -0,0 +1,137 @@
from catalog.common import *
from catalog.book.models import *
from catalog.book.utils import *
from .douban import *
import logging
_logger = logging.getLogger(__name__)
@SiteManager.register
class BooksTW(AbstractSite):
    """Site adapter for books.com.tw (博客來) book product pages.

    Scrapes a product page into an ``Edition`` resource. Field labels on the
    page use full-width colons, e.g. ``ISBN:9786263152236`` or
    ``語言:繁體中文``, so each value is the text after the first ``:``.
    """

    SITE_NAME = SiteName.BooksTW
    ID_TYPE = IdType.BooksTW
    URL_PATTERNS = [
        r"\w+://www\.books\.com\.tw/products/(\w+)",
    ]
    WIKI_PROPERTY_ID = "?"
    DEFAULT_MODEL = Edition

    @classmethod
    def id_to_url(cls, id_value):
        """Return the canonical product URL for a books.com.tw product id."""
        return "https://www.books.com.tw/products/" + id_value

    def scrape(self):
        """Fetch the product page and return a populated ResourceContent.

        BUGFIX: the previous version called ``split("", 1)`` / ``split("")``
        throughout — an empty separator raises ``ValueError`` at runtime.
        The separators are restored here: the full-width colon ``:`` after
        each label, ``頁`` for the page count, and ``元`` for the NTD price
        (the latter two confirmed by the adapter's tests expecting
        ``pages == 792`` and ``price == "1050 NTD"``).
        """
        content = BasicDownloader(self.url).download().html()

        isbn_elem = content.xpath(
            "//div[@class='bd']/ul/li[starts-with(text(),'ISBN')]/text()"
        )
        isbn = isbn_elem[0].strip().split(":", 1)[1].strip() if isbn_elem else None
        # E-book pages carry an EISBN label instead; not handled yet.
        # isbn_elem = content.xpath(
        #     "//div[@class='bd']/ul/li[starts-with(text(),'EISBN')]/text()"
        # )
        # eisbn = isbn_elem[0].strip().split(":", 1)[1].strip() if isbn_elem else None

        title = content.xpath("string(//h1)") or f"Unknown Title {self.id_value}"
        subtitle = None
        orig_title = content.xpath("string(//h1/following-sibling::h2)")

        authors = content.xpath("string(//div/ul/li[contains(text(),'作者:')])")
        authors = authors.strip().split(":", 1)[1].split(",") if authors else []
        authors = [s.strip() for s in authors]
        # author_orig = content.xpath("string(//div/ul/li[contains(text(),'原文作者:')])")

        translators = content.xpath("string(//div/ul/li[contains(text(),'譯者:')])")
        translators = (
            translators.strip().split(":", 1)[1].split(",") if translators else []
        )
        translators = [s.strip() for s in translators]

        language_elem = content.xpath("//div/ul/li[starts-with(text(),'語言:')]/text()")
        language = (
            language_elem[0].strip().split(":")[1].strip() if language_elem else None
        )

        # Publisher line may carry trailing link text after a space.
        pub_house = content.xpath("string(//div/ul/li[contains(text(),'出版社:')])")
        pub_house = (
            pub_house.strip().split(":", 1)[1].strip().split(" ", 1)[0]
            if pub_house
            else None
        )

        # Date is formatted "YYYY/MM/DD"; keep only year and month.
        pub_date = content.xpath("string(//div/ul/li[contains(text(),'出版日期:')])")
        pub_date = re.match(
            r"(\d+)/(\d+)/(\d+)\s*$",
            pub_date.strip().split(":", 1)[1].strip().split(" ", 1)[0]
            if pub_date
            else "",
        )
        if pub_date:
            pub_year = int(pub_date[1])
            pub_month = int(pub_date[2])
        else:
            pub_year = None
            pub_month = None

        # "規格" is slash-separated, e.g. "平裝 / 792頁 / ...": first part is
        # the binding, second part is the page count ending in "頁".
        spec = content.xpath("string(//div/ul/li[contains(text(),'規格:')])")
        spec = spec.strip().split(":", 1)[1].strip().split("/") if spec else []
        if len(spec) > 1:
            binding = spec[0].strip()
            pages = spec[1].strip().split("頁")
            pages = int(pages[0]) if len(pages) > 1 else None
            if pages and (pages > 999999 or pages < 1):
                pages = None  # discard implausible page counts
        else:
            binding = None
            pages = None

        # "定價:1050元" -> "1050 NTD"
        price = content.xpath("string(//div/ul/li[contains(text(),'定價:')])")
        price = (
            price.strip().split(":", 1)[1].split("元")[0].strip() + " NTD"
            if price
            else None
        )

        brief = content.xpath("string(//h3[text()='內容簡介']/following-sibling::div)")
        contents = content.xpath("string(//h3[text()='目錄']/following-sibling::div)")
        series = None
        imprint = None

        # Strip resize params so we get the full-size cover.
        img_url = content.xpath(
            "string(//div[contains(@class,'cover_img')]//img[contains(@class,'cover')]/@src)"
        )
        img_url = re.sub(r"&[wh]=\d+", "", img_url) if img_url else None

        data = {
            "title": title,
            "subtitle": subtitle,
            "orig_title": orig_title,
            "author": authors,
            "translator": translators,
            "language": language,
            "pub_house": pub_house,
            "pub_year": pub_year,
            "pub_month": pub_month,
            "binding": binding,
            "price": price,
            "pages": pages,
            "isbn": isbn,
            "brief": brief,
            "contents": contents,
            "series": series,
            "imprint": imprint,
            "cover_image_url": img_url,
        }
        pd = ResourceContent(metadata=data)
        t, n = detect_isbn_asin(isbn)
        if t:
            pd.lookup_ids[t] = n
        # NOTE: "extention" spelling matches the ResourceContent field name.
        pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(
            img_url, self.url
        )
        return pd

View file

@ -47,7 +47,7 @@ class RSS(AbstractSite):
@classmethod @classmethod
def validate_url_fallback(cls, url): def validate_url_fallback(cls, url):
val = URLValidator(verify_exists=False) val = URLValidator()
try: try:
val(url) val(url)
return cls.parse_feed_from_url(url) is not None return cls.parse_feed_from_url(url) is not None

View file

@ -15,6 +15,8 @@ $bangumi-color-primary: #F09199
$bangumi-color-secondary: #FCFCFC $bangumi-color-secondary: #FCFCFC
$goodreads-color-primary: #372213 $goodreads-color-primary: #372213
$goodreads-color-secondary: #F4F1EA $goodreads-color-secondary: #F4F1EA
$bookstw-color-primary: white
$bookstw-color-secondary: #7FBA19
$tmdb-color-primary: #91CCA3 $tmdb-color-primary: #91CCA3
$tmdb-color-secondary: #1FB4E2 $tmdb-color-secondary: #1FB4E2
$bandcamp-color-primary: #28A0C1 $bandcamp-color-primary: #28A0C1
@ -78,6 +80,10 @@ $bandcamp-color-secondary: white
background: $goodreads-color-secondary background: $goodreads-color-secondary
color: $goodreads-color-primary color: $goodreads-color-primary
font-weight: lighter font-weight: lighter
&.source-label__bookstw
background: $bookstw-color-secondary
color: $bookstw-color-primary
font-weight: lighter
&.source-label__tmdb &.source-label__tmdb
background: linear-gradient(90deg, $tmdb-color-primary, $tmdb-color-secondary) background: linear-gradient(90deg, $tmdb-color-primary, $tmdb-color-secondary)
color: white color: white

View file

@@ -88,14 +88,14 @@ classDiagram
Add a new site Add a new site
-------------- --------------
- add a new item to `IdType` enum in `catalog/common/models.py` - add a new value to `IdType` and `SiteName` in `catalog/common/models.py`
- add a new file in `catalog/sites/`, a new class inherits `AbstractSite`, with: - add a new file in `catalog/sites/`, a new class inherits `AbstractSite`, with:
* `SITE_NAME` * `SITE_NAME`
* `ID_TYPE` * `ID_TYPE`
* `URL_PATTERNS` * `URL_PATTERNS`
* `WIKI_PROPERTY_ID` (not used now) * `WIKI_PROPERTY_ID` (not used now)
* `DEFAULT_MODEL` (unless specified in `scrape()` return val) * `DEFAULT_MODEL` (unless specified in `scrape()` return val)
* a `classmethod` `id_to_url()` * a classmethod `id_to_url()`
* a method `scrape()` returns a `ResourceContent` object * a method `scrape()` returns a `ResourceContent` object
* ... * ...

File diff suppressed because it is too large Load diff