add support for books.com.tw
This commit is contained in:
parent a6787b186d
commit 8e3b96ef70

9 changed files with 1385 additions and 3 deletions
@@ -169,6 +169,54 @@ class GoogleBooksTestCase(TestCase):
        self.assertEqual(site.resource.item.title, "1984 Nineteen Eighty-Four")


class BooksTWTestCase(TestCase):
    def test_parse(self):
        t_type = IdType.BooksTW
        t_id = "0010947886"
        t_url = "https://www.books.com.tw/products/0010947886?loc=P_br_60nq68yhb_D_2aabdc_B_1"
        t_url2 = "https://www.books.com.tw/products/0010947886"
        p1 = SiteManager.get_site_by_url(t_url)
        p2 = SiteManager.get_site_by_url(t_url2)
        self.assertIsNotNone(p1)
        self.assertEqual(p1.url, t_url2)
        self.assertEqual(p1.ID_TYPE, t_type)
        self.assertEqual(p1.id_value, t_id)
        self.assertEqual(p2.url, t_url2)

    @use_local_response
    def test_scrape(self):
        t_url = "https://www.books.com.tw/products/0010947886"
        site = SiteManager.get_site_by_url(t_url)
        self.assertIsNotNone(site)
        self.assertEqual(site.ready, False)
        site.get_resource_ready()
        self.assertEqual(site.ready, True)
        self.assertEqual(
            site.resource.metadata.get("title"),
            "阿拉伯人三千年:從民族、部落、語言、文化、宗教到帝國,綜覽阿拉伯世界的崛起、衰落與再興",
        )
        self.assertEqual(
            site.resource.metadata.get("orig_title"),
            "Arabs: A 3000-Year History of Peoples, Tribes and Empires",
        )
        self.assertEqual(site.resource.metadata.get("isbn"), "9786263152236")
        self.assertEqual(site.resource.metadata.get("author"), ["Tim Mackintosh-Smith"])
        self.assertEqual(site.resource.metadata.get("translator"), ["吳莉君"])
        self.assertEqual(site.resource.metadata.get("language"), "繁體中文")
        self.assertEqual(site.resource.metadata.get("pub_house"), "臉譜")
        self.assertEqual(site.resource.metadata.get("pub_year"), 2023)
        self.assertEqual(site.resource.metadata.get("pub_month"), 2)
        self.assertEqual(site.resource.metadata.get("binding"), "平裝")
        self.assertEqual(site.resource.metadata.get("pages"), 792)
        self.assertEqual(site.resource.metadata.get("price"), "1050 NTD")
        self.assertEqual(site.resource.id_type, IdType.BooksTW)
        self.assertEqual(site.resource.id_value, "0010947886")
        self.assertEqual(site.resource.item.isbn, "9786263152236")
        self.assertEqual(
            site.resource.item.title, "阿拉伯人三千年:從民族、部落、語言、文化、宗教到帝國,綜覽阿拉伯世界的崛起、衰落與再興"
        )


class DoubanBookTestCase(TestCase):
    def setUp(self):
        pass
@@ -25,6 +25,7 @@ class SiteName(models.TextChoices):
    Douban = "douban", _("豆瓣")
    Goodreads = "goodreads", _("Goodreads")
    GoogleBooks = "googlebooks", _("谷歌图书")
    BooksTW = "bookstw", _("博客来")
    IMDB = "imdb", _("IMDB")
    TMDB = "tmdb", _("The Movie Database")
    Bandcamp = "bandcamp", _("Bandcamp")

@@ -61,6 +62,7 @@ class IdType(models.TextChoices):
    DoubanMusic = "doubanmusic", _("豆瓣音乐")
    DoubanGame = "doubangame", _("豆瓣游戏")
    DoubanDrama = "doubandrama", _("豆瓣舞台剧")
    BooksTW = "bookstw", _("博客来图书")
    Bandcamp = "bandcamp", _("Bandcamp")
    Spotify_Album = "spotify_album", _("Spotify专辑")
    Spotify_Show = "spotify_show", _("Spotify播客")
catalog/migrations/0005_bookstw.py (new file, 111 lines)
@@ -0,0 +1,111 @@
# Generated by Django 3.2.18 on 2023-04-25 23:02

from django.db import migrations, models


class Migration(migrations.Migration):
    dependencies = [
        ("catalog", "0004_podcast_no_real_change"),
    ]

    operations = [
        migrations.AlterField(
            model_name="externalresource",
            name="id_type",
            field=models.CharField(
                choices=[
                    ("wikidata", "维基数据"),
                    ("isbn10", "ISBN10"),
                    ("isbn", "ISBN"),
                    ("asin", "ASIN"),
                    ("issn", "ISSN"),
                    ("cubn", "统一书号"),
                    ("isrc", "ISRC"),
                    ("gtin", "GTIN UPC EAN码"),
                    ("rss", "RSS Feed URL"),
                    ("imdb", "IMDb"),
                    ("tmdb_tv", "TMDB剧集"),
                    ("tmdb_tvseason", "TMDB剧集"),
                    ("tmdb_tvepisode", "TMDB剧集"),
                    ("tmdb_movie", "TMDB电影"),
                    ("goodreads", "Goodreads"),
                    ("goodreads_work", "Goodreads著作"),
                    ("googlebooks", "谷歌图书"),
                    ("doubanbook", "豆瓣读书"),
                    ("doubanbook_work", "豆瓣读书著作"),
                    ("doubanmovie", "豆瓣电影"),
                    ("doubanmusic", "豆瓣音乐"),
                    ("doubangame", "豆瓣游戏"),
                    ("doubandrama", "豆瓣舞台剧"),
                    ("bookstw", "博客来图书"),
                    ("bandcamp", "Bandcamp"),
                    ("spotify_album", "Spotify专辑"),
                    ("spotify_show", "Spotify播客"),
                    ("discogs_release", "Discogs Release"),
                    ("discogs_master", "Discogs Master"),
                    ("musicbrainz", "MusicBrainz ID"),
                    ("doubanbook_author", "豆瓣读书作者"),
                    ("doubanmovie_celebrity", "豆瓣电影影人"),
                    ("goodreads_author", "Goodreads作者"),
                    ("spotify_artist", "Spotify艺术家"),
                    ("tmdb_person", "TMDB影人"),
                    ("igdb", "IGDB游戏"),
                    ("steam", "Steam游戏"),
                    ("bangumi", "Bangumi"),
                    ("apple_podcast", "苹果播客"),
                ],
                max_length=50,
                verbose_name="IdType of the source site",
            ),
        ),
        migrations.AlterField(
            model_name="itemlookupid",
            name="id_type",
            field=models.CharField(
                blank=True,
                choices=[
                    ("wikidata", "维基数据"),
                    ("isbn10", "ISBN10"),
                    ("isbn", "ISBN"),
                    ("asin", "ASIN"),
                    ("issn", "ISSN"),
                    ("cubn", "统一书号"),
                    ("isrc", "ISRC"),
                    ("gtin", "GTIN UPC EAN码"),
                    ("rss", "RSS Feed URL"),
                    ("imdb", "IMDb"),
                    ("tmdb_tv", "TMDB剧集"),
                    ("tmdb_tvseason", "TMDB剧集"),
                    ("tmdb_tvepisode", "TMDB剧集"),
                    ("tmdb_movie", "TMDB电影"),
                    ("goodreads", "Goodreads"),
                    ("goodreads_work", "Goodreads著作"),
                    ("googlebooks", "谷歌图书"),
                    ("doubanbook", "豆瓣读书"),
                    ("doubanbook_work", "豆瓣读书著作"),
                    ("doubanmovie", "豆瓣电影"),
                    ("doubanmusic", "豆瓣音乐"),
                    ("doubangame", "豆瓣游戏"),
                    ("doubandrama", "豆瓣舞台剧"),
                    ("bookstw", "博客来图书"),
                    ("bandcamp", "Bandcamp"),
                    ("spotify_album", "Spotify专辑"),
                    ("spotify_show", "Spotify播客"),
                    ("discogs_release", "Discogs Release"),
                    ("discogs_master", "Discogs Master"),
                    ("musicbrainz", "MusicBrainz ID"),
                    ("doubanbook_author", "豆瓣读书作者"),
                    ("doubanmovie_celebrity", "豆瓣电影影人"),
                    ("goodreads_author", "Goodreads作者"),
                    ("spotify_artist", "Spotify艺术家"),
                    ("tmdb_person", "TMDB影人"),
                    ("igdb", "IGDB游戏"),
                    ("steam", "Steam游戏"),
                    ("bangumi", "Bangumi"),
                    ("apple_podcast", "苹果播客"),
                ],
                max_length=50,
                verbose_name="源网站",
            ),
        ),
    ]
@@ -18,3 +18,4 @@ from .bandcamp import Bandcamp
from .bangumi import Bangumi
from .discogs import DiscogsRelease
from .discogs import DiscogsMaster
from .bookstw import BooksTW
catalog/sites/bookstw.py (new file, 137 lines)
@@ -0,0 +1,137 @@
from catalog.common import *
from catalog.book.models import *
from catalog.book.utils import *
from .douban import *
import logging


_logger = logging.getLogger(__name__)


@SiteManager.register
class BooksTW(AbstractSite):
    SITE_NAME = SiteName.BooksTW
    ID_TYPE = IdType.BooksTW
    URL_PATTERNS = [
        r"\w+://www\.books\.com\.tw/products/(\w+)",
    ]
    WIKI_PROPERTY_ID = "?"
    DEFAULT_MODEL = Edition

    @classmethod
    def id_to_url(cls, id_value):
        return "https://www.books.com.tw/products/" + id_value

    def scrape(self):
        content = BasicDownloader(self.url).download().html()

        isbn_elem = content.xpath(
            "//div[@class='bd']/ul/li[starts-with(text(),'ISBN:')]/text()"
        )
        isbn = isbn_elem[0].strip().split(":", 1)[1].strip() if isbn_elem else None

        # isbn_elem = content.xpath(
        #     "//div[@class='bd']/ul/li[starts-with(text(),'EISBN')]/text()"
        # )
        # eisbn = isbn_elem[0].strip().split(":", 1)[1].strip() if isbn_elem else None

        title = content.xpath("string(//h1)") or f"Unknown Title {self.id_value}"
        subtitle = None
        orig_title = content.xpath("string(//h1/following-sibling::h2)")

        authors = content.xpath("string(//div/ul/li[contains(text(),'作者:')])")
        authors = authors.strip().split(":", 1)[1].split(",") if authors else []
        authors = [s.strip() for s in authors]
        # author_orig = content.xpath("string(//div/ul/li[contains(text(),'原文作者:')])")

        translators = content.xpath("string(//div/ul/li[contains(text(),'譯者:')])")
        translators = (
            translators.strip().split(":", 1)[1].split(",") if translators else []
        )
        translators = [s.strip() for s in translators]

        language_elem = content.xpath("//div/ul/li[starts-with(text(),'語言:')]/text()")
        language = (
            language_elem[0].strip().split(":")[1].strip() if language_elem else None
        )

        pub_house = content.xpath("string(//div/ul/li[contains(text(),'出版社:')])")
        pub_house = (
            pub_house.strip().split(":", 1)[1].strip().split(" ", 1)[0]
            if pub_house
            else None
        )

        pub_date = content.xpath("string(//div/ul/li[contains(text(),'出版日期:')])")
        pub_date = re.match(
            r"(\d+)/(\d+)/(\d+)\s*$",
            pub_date.strip().split(":", 1)[1].strip().split(" ", 1)[0]
            if pub_date
            else "",
        )
        if pub_date:
            pub_year = int(pub_date[1])
            pub_month = int(pub_date[2])
        else:
            pub_year = None
            pub_month = None

        spec = content.xpath("string(//div/ul/li[contains(text(),'規格:')])")
        spec = spec.strip().split(":", 1)[1].strip().split("/") if spec else []
        if len(spec) > 1:
            binding = spec[0].strip()
            pages = spec[1].strip().split("頁")
            pages = int(pages[0]) if len(pages) > 1 else None
            if pages and (pages > 999999 or pages < 1):
                pages = None
        else:
            binding = None
            pages = None

        price = content.xpath("string(//div/ul/li[contains(text(),'定價:')])")
        price = (
            price.strip().split(":", 1)[1].split("元")[0].strip() + " NTD"
            if price
            else None
        )

        brief = content.xpath("string(//h3[text()='內容簡介']/following-sibling::div)")
        contents = content.xpath("string(//h3[text()='目錄']/following-sibling::div)")

        series = None
        imprint = None

        img_url = content.xpath(
            "string(//div[contains(@class,'cover_img')]//img[contains(@class,'cover')]/@src)"
        )
        img_url = re.sub(r"&[wh]=\d+", "", img_url) if img_url else None

        data = {
            "title": title,
            "subtitle": subtitle,
            "orig_title": orig_title,
            "author": authors,
            "translator": translators,
            "language": language,
            "pub_house": pub_house,
            "pub_year": pub_year,
            "pub_month": pub_month,
            "binding": binding,
            "price": price,
            "pages": pages,
            "isbn": isbn,
            "brief": brief,
            "contents": contents,
            "series": series,
            "imprint": imprint,
            "cover_image_url": img_url,
        }

        pd = ResourceContent(metadata=data)
        t, n = detect_isbn_asin(isbn)
        if t:
            pd.lookup_ids[t] = n
        pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(
            img_url, self.url
        )
        return pd
@@ -47,7 +47,7 @@ class RSS(AbstractSite):
     @classmethod
     def validate_url_fallback(cls, url):
-        val = URLValidator(verify_exists=False)
+        val = URLValidator()
         try:
             val(url)
             return cls.parse_feed_from_url(url) is not None
@@ -15,6 +15,8 @@ $bangumi-color-primary: #F09199
$bangumi-color-secondary: #FCFCFC
$goodreads-color-primary: #372213
$goodreads-color-secondary: #F4F1EA
$bookstw-color-primary: white
$bookstw-color-secondary: #7FBA19
$tmdb-color-primary: #91CCA3
$tmdb-color-secondary: #1FB4E2
$bandcamp-color-primary: #28A0C1

@@ -78,6 +80,10 @@ $bandcamp-color-secondary: white
    background: $goodreads-color-secondary
    color: $goodreads-color-primary
    font-weight: lighter
  &.source-label__bookstw
    background: $bookstw-color-secondary
    color: $bookstw-color-primary
    font-weight: lighter
  &.source-label__tmdb
    background: linear-gradient(90deg, $tmdb-color-primary, $tmdb-color-secondary)
    color: white
@@ -88,14 +88,14 @@ classDiagram

 Add a new site
 --------------
-- add a new item to `IdType` enum in `catalog/common/models.py`
+- add a new value to `IdType` and `SiteName` in `catalog/common/models.py`
 - add a new file in `catalog/sites/`, a new class inherits `AbstractSite`, with:
   * `SITE_NAME`
   * `ID_TYPE`
   * `URL_PATTERNS`
   * `WIKI_PROPERTY_ID` (not used now)
   * `DEFAULT_MODEL` (unless specified in `scrape()` return val)
-  * a `classmethod` `id_to_url()`
+  * a classmethod `id_to_url()`
   * a method `scrape()` returns a `ResourceContent` object
   * ...
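Putting the checklist above together, here is a minimal, illustrative sketch of what a new site class could look like. It is not part of this commit: the `ExampleShop` name, its URL pattern, and the reuse of the BooksTW `SiteName`/`IdType` values are placeholders, and the structure simply mirrors `catalog/sites/bookstw.py` above.

```python
# Hypothetical sketch only -- mirrors the structure of catalog/sites/bookstw.py.
# "ExampleShop" and its URL are placeholders, not a real site in this codebase.
from catalog.common import *
from catalog.book.models import Edition


@SiteManager.register
class ExampleShop(AbstractSite):
    SITE_NAME = SiteName.BooksTW  # a real site would add its own SiteName value
    ID_TYPE = IdType.BooksTW      # ...and its own IdType value
    URL_PATTERNS = [r"\w+://www\.example\.com/products/(\w+)"]
    WIKI_PROPERTY_ID = "?"  # not used now
    DEFAULT_MODEL = Edition

    @classmethod
    def id_to_url(cls, id_value):
        return "https://www.example.com/products/" + id_value

    def scrape(self):
        # download the page and extract whatever metadata the site exposes
        content = BasicDownloader(self.url).download().html()
        title = content.xpath("string(//h1)") or f"Unknown Title {self.id_value}"
        return ResourceContent(metadata={"title": title})
```

With this in place, `SiteManager.get_site_by_url()` can resolve a matching URL to the class, as exercised by the BooksTW tests earlier in this commit.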
test_data/https___www_books_com_tw_products_0010947886 (new file, 1077 lines)
File diff suppressed because it is too large