lib.itmens/catalog/sites/douban_music.py

144 lines
5.1 KiB
Python
Raw Normal View History

2023-07-19 11:12:58 -04:00
import dateparser
2022-12-08 18:08:05 +00:00
from catalog.common import *
from catalog.models import *
from catalog.music.utils import upc_to_gtin_13
2024-07-13 00:16:47 -04:00
from common.models.lang import detect_language
2022-12-08 18:08:05 +00:00
2025-01-04 11:23:07 -05:00
from .douban import DoubanDownloader, DoubanSearcher
2022-12-08 18:08:05 +00:00
2022-12-15 17:29:35 -05:00
@SiteManager.register
class DoubanMusic(AbstractSite):
    """Scraper for album pages on Douban Music (music.douban.com).

    Recognises the URL forms listed in ``URL_PATTERNS`` and converts an album
    page into a ``ResourceContent`` whose metadata dict carries the title,
    artists, genres, release date, track list, cover image and related
    fields. ``DEFAULT_MODEL`` is ``Album``.
    """

    SITE_NAME = SiteName.Douban
    ID_TYPE = IdType.DoubanMusic
    # Each pattern captures the numeric Douban subject id as group 1.
    # Host dots are escaped so that "." only matches a literal dot.
    URL_PATTERNS = [
        r"\w+://music\.douban\.com/subject/(\d+)/{0,1}",
        r"\w+://m\.douban\.com/music/subject/(\d+)/{0,1}",
        r"\w+://www\.douban\.com/doubanapp/dispatch\?uri=/music/(\d+)/",
        r"\w+://www\.douban\.com/doubanapp/dispatch/music/(\d+)",
    ]
    WIKI_PROPERTY_ID = ""
    DEFAULT_MODEL = Album

    @classmethod
    def id_to_url(cls, id_value):
        """Return the canonical music.douban.com URL for a subject id."""
        # f-string formatting also accepts a non-str id (e.g. an int).
        return f"https://music.douban.com/subject/{id_value}/"

    @classmethod
    def search(cls, q: str, p: int = 1):
        """Search Douban's music category for ``q``.

        Delegates to ``DoubanSearcher.search``; ``p`` is passed through
        unchanged (presumably the result page number -- confirm against
        ``DoubanSearcher``).
        """
        return DoubanSearcher.search(ItemCategory.Music, "music", q, p)

    def _first_text(self, content, xpath):
        """Return the stripped first result of ``xpath``, or None if no match."""
        found = self.query_list(content, xpath)
        return found[0].strip() if found else None

    def _info_field(self, content, label, axis="following-sibling"):
        """Return the text that follows the ``label`` span in the info box.

        ``label`` is the exact span text including its trailing colon.
        ``axis`` selects the XPath axis used to reach the value text node
        ("following-sibling" or "following"). Returns None when the label is
        not present on the page.
        """
        return self._first_text(
            content,
            f"//div[@id='info']//span[text()='{label}']/{axis}::text()[1]",
        )

    @staticmethod
    def _split_values(text):
        """Split a " / "-separated field into stripped, non-empty values."""
        if not text:
            return []
        # Dropping blanks keeps an empty field from yielding [""].
        return [part.strip() for part in text.split(" / ") if part.strip()]

    def scrape(self):
        """Download the album page at ``self.url`` and extract its metadata.

        Returns:
            A ``ResourceContent`` whose ``metadata`` holds the scraped fields.
            ``lookup_ids`` gets ``IdType.GTIN`` and/or ``IdType.ISRC`` when
            the page lists a barcode or ISRC.

        Raises:
            ParseError: if the page has no (non-empty) title.
        """
        content = DoubanDownloader(self.url).download().html()

        title = self._first_text(content, "//h1/span/text()")
        if not title:
            raise ParseError(self, "title")

        artist_names = self.query_list(
            content, "//div[@id='info']/span/span[@class='pl']/a/text()"
        )
        # Each name is capped at 200 characters; no artists at all -> None.
        artist = [name[:200] for name in artist_names] or None

        genre = self._split_values(
            self._info_field(content, "流派:", axis="following")
        )

        date_text = self._info_field(content, "发行时间:", axis="following")
        parsed_date = dateparser.parse(date_text) if date_text else None
        release_date = parsed_date.strftime("%Y-%m-%d") if parsed_date else None

        company = self._info_field(content, "出版者:", axis="following")

        tracks = self.query_list(
            content, "//div[@class='track-list']/div[@class='indent']/div/text()"
        )
        track_list = "\n".join(t.strip() for t in tracks) if tracks else None

        # Prefer the expanded description; fall back to the summary span.
        brief_nodes = self.query_list(
            content, "//span[@class='all hidden']"
        ) or self.query_list(content, "//span[@property='v:summary']")
        brief = (
            "\n".join(line.strip() for line in brief_nodes[0].xpath("./text()"))
            if brief_nodes
            else None
        )

        img_url = self._first_text(content, "//div[@id='mainpic']//img/@src")

        other_title = self._split_values(self._info_field(content, "又名:"))

        # Detect language from the real text only: a missing brief must not
        # inject the literal string "None" into the detector input.
        lang = detect_language(" ".join(s for s in (title, brief) if s))
        localized_title = [{"lang": lang, "text": title}]
        localized_title += [
            {"lang": detect_language(t), "text": t} for t in other_title
        ]

        data = {
            "title": title,
            "localized_title": localized_title,
            "localized_description": [{"lang": lang, "text": brief}] if brief else [],
            "artist": artist,
            "genre": genre,
            "release_date": release_date,
            "duration": None,
            "company": [company] if company else [],
            "track_list": track_list,
            "brief": brief,
            "cover_image_url": img_url,
        }

        # Optional fields: the key is set only when the label exists on the page.
        for key, label in (
            ("album_type", "专辑类型:"),
            ("media", "介质:"),
            ("disc_count", "碟片数:"),
        ):
            value = self._info_field(content, label)
            if value is not None:
                data[key] = value

        isrc = self._info_field(content, "ISRC:")
        barcode = self._info_field(content, "条形码:")
        gtin = upc_to_gtin_13(barcode) if barcode is not None else None

        pd = ResourceContent(metadata=data)
        if gtin:
            pd.lookup_ids[IdType.GTIN] = gtin
        if isrc:
            pd.lookup_ids[IdType.ISRC] = isrc
        return pd