2022-12-08 18:08:05 +00:00
|
|
|
from catalog.common import *
|
|
|
|
from catalog.models import *
|
2023-02-03 16:33:58 -05:00
|
|
|
from catalog.music.utils import upc_to_gtin_13
|
2022-12-08 18:08:05 +00:00
|
|
|
from .douban import DoubanDownloader
|
|
|
|
import dateparser
|
|
|
|
import logging
|
|
|
|
|
|
|
|
|
|
|
|
# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
2022-12-15 17:29:35 -05:00
|
|
|
@SiteManager.register
class DoubanMusic(AbstractSite):
    """Site adapter that scrapes album pages from Douban Music.

    Recognizes both the desktop (``music.douban.com``) and mobile
    (``m.douban.com/music``) subject URLs and maps them to ``Album``.
    """

    SITE_NAME = SiteName.Douban
    ID_TYPE = IdType.DoubanMusic
    URL_PATTERNS = [
        r"\w+://music\.douban\.com/subject/(\d+)/{0,1}",
        # Dots are escaped so that only the literal host "m.douban.com" matches.
        r"\w+://m\.douban\.com/music/subject/(\d+)/{0,1}",
    ]
    WIKI_PROPERTY_ID = ""
    DEFAULT_MODEL = Album

    @classmethod
    def id_to_url(cls, id_value):
        """Return the canonical desktop URL for the subject id ``id_value``."""
        return "https://music.douban.com/subject/" + id_value + "/"

    def scrape(self):
        """Download the album page at ``self.url`` and extract its metadata.

        Returns:
            A ``ResourceContent`` whose ``metadata`` holds the scraped fields.
            ``lookup_ids`` receives GTIN and ISRC when the page lists them, and
            the cover image is attached when it can be downloaded.

        Raises:
            ParseError: if no non-empty title is found on the page.
        """
        content = DoubanDownloader(self.url).download().html()

        def info_text(label, axis="following-sibling"):
            """Return the stripped text next to ``label`` in the #info box.

            Returns ``None`` when the label is absent. ``axis`` is the XPath
            axis used to reach the text node after the label's <span>.
            """
            nodes = content.xpath(
                f"//div[@id='info']//span[text()='{label}']/{axis}::text()[1]"
            )
            return nodes[0].strip() if nodes else None

        title_nodes = content.xpath("//h1/span/text()")
        title = title_nodes[0].strip() if title_nodes else None
        if not title:
            raise ParseError(self, "title")

        artist_nodes = content.xpath(
            "//div[@id='info']/span/span[@class='pl']/a/text()"
        )
        # Each artist name is capped at 200 characters.
        artist = [name[:200] for name in artist_nodes] if artist_nodes else None

        genre = info_text("流派:", axis="following")

        # dateparser.parse() returns None for text it cannot interpret, so the
        # result must be checked before formatting it.
        raw_date = info_text("发行时间:", axis="following")
        parsed_date = dateparser.parse(raw_date) if raw_date else None
        release_date = parsed_date.strftime("%Y-%m-%d") if parsed_date else None

        company = info_text("出版者:", axis="following")

        track_nodes = content.xpath(
            "//div[@class='track-list']/div[@class='indent']/div/text()"
        )
        track_list = (
            "\n".join(track.strip() for track in track_nodes)
            if track_nodes
            else None
        )

        # Prefer the full (initially hidden) description; fall back to the summary.
        brief_nodes = content.xpath("//span[@class='all hidden']")
        if not brief_nodes:
            brief_nodes = content.xpath("//span[@property='v:summary']")
        brief = (
            "\n".join(line.strip() for line in brief_nodes[0].xpath("./text()"))
            if brief_nodes
            else None
        )

        img_nodes = content.xpath("//div[@id='mainpic']//img/@src")
        img_url = img_nodes[0].strip() if img_nodes else None

        data = {
            "title": title,
            "artist": artist,
            "genre": genre,
            "release_date": release_date,
            "duration": None,
            # Avoid emitting [None] when the page has no publisher field.
            "company": [company] if company else [],
            "track_list": track_list,
            "brief": brief,
            "cover_image_url": img_url,
        }

        # Optional fields are added to the metadata only when the page has them.
        optional_fields = (
            ("other_title", "又名:"),
            ("album_type", "专辑类型:"),
            ("media", "介质:"),
            ("disc_count", "碟片数:"),
        )
        for key, label in optional_fields:
            value = info_text(label)
            if value is not None:
                data[key] = value

        isrc = info_text("ISRC:")
        barcode = info_text("条形码:")
        gtin = upc_to_gtin_13(barcode) if barcode is not None else None

        pd = ResourceContent(metadata=data)
        if gtin:
            pd.lookup_ids[IdType.GTIN] = gtin
        if isrc:
            pd.lookup_ids[IdType.ISRC] = isrc

        cover_url = pd.metadata["cover_image_url"]
        if cover_url:
            imgdl = BasicImageDownloader(cover_url, self.url)
            try:
                pd.cover_image = imgdl.download().content
                pd.cover_image_extention = imgdl.extention
            except Exception:
                # Cover download is best-effort: a failure must not abort the scrape.
                _logger.debug(
                    "failed to download cover for %s from %s", self.url, cover_url
                )
        return pd
|