lib.itmens/catalog/sites/douban_music.py

import dateparser

from catalog.common import *
from catalog.models import *
from catalog.music.utils import upc_to_gtin_13
from common.models.lang import detect_language

from .douban import DoubanDownloader, DoubanSearcher


@SiteManager.register
class DoubanMusic(AbstractSite):
    """Site adapter that scrapes album pages from Douban Music.

    Recognizes desktop, mobile and app-dispatch URLs, and turns a subject
    page into a ``ResourceContent`` whose metadata describes an ``Album``.
    """

    SITE_NAME = SiteName.Douban
    ID_TYPE = IdType.DoubanMusic
    # Each pattern captures the numeric subject id. Host-name dots are escaped
    # so that "." only matches a literal dot rather than any character.
    URL_PATTERNS = [
        r"\w+://music\.douban\.com/subject/(\d+)/{0,1}",
        r"\w+://m\.douban\.com/music/subject/(\d+)/{0,1}",
        r"\w+://www\.douban\.com/doubanapp/dispatch\?uri=/music/(\d+)/",
        r"\w+://www\.douban\.com/doubanapp/dispatch/music/(\d+)",
    ]
    WIKI_PROPERTY_ID = ""
    DEFAULT_MODEL = Album

    @classmethod
    def id_to_url(cls, id_value: str) -> str:
        """Build the canonical desktop subject URL for a Douban Music id."""
        return "https://music.douban.com/subject/" + id_value + "/"

    @classmethod
    def search(cls, q: str, p: int = 1):
        """Search Douban's music category for ``q``, returning page ``p``.

        Delegates entirely to ``DoubanSearcher.search``.
        """
        return DoubanSearcher.search(ItemCategory.Music, "music", q, p)

    def _info_text(self, content, label: str, axis: str = "following-sibling"):
        """Return the stripped text node that follows a label in the info box.

        Args:
            content: parsed page, as returned by the downloader's ``html()``.
            label: exact text of the label ``<span>``, including the colon.
            axis: XPath axis used to reach the value text node; some fields
                are looked up with ``following`` instead of the default.

        Returns:
            The stripped text (possibly empty), or ``None`` when the label
            or its value text node is not present on the page.
        """
        nodes = self.query_list(
            content,
            f"//div[@id='info']//span[text()='{label}']/{axis}::text()[1]",
        )
        return nodes[0].strip() if nodes else None

    @staticmethod
    def _split_values(text) -> list:
        """Split a " / "-separated value into a list, dropping blank entries.

        ``None`` or an empty string yields ``[]`` instead of ``[""]``.
        """
        if not text:
            return []
        return [part.strip() for part in text.split(" / ") if part.strip()]

    def scrape(self):
        """Download the album page at ``self.url`` and extract its metadata.

        Returns:
            A ``ResourceContent`` whose ``metadata`` holds the album fields;
            ``lookup_ids`` additionally carries the GTIN and/or ISRC when the
            page provides them.

        Raises:
            ParseError: if the page has no usable title.
        """
        content = DoubanDownloader(self.url).download().html()

        title_nodes = self.query_list(content, "//h1/span/text()")
        title = title_nodes[0].strip() if title_nodes else None
        if not title:
            raise ParseError(self, "title")

        artist_nodes = self.query_list(
            content, "//div[@id='info']/span/span[@class='pl']/a/text()"
        )
        # Each name is capped at 200 characters; a page without artists keeps
        # the value None (not an empty list).
        artist = [name[:200] for name in artist_nodes] if artist_nodes else None

        # 流派 = genre
        genre = self._split_values(self._info_text(content, "流派:", axis="following"))

        # 发行时间 = release date; normalized to YYYY-MM-DD when parseable
        date_text = self._info_text(content, "发行时间:", axis="following")
        parsed_date = dateparser.parse(date_text) if date_text else None
        release_date = parsed_date.strftime("%Y-%m-%d") if parsed_date else None

        # 出版者 = publisher
        company = self._info_text(content, "出版者:", axis="following")

        track_nodes = self.query_list(
            content, "//div[@class='track-list']/div[@class='indent']/div/text()"
        )
        track_list = (
            "\n".join(track.strip() for track in track_nodes) if track_nodes else None
        )

        # Prefer the hidden full-length description; fall back to the summary.
        brief_nodes = self.query_list(content, "//span[@class='all hidden']")
        if not brief_nodes:
            brief_nodes = self.query_list(content, "//span[@property='v:summary']")
        brief = (
            "\n".join(line.strip() for line in brief_nodes[0].xpath("./text()"))
            if brief_nodes
            else None
        )

        cover_nodes = self.query_list(content, "//div[@id='mainpic']//img/@src")
        img_url = cover_nodes[0].strip() if cover_nodes else None

        # 又名 = also known as
        other_title = self._split_values(self._info_text(content, "又名:"))

        # Detect the language from title plus description; a missing
        # description is skipped so the literal word "None" never ends up in
        # the sample.
        lang = detect_language(" ".join(filter(None, (title, brief))))
        localized_title = [{"lang": lang, "text": title}]
        localized_title += [
            {"lang": detect_language(alias), "text": alias} for alias in other_title
        ]

        data = {
            "title": title,
            "localized_title": localized_title,
            "localized_description": [{"lang": lang, "text": brief}] if brief else [],
            "artist": artist,
            "genre": genre,
            "release_date": release_date,
            "duration": None,
            "company": [company] if company else [],
            "track_list": track_list,
            "brief": brief,
            "cover_image_url": img_url,
        }

        # Optional fields are only added when their label exists on the page.
        # 专辑类型 = album type, 介质 = media, 碟片数 = number of discs
        optional_fields = (
            ("album_type", "专辑类型:"),
            ("media", "介质:"),
            ("disc_count", "碟片数:"),
        )
        for key, label in optional_fields:
            value = self._info_text(content, label)
            if value is not None:
                data[key] = value

        isrc = self._info_text(content, "ISRC:")
        # 条形码 = barcode; normalized through upc_to_gtin_13
        barcode = self._info_text(content, "条形码:")
        gtin = upc_to_gtin_13(barcode) if barcode is not None else None

        pd = ResourceContent(metadata=data)
        if gtin:
            pd.lookup_ids[IdType.GTIN] = gtin
        if isrc:
            pd.lookup_ids[IdType.ISRC] = isrc
        return pd
support additional douban url format 2023-07-19 11:12:58 -04:00			`import dateparser`

new data model: douban music 2022-12-08 18:08:05 +00:00			`from catalog.common import *`
			`from catalog.models import *`
fix discogs url regex; normalize upc/gtin across douban/spotify/discogs 2023-02-03 16:33:58 -05:00			`from catalog.music.utils import upc_to_gtin_13`
supports localized title 2024-07-13 00:16:47 -04:00			`from common.models.lang import detect_language`
new data model: douban music 2022-12-08 18:08:05 +00:00
fix some lint issues 2025-01-04 11:23:07 -05:00			`from .douban import DoubanDownloader, DoubanSearcher`
new data model: douban music 2022-12-08 18:08:05 +00:00

new data model: /book/<uid> 2022-12-15 17:29:35 -05:00			`@SiteManager.register`
new data model: douban music 2022-12-08 18:08:05 +00:00			`class DoubanMusic(AbstractSite):`
new data model: view detail page 2022-12-16 01:08:10 -05:00			`SITE_NAME = SiteName.Douban`
new data model: douban music 2022-12-08 18:08:05 +00:00			`ID_TYPE = IdType.DoubanMusic`
reformat new code with black 2022-12-29 23:57:02 -05:00			`URL_PATTERNS = [`
			`r"\w+://music\.douban\.com/subject/(\d+)/{0,1}",`
			`r"\w+://m.douban.com/music/subject/(\d+)/{0,1}",`
support additional douban url format 2023-07-19 11:12:58 -04:00			`r"\w+://www.douban.com/doubanapp/dispatch\?uri=/music/(\d+)/",`
update new link format from douban app 2024-10-13 17:35:36 -04:00			`r"\w+://www.douban.com/doubanapp/dispatch/music/(\d+)",`
reformat new code with black 2022-12-29 23:57:02 -05:00			`]`
			`WIKI_PROPERTY_ID = ""`
new data model: douban music 2022-12-08 18:08:05 +00:00			`DEFAULT_MODEL = Album`

			`@classmethod`
fix discogs url regex; normalize upc/gtin across douban/spotify/discogs 2023-02-03 16:33:58 -05:00			`def id_to_url(cls, id_value):`
new data model: douban music 2022-12-08 18:08:05 +00:00			`return "https://music.douban.com/subject/" + id_value + "/"`

fix some lint issues 2025-01-04 11:23:07 -05:00			`@classmethod`
			`def search(cls, q: str, p: int = 1):`
			`return DoubanSearcher.search(ItemCategory.Music, "music", q, p)`

new data model: douban music 2022-12-08 18:08:05 +00:00			`def scrape(self):`
			`content = DoubanDownloader(self.url).download().html()`

fix some lint issues 2025-01-04 11:23:07 -05:00			`elem = self.query_list(content, "//h1/span/text()")`
new data model: douban music 2022-12-08 18:08:05 +00:00			`title = elem[0].strip() if len(elem) else None`
			`if not title:`
			`raise ParseError(self, "title")`

fix some lint issues 2025-01-04 11:23:07 -05:00			`artists_elem = self.query_list(`
			`content, "//div[@id='info']/span/span[@class='pl']/a/text()"`
reformat new code with black 2022-12-29 23:57:02 -05:00			`)`
			`artist = (`
			`None if not artists_elem else list(map(lambda a: a[:200], artists_elem))`
			`)`
new data model: douban music 2022-12-08 18:08:05 +00:00
fix some lint issues 2025-01-04 11:23:07 -05:00			`genre_elem = self.query_list(`
			`content, "//div[@id='info']//span[text()='流派:']/following::text()[1]"`
reformat new code with black 2022-12-29 23:57:02 -05:00			`)`
make other_title and genre list[str] and compatible with legacy data 2023-04-17 21:43:20 -04:00			`genre = genre_elem[0].strip().split(" / ") if genre_elem else []`
new data model: douban music 2022-12-08 18:08:05 +00:00
fix some lint issues 2025-01-04 11:23:07 -05:00			`date_elem = self.query_list(`
			`content, "//div[@id='info']//span[text()='发行时间:']/following::text()[1]"`
reformat new code with black 2022-12-29 23:57:02 -05:00			`)`
minor scraper errors 2023-11-28 22:59:07 -05:00			`release_date = dateparser.parse(date_elem[0].strip()) if date_elem else None`
			`release_date = release_date.strftime("%Y-%m-%d") if release_date else None`
new data model: douban music 2022-12-08 18:08:05 +00:00
fix some lint issues 2025-01-04 11:23:07 -05:00			`company_elem = self.query_list(`
			`content, "//div[@id='info']//span[text()='出版者:']/following::text()[1]"`
reformat new code with black 2022-12-29 23:57:02 -05:00			`)`
new data model: douban music 2022-12-08 18:08:05 +00:00			`company = company_elem[0].strip() if company_elem else None`

fix some lint issues 2025-01-04 11:23:07 -05:00			`track_list_elem = self.query_list(`
			`content, "//div[@class='track-list']/div[@class='indent']/div/text()"`
new data model: douban music 2022-12-08 18:08:05 +00:00			`)`
			`if track_list_elem:`
reformat new code with black 2022-12-29 23:57:02 -05:00			`track_list = "\n".join([track.strip() for track in track_list_elem])`
new data model: douban music 2022-12-08 18:08:05 +00:00			`else:`
			`track_list = None`

fix some lint issues 2025-01-04 11:23:07 -05:00			`brief_elem = self.query_list(content, "//span[@class='all hidden']")`
new data model: douban music 2022-12-08 18:08:05 +00:00			`if not brief_elem:`
fix some lint issues 2025-01-04 11:23:07 -05:00			`brief_elem = self.query_list(content, "//span[@property='v:summary']")`
reformat new code with black 2022-12-29 23:57:02 -05:00			`brief = (`
			`"\n".join([e.strip() for e in brief_elem[0].xpath("./text()")])`
			`if brief_elem`
			`else None`
			`)`
new data model: douban music 2022-12-08 18:08:05 +00:00
fix some lint issues 2025-01-04 11:23:07 -05:00			`img_url_elem = self.query_list(content, "//div[@id='mainpic']//img/@src")`
new data model: view detail page 2022-12-16 01:08:10 -05:00			`img_url = img_url_elem[0].strip() if img_url_elem else None`
fix some lint issues 2025-01-04 11:23:07 -05:00			`other_elem = self.query_list(`
			`content,`
			`"//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]",`
supports localized title 2024-07-13 00:16:47 -04:00			`)`
			`other_title = other_elem[0].strip().split(" / ") if other_elem else []`
			`lang = detect_language(f"{title} {brief}")`
			`localized_title = [{"lang": lang, "text": title}]`
			`localized_title += [`
			`{"lang": detect_language(t), "text": t} for t in other_title`
			`]`
new data model: view detail page 2022-12-16 01:08:10 -05:00			`data = {`
reformat new code with black 2022-12-29 23:57:02 -05:00			`"title": title,`
supports localized title 2024-07-13 00:16:47 -04:00			`"localized_title": localized_title,`
more data checks in scrapers 2024-07-16 00:51:05 -04:00			`"localized_description": [{"lang": lang, "text": brief}] if brief else [],`
reformat new code with black 2022-12-29 23:57:02 -05:00			`"artist": artist,`
			`"genre": genre,`
			`"release_date": release_date,`
			`"duration": None,`
letterboxd import ui 2024-01-10 22:20:57 -05:00			`"company": [company] if company else [],`
reformat new code with black 2022-12-29 23:57:02 -05:00			`"track_list": track_list,`
			`"brief": brief,`
			`"cover_image_url": img_url,`
new data model: view detail page 2022-12-16 01:08:10 -05:00			`}`
new data model: douban music 2022-12-08 18:08:05 +00:00			`gtin = None`
			`isrc = None`
fix some lint issues 2025-01-04 11:23:07 -05:00			`other_elem = self.query_list(`
			`content,`
			`"//div[@id='info']//span[text()='专辑类型:']/following-sibling::text()[1]",`
reformat new code with black 2022-12-29 23:57:02 -05:00			`)`
new data model: douban music 2022-12-08 18:08:05 +00:00			`if other_elem:`
reformat new code with black 2022-12-29 23:57:02 -05:00			`data["album_type"] = other_elem[0].strip()`
fix some lint issues 2025-01-04 11:23:07 -05:00			`other_elem = self.query_list(`
			`content,`
			`"//div[@id='info']//span[text()='介质:']/following-sibling::text()[1]",`
reformat new code with black 2022-12-29 23:57:02 -05:00			`)`
new data model: douban music 2022-12-08 18:08:05 +00:00			`if other_elem:`
reformat new code with black 2022-12-29 23:57:02 -05:00			`data["media"] = other_elem[0].strip()`
fix some lint issues 2025-01-04 11:23:07 -05:00			`other_elem = self.query_list(`
			`content,`
			`"//div[@id='info']//span[text()='ISRC:']/following-sibling::text()[1]",`
reformat new code with black 2022-12-29 23:57:02 -05:00			`)`
new data model: douban music 2022-12-08 18:08:05 +00:00			`if other_elem:`
			`isrc = other_elem[0].strip()`
fix some lint issues 2025-01-04 11:23:07 -05:00			`other_elem = self.query_list(`
			`content,`
			`"//div[@id='info']//span[text()='条形码:']/following-sibling::text()[1]",`
reformat new code with black 2022-12-29 23:57:02 -05:00			`)`
new data model: douban music 2022-12-08 18:08:05 +00:00			`if other_elem:`
fix discogs url regex; normalize upc/gtin across douban/spotify/discogs 2023-02-03 16:33:58 -05:00			`gtin = upc_to_gtin_13(other_elem[0].strip())`
fix some lint issues 2025-01-04 11:23:07 -05:00			`other_elem = self.query_list(`
			`content,`
			`"//div[@id='info']//span[text()='碟片数:']/following-sibling::text()[1]",`
reformat new code with black 2022-12-29 23:57:02 -05:00			`)`
new data model: douban music 2022-12-08 18:08:05 +00:00			`if other_elem:`
reformat new code with black 2022-12-29 23:57:02 -05:00			`data["disc_count"] = other_elem[0].strip()`
new data model: douban music 2022-12-08 18:08:05 +00:00
new data model: view detail page 2022-12-16 01:08:10 -05:00			`pd = ResourceContent(metadata=data)`
new data model: douban music 2022-12-08 18:08:05 +00:00			`if gtin:`
			`pd.lookup_ids[IdType.GTIN] = gtin`
			`if isrc:`
			`pd.lookup_ids[IdType.ISRC] = isrc`
			`return pd`