lib.itmens/catalog/sites/douban_music.py

from catalog.common import *
from catalog.models import *
from .douban import DoubanDownloader
import dateparser
import logging


_logger = logging.getLogger(__name__)


@SiteList.register
class DoubanMusic(AbstractSite):
    ID_TYPE = IdType.DoubanMusic
    URL_PATTERNS = [r"\w+://music\.douban\.com/subject/(\d+)/{0,1}", r"\w+://m.douban.com/music/subject/(\d+)/{0,1}"]
    WIKI_PROPERTY_ID = ''
    DEFAULT_MODEL = Album

    @classmethod
    def id_to_url(self, id_value):
        return "https://music.douban.com/subject/" + id_value + "/"

    def scrape(self):
        content = DoubanDownloader(self.url).download().html()

        elem = content.xpath("//h1/span/text()")
        title = elem[0].strip() if len(elem) else None
        if not title:
            raise ParseError(self, "title")

        artists_elem = content.xpath("//div[@id='info']/span/span[@class='pl']/a/text()")
        artist = None if not artists_elem else list(map(lambda a: a[:200], artists_elem))

        genre_elem = content.xpath(
            "//div[@id='info']//span[text()='流派:']/following::text()[1]")
        genre = genre_elem[0].strip() if genre_elem else None

        date_elem = content.xpath(
            "//div[@id='info']//span[text()='发行时间:']/following::text()[1]")
        release_date = dateparser.parse(date_elem[0].strip()).strftime('%Y-%m-%d') if date_elem else None

        company_elem = content.xpath(
            "//div[@id='info']//span[text()='出版者:']/following::text()[1]")
        company = company_elem[0].strip() if company_elem else None

        track_list_elem = content.xpath(
            "//div[@class='track-list']/div[@class='indent']/div/text()"
        )
        if track_list_elem:
            track_list = '\n'.join([track.strip() for track in track_list_elem])
        else:
            track_list = None

        brief_elem = content.xpath("//span[@class='all hidden']")
        if not brief_elem:
            brief_elem = content.xpath("//span[@property='v:summary']")
        brief = '\n'.join([e.strip() for e in brief_elem[0].xpath(
            './text()')]) if brief_elem else None

        gtin = None
        isrc = None
        other_info = {}
        other_elem = content.xpath(
            "//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]")
        if other_elem:
            other_info['又名'] = other_elem[0].strip()
        other_elem = content.xpath(
            "//div[@id='info']//span[text()='专辑类型:']/following-sibling::text()[1]")
        if other_elem:
            other_info['专辑类型'] = other_elem[0].strip()
        other_elem = content.xpath(
            "//div[@id='info']//span[text()='介质:']/following-sibling::text()[1]")
        if other_elem:
            other_info['介质'] = other_elem[0].strip()
        other_elem = content.xpath(
            "//div[@id='info']//span[text()='ISRC:']/following-sibling::text()[1]")
        if other_elem:
            other_info['ISRC'] = other_elem[0].strip()
            isrc = other_elem[0].strip()
        other_elem = content.xpath(
            "//div[@id='info']//span[text()='条形码:']/following-sibling::text()[1]")
        if other_elem:
            other_info['条形码'] = other_elem[0].strip()
            gtin = other_elem[0].strip()
        other_elem = content.xpath(
            "//div[@id='info']//span[text()='碟片数:']/following-sibling::text()[1]")
        if other_elem:
            other_info['碟片数'] = other_elem[0].strip()

        img_url_elem = content.xpath("//div[@id='mainpic']//img/@src")
        img_url = img_url_elem[0].strip() if img_url_elem else None

        pd = ResourceContent(metadata={
            'title': title,
            'artist': artist,
            'genre': genre,
            'release_date': release_date,
            'duration': None,
            'company': company,
            'track_list': track_list,
            'brief': brief,
            'other_info': other_info,
            'cover_image_url': img_url
        })
        if gtin:
            pd.lookup_ids[IdType.GTIN] = gtin
        if isrc:
            pd.lookup_ids[IdType.ISRC] = isrc
        if pd.metadata["cover_image_url"]:
            imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
            try:
                pd.cover_image = imgdl.download().content
                pd.cover_image_extention = imgdl.extention
            except Exception:
                _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
        return pd
new data model: douban music 2022-12-08 18:08:05 +00:00			`from catalog.common import *`
			`from catalog.models import *`
			`from .douban import DoubanDownloader`
			`import dateparser`
			`import logging`


			`_logger = logging.getLogger(__name__)`


			`@SiteList.register`
			`class DoubanMusic(AbstractSite):`
			`ID_TYPE = IdType.DoubanMusic`
			`URL_PATTERNS = [r"\w+://music\.douban\.com/subject/(\d+)/{0,1}", r"\w+://m.douban.com/music/subject/(\d+)/{0,1}"]`
			`WIKI_PROPERTY_ID = ''`
			`DEFAULT_MODEL = Album`

			`@classmethod`
			`def id_to_url(self, id_value):`
			`return "https://music.douban.com/subject/" + id_value + "/"`

			`def scrape(self):`
			`content = DoubanDownloader(self.url).download().html()`

			`elem = content.xpath("//h1/span/text()")`
			`title = elem[0].strip() if len(elem) else None`
			`if not title:`
			`raise ParseError(self, "title")`

			`artists_elem = content.xpath("//div[@id='info']/span/span[@class='pl']/a/text()")`
			`artist = None if not artists_elem else list(map(lambda a: a[:200], artists_elem))`

			`genre_elem = content.xpath(`
			`"//div[@id='info']//span[text()='流派:']/following::text()[1]")`
			`genre = genre_elem[0].strip() if genre_elem else None`

			`date_elem = content.xpath(`
			`"//div[@id='info']//span[text()='发行时间:']/following::text()[1]")`
			`release_date = dateparser.parse(date_elem[0].strip()).strftime('%Y-%m-%d') if date_elem else None`

			`company_elem = content.xpath(`
			`"//div[@id='info']//span[text()='出版者:']/following::text()[1]")`
			`company = company_elem[0].strip() if company_elem else None`

			`track_list_elem = content.xpath(`
			`"//div[@class='track-list']/div[@class='indent']/div/text()"`
			`)`
			`if track_list_elem:`
			`track_list = '\n'.join([track.strip() for track in track_list_elem])`
			`else:`
			`track_list = None`

			`brief_elem = content.xpath("//span[@class='all hidden']")`
			`if not brief_elem:`
			`brief_elem = content.xpath("//span[@property='v:summary']")`
			`brief = '\n'.join([e.strip() for e in brief_elem[0].xpath(`
			`'./text()')]) if brief_elem else None`

			`gtin = None`
			`isrc = None`
			`other_info = {}`
			`other_elem = content.xpath(`
			`"//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]")`
			`if other_elem:`
			`other_info['又名'] = other_elem[0].strip()`
			`other_elem = content.xpath(`
			`"//div[@id='info']//span[text()='专辑类型:']/following-sibling::text()[1]")`
			`if other_elem:`
			`other_info['专辑类型'] = other_elem[0].strip()`
			`other_elem = content.xpath(`
			`"//div[@id='info']//span[text()='介质:']/following-sibling::text()[1]")`
			`if other_elem:`
			`other_info['介质'] = other_elem[0].strip()`
			`other_elem = content.xpath(`
			`"//div[@id='info']//span[text()='ISRC:']/following-sibling::text()[1]")`
			`if other_elem:`
			`other_info['ISRC'] = other_elem[0].strip()`
			`isrc = other_elem[0].strip()`
			`other_elem = content.xpath(`
			`"//div[@id='info']//span[text()='条形码:']/following-sibling::text()[1]")`
			`if other_elem:`
			`other_info['条形码'] = other_elem[0].strip()`
			`gtin = other_elem[0].strip()`
			`other_elem = content.xpath(`
			`"//div[@id='info']//span[text()='碟片数:']/following-sibling::text()[1]")`
			`if other_elem:`
			`other_info['碟片数'] = other_elem[0].strip()`

			`img_url_elem = content.xpath("//div[@id='mainpic']//img/@src")`
			`img_url = img_url_elem[0].strip() if img_url_elem else None`

			`pd = ResourceContent(metadata={`
			`'title': title,`
			`'artist': artist,`
			`'genre': genre,`
			`'release_date': release_date,`
			`'duration': None,`
			`'company': company,`
			`'track_list': track_list,`
			`'brief': brief,`
			`'other_info': other_info,`
			`'cover_image_url': img_url`
			`})`
			`if gtin:`
			`pd.lookup_ids[IdType.GTIN] = gtin`
			`if isrc:`
			`pd.lookup_ids[IdType.ISRC] = isrc`
			`if pd.metadata["cover_image_url"]:`
			`imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)`
			`try:`
			`pd.cover_image = imgdl.download().content`
			`pd.cover_image_extention = imgdl.extention`
			`except Exception:`
			`_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')`
			`return pd`