new data model: douban music

This commit is contained in:
Your Name 2022-12-08 18:08:05 +00:00
parent e389fc302d
commit e5b958755c
8 changed files with 1278 additions and 6 deletions

View file

@ -166,7 +166,12 @@ class Item(PolymorphicModel):
@classmethod
def get_best_lookup_id(cls, lookup_ids):
    """Return the most canonical (id_type, id_value) pair available.

    Walks a preference-ordered list of id types and returns the first one
    that has a truthy value in lookup_ids; returns None implicitly when
    none of the preferred types are present.
    """
    # duplicate/stale assignment of best_id_types removed; keep only the
    # expanded preference list (music ids added between ASIN and Feed)
    best_id_types = [
        IdType.ISBN, IdType.CUBN, IdType.ASIN,
        IdType.GTIN, IdType.ISRC, IdType.MusicBrainz,
        IdType.Feed,
        IdType.IMDB, IdType.TMDB_TVSeason,
    ]
    for t in best_id_types:
        if lookup_ids.get(t):
            return t, lookup_ids[t]

View file

@ -1,3 +1,11 @@
"""
Site and SiteList
a Site should inherit from AbstractSite
a Site should map to a unique set of url patterns.
a Site may scrape a url and store result in ResourceContent
ResourceContent persists as an ExternalResource which may link to an Item
"""
from typing import *
import re
from .models import ExternalResource

View file

@ -25,3 +25,37 @@ class SpotifyTestCase(TestCase):
self.assertEqual(site.resource.metadata['title'], 'The Race For Space')
self.assertIsInstance(site.resource.item, Album)
self.assertEqual(site.resource.item.barcode, '3610159662676')
class DoubanMusicTestCase(TestCase):
    """Tests for the DoubanMusic site adapter: url parsing and scraping."""

    def test_parse(self):
        url = 'https://music.douban.com/subject/33551231/'
        expected_id_type = IdType.DoubanMusic
        expected_id_value = '33551231'
        # lookup by id type must yield a registered site class
        site = SiteList.get_site_by_id_type(expected_id_type)
        self.assertIsNotNone(site)
        self.assertEqual(site.validate_url(url), True)
        # lookup by url must round-trip both url and extracted id
        site = SiteList.get_site_by_url(url)
        self.assertEqual(site.url, url)
        self.assertEqual(site.id_value, expected_id_value)

    @use_local_response
    def test_scrape(self):
        url = 'https://music.douban.com/subject/33551231/'
        site = SiteList.get_site_by_url(url)
        self.assertEqual(site.ready, False)
        site.get_resource_ready()
        self.assertEqual(site.ready, True)
        resource = site.resource
        self.assertEqual(resource.metadata['title'], 'The Race For Space')
        self.assertIsInstance(resource.item, Album)
        self.assertEqual(resource.item.barcode, '3610159662676')
class MultiMusicSitesTestCase(TestCase):
    """Verify the same album scraped from two sites resolves to one item."""

    @use_local_response
    def test_albums(self):
        douban_url = 'https://music.douban.com/subject/33551231/'
        spotify_url = 'https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP'
        douban_res = SiteList.get_site_by_url(douban_url).get_resource_ready()
        spotify_res = SiteList.get_site_by_url(spotify_url).get_resource_ready()
        # both resources should have been merged into the same catalog item
        self.assertEqual(douban_res.item.id, spotify_res.item.id)

View file

@ -2,6 +2,7 @@ from ..common.sites import SiteList
from .apple_podcast import ApplePodcast
from .douban_book import DoubanBook
from .douban_movie import DoubanMovie
from .douban_music import DoubanMusic
from .douban_drama import DoubanDrama
from .goodreads import Goodreads
from .tmdb import TMDB_Movie

View file

@ -0,0 +1,115 @@
from catalog.common import *
from catalog.models import *
from .douban import DoubanDownloader
import dateparser
import logging
_logger = logging.getLogger(__name__)
@SiteList.register
class DoubanMusic(AbstractSite):
    """Site adapter scraping album pages from music.douban.com."""

    ID_TYPE = IdType.DoubanMusic
    # FIX: dots in the mobile host were unescaped and would match any char;
    # trailing slash is optional on both forms.
    URL_PATTERNS = [
        r"\w+://music\.douban\.com/subject/(\d+)/?",
        r"\w+://m\.douban\.com/music/subject/(\d+)/?",
    ]
    WIKI_PROPERTY_ID = ''
    DEFAULT_MODEL = Album

    @classmethod
    def id_to_url(cls, id_value):
        """Build the canonical desktop url for a Douban music subject id."""
        return "https://music.douban.com/subject/" + id_value + "/"

    def scrape(self):
        """Download and parse the subject page into a ResourceContent.

        Returns:
            ResourceContent with album metadata, GTIN/ISRC lookup ids when
            present, and the cover image bytes if downloadable.

        Raises:
            ParseError: when the page has no parsable title.
        """
        content = DoubanDownloader(self.url).download().html()

        elem = content.xpath("//h1/span/text()")
        title = elem[0].strip() if len(elem) else None
        if not title:
            raise ParseError(self, "title")

        artists_elem = content.xpath(
            "//div[@id='info']/span/span[@class='pl']/a/text()")
        # truncate artist names to 200 chars to fit the model field
        artist = None if not artists_elem else [a[:200] for a in artists_elem]

        genre_elem = content.xpath(
            "//div[@id='info']//span[text()='流派:']/following::text()[1]")
        genre = genre_elem[0].strip() if genre_elem else None

        release_date = None
        date_elem = content.xpath(
            "//div[@id='info']//span[text()='发行时间:']/following::text()[1]")
        if date_elem:
            # FIX: dateparser.parse returns None on unparsable input;
            # the old code crashed with AttributeError on .strftime
            dt = dateparser.parse(date_elem[0].strip())
            release_date = dt.strftime('%Y-%m-%d') if dt else None

        company_elem = content.xpath(
            "//div[@id='info']//span[text()='出版者:']/following::text()[1]")
        company = company_elem[0].strip() if company_elem else None

        track_list_elem = content.xpath(
            "//div[@class='track-list']/div[@class='indent']/div/text()")
        if track_list_elem:
            track_list = '\n'.join([t.strip() for t in track_list_elem])
        else:
            track_list = None

        brief_elem = content.xpath("//span[@class='all hidden']")
        if not brief_elem:
            brief_elem = content.xpath("//span[@property='v:summary']")
        brief = '\n'.join([e.strip() for e in brief_elem[0].xpath(
            './text()')]) if brief_elem else None

        def info_value(label):
            # text node immediately following the '<span class="pl">label</span>'
            # row in the info box; None when the row is absent
            v = content.xpath(
                f"//div[@id='info']//span[text()='{label}']"
                "/following-sibling::text()[1]")
            return v[0].strip() if v else None

        # collect the remaining labelled rows; keys drop the trailing colon
        other_info = {}
        for label in ['又名:', '专辑类型:', '介质:', 'ISRC:', '条形码:', '碟片数:']:
            value = info_value(label)
            if value:
                other_info[label[:-1]] = value
        isrc = other_info.get('ISRC')
        gtin = other_info.get('条形码')

        img_url_elem = content.xpath("//div[@id='mainpic']//img/@src")
        img_url = img_url_elem[0].strip() if img_url_elem else None

        pd = ResourceContent(metadata={
            'title': title,
            'artist': artist,
            'genre': genre,
            'release_date': release_date,
            'duration': None,
            'company': company,
            'track_list': track_list,
            'brief': brief,
            'other_info': other_info,
            'cover_image_url': img_url
        })
        if gtin:
            pd.lookup_ids[IdType.GTIN] = gtin
        if isrc:
            pd.lookup_ids[IdType.ISRC] = isrc
        if pd.metadata["cover_image_url"]:
            imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
            try:
                pd.cover_image = imgdl.download().content
                pd.cover_image_extention = imgdl.extention
            except Exception:
                # best-effort: a missing cover must not fail the whole scrape
                _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
        return pd

View file

@ -1,4 +1,3 @@
import re
from catalog.book.models import Edition, Work
from catalog.common import *
from lxml import html

View file

@ -69,10 +69,9 @@ class Spotify(AbstractSite):
gtin = res_data['external_ids'].get('upc')
if res_data['external_ids'].get('ean'):
gtin = res_data['external_ids'].get('ean')
# isrc = None
# if res_data['external_ids'].get('isrc'):
# isrc = res_data['external_ids'].get('isrc')
# _logger.error('isrc for album? this should not happen')
isrc = None
if res_data['external_ids'].get('isrc'):
isrc = res_data['external_ids'].get('isrc')
pd = ResourceContent(metadata={
'title': title,
@ -87,6 +86,8 @@ class Spotify(AbstractSite):
})
if gtin:
pd.lookup_ids[IdType.GTIN] = gtin
if isrc:
pd.lookup_ids[IdType.ISRC] = isrc
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:

File diff suppressed because it is too large Load diff