diff --git a/catalog/common/models.py b/catalog/common/models.py index 511c3c65..691d317a 100644 --- a/catalog/common/models.py +++ b/catalog/common/models.py @@ -166,7 +166,12 @@ class Item(PolymorphicModel): @classmethod def get_best_lookup_id(cls, lookup_ids): """ get best available lookup id, ideally commonly used """ - best_id_types = [IdType.ISBN, IdType.CUBN, IdType.ASIN, IdType.IMDB, IdType.Feed, IdType.TMDB_TVSeason] + best_id_types = [ + IdType.ISBN, IdType.CUBN, IdType.ASIN, + IdType.GTIN, IdType.ISRC, IdType.MusicBrainz, + IdType.Feed, + IdType.IMDB, IdType.TMDB_TVSeason + ] for t in best_id_types: if lookup_ids.get(t): return t, lookup_ids[t] diff --git a/catalog/common/sites.py b/catalog/common/sites.py index 7acbb5c1..8c959158 100644 --- a/catalog/common/sites.py +++ b/catalog/common/sites.py @@ -1,3 +1,11 @@ +""" +Site and SiteList + +Site should inherite from AbstractSite +a Site should map to a unique set of url patterns. +a Site may scrape a url and store result in ResourceContent +ResourceContent persists as an ExternalResource which may link to an Item +""" from typing import * import re from .models import ExternalResource diff --git a/catalog/music/tests.py b/catalog/music/tests.py index 3354b8d9..d035382d 100644 --- a/catalog/music/tests.py +++ b/catalog/music/tests.py @@ -25,3 +25,37 @@ class SpotifyTestCase(TestCase): self.assertEqual(site.resource.metadata['title'], 'The Race For Space') self.assertIsInstance(site.resource.item, Album) self.assertEqual(site.resource.item.barcode, '3610159662676') + + +class DoubanMusicTestCase(TestCase): + def test_parse(self): + t_id_type = IdType.DoubanMusic + t_id_value = '33551231' + t_url = 'https://music.douban.com/subject/33551231/' + site = SiteList.get_site_by_id_type(t_id_type) + self.assertIsNotNone(site) + self.assertEqual(site.validate_url(t_url), True) + site = SiteList.get_site_by_url(t_url) + self.assertEqual(site.url, t_url) + self.assertEqual(site.id_value, t_id_value) + + @use_local_response + def test_scrape(self): + t_url = 'https://music.douban.com/subject/33551231/' + site = SiteList.get_site_by_url(t_url) + self.assertEqual(site.ready, False) + site.get_resource_ready() + self.assertEqual(site.ready, True) + self.assertEqual(site.resource.metadata['title'], 'The Race For Space') + self.assertIsInstance(site.resource.item, Album) + self.assertEqual(site.resource.item.barcode, '3610159662676') + + +class MultiMusicSitesTestCase(TestCase): + @use_local_response + def test_albums(self): + url1 = 'https://music.douban.com/subject/33551231/' + url2 = 'https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP' + p1 = SiteList.get_site_by_url(url1).get_resource_ready() + p2 = SiteList.get_site_by_url(url2).get_resource_ready() + self.assertEqual(p1.item.id, p2.item.id) diff --git a/catalog/sites/__init__.py b/catalog/sites/__init__.py index 1ca1ed46..0ba885d1 100644 --- a/catalog/sites/__init__.py +++ b/catalog/sites/__init__.py @@ -2,6 +2,7 @@ from ..common.sites import SiteList from .apple_podcast import ApplePodcast from .douban_book import DoubanBook from .douban_movie import DoubanMovie +from .douban_music import DoubanMusic from .douban_drama import DoubanDrama from .goodreads import Goodreads from .tmdb import TMDB_Movie diff --git a/catalog/sites/douban_music.py b/catalog/sites/douban_music.py new file mode 100644 index 00000000..1aa157f2 --- /dev/null +++ b/catalog/sites/douban_music.py @@ -0,0 +1,115 @@ +from catalog.common import * +from catalog.models import * +from .douban import DoubanDownloader +import dateparser +import logging + + +_logger = logging.getLogger(__name__) + + +@SiteList.register +class DoubanMusic(AbstractSite): + ID_TYPE = IdType.DoubanMusic + URL_PATTERNS = [r"\w+://music\.douban\.com/subject/(\d+)/{0,1}", r"\w+://m.douban.com/music/subject/(\d+)/{0,1}"] + WIKI_PROPERTY_ID = '' + DEFAULT_MODEL = Album + + @classmethod + def id_to_url(self, id_value): + return "https://music.douban.com/subject/" + id_value + "/" + + def scrape(self): + content = DoubanDownloader(self.url).download().html() + + elem = content.xpath("//h1/span/text()") + title = elem[0].strip() if len(elem) else None + if not title: + raise ParseError(self, "title") + + artists_elem = content.xpath("//div[@id='info']/span/span[@class='pl']/a/text()") + artist = None if not artists_elem else list(map(lambda a: a[:200], artists_elem)) + + genre_elem = content.xpath( + "//div[@id='info']//span[text()='流派:']/following::text()[1]") + genre = genre_elem[0].strip() if genre_elem else None + + date_elem = content.xpath( + "//div[@id='info']//span[text()='发行时间:']/following::text()[1]") + release_date = dateparser.parse(date_elem[0].strip()).strftime('%Y-%m-%d') if date_elem else None + + company_elem = content.xpath( + "//div[@id='info']//span[text()='出版者:']/following::text()[1]") + company = company_elem[0].strip() if company_elem else None + + track_list_elem = content.xpath( + "//div[@class='track-list']/div[@class='indent']/div/text()" + ) + if track_list_elem: + track_list = '\n'.join([track.strip() for track in track_list_elem]) + else: + track_list = None + + brief_elem = content.xpath("//span[@class='all hidden']") + if not brief_elem: + brief_elem = content.xpath("//span[@property='v:summary']") + brief = '\n'.join([e.strip() for e in brief_elem[0].xpath( + './text()')]) if brief_elem else None + + gtin = None + isrc = None + other_info = {} + other_elem = content.xpath( + "//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]") + if other_elem: + other_info['又名'] = other_elem[0].strip() + other_elem = content.xpath( + "//div[@id='info']//span[text()='专辑类型:']/following-sibling::text()[1]") + if other_elem: + other_info['专辑类型'] = other_elem[0].strip() + other_elem = content.xpath( + "//div[@id='info']//span[text()='介质:']/following-sibling::text()[1]") + if other_elem: + other_info['介质'] = other_elem[0].strip() + other_elem = content.xpath( + "//div[@id='info']//span[text()='ISRC:']/following-sibling::text()[1]") + if other_elem: + other_info['ISRC'] = other_elem[0].strip() + isrc = other_elem[0].strip() + other_elem = content.xpath( + "//div[@id='info']//span[text()='条形码:']/following-sibling::text()[1]") + if other_elem: + other_info['条形码'] = other_elem[0].strip() + gtin = other_elem[0].strip() + other_elem = content.xpath( + "//div[@id='info']//span[text()='碟片数:']/following-sibling::text()[1]") + if other_elem: + other_info['碟片数'] = other_elem[0].strip() + + img_url_elem = content.xpath("//div[@id='mainpic']//img/@src") + img_url = img_url_elem[0].strip() if img_url_elem else None + + pd = ResourceContent(metadata={ + 'title': title, + 'artist': artist, + 'genre': genre, + 'release_date': release_date, + 'duration': None, + 'company': company, + 'track_list': track_list, + 'brief': brief, + 'other_info': other_info, + 'cover_image_url': img_url + }) + if gtin: + pd.lookup_ids[IdType.GTIN] = gtin + if isrc: + pd.lookup_ids[IdType.ISRC] = isrc + if pd.metadata["cover_image_url"]: + imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url) + try: + pd.cover_image = imgdl.download().content + pd.cover_image_extention = imgdl.extention + except Exception: + _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}') + return pd diff --git a/catalog/sites/goodreads.py b/catalog/sites/goodreads.py index 6584be92..be3d4c26 100644 --- a/catalog/sites/goodreads.py +++ b/catalog/sites/goodreads.py @@ -1,4 +1,3 @@ -import re from catalog.book.models import Edition, Work from catalog.common import * from lxml import html diff --git a/catalog/sites/spotify.py b/catalog/sites/spotify.py index 75914281..134ce47e 100644 --- a/catalog/sites/spotify.py +++ b/catalog/sites/spotify.py @@ -69,10 +69,9 @@ class Spotify(AbstractSite): gtin = res_data['external_ids'].get('upc') if res_data['external_ids'].get('ean'): gtin = res_data['external_ids'].get('ean') - # isrc = None - # if res_data['external_ids'].get('isrc'): - # isrc = res_data['external_ids'].get('isrc') - # _logger.error('isrc for album? this should not happen') + isrc = None + if res_data['external_ids'].get('isrc'): + isrc = res_data['external_ids'].get('isrc') pd = ResourceContent(metadata={ 'title': title, @@ -87,6 +86,8 @@ class Spotify(AbstractSite): }) if gtin: pd.lookup_ids[IdType.GTIN] = gtin + if isrc: + pd.lookup_ids[IdType.ISRC] = isrc if pd.metadata["cover_image_url"]: imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url) try: diff --git a/test_data/https___music_douban_com_subject_33551231_ b/test_data/https___music_douban_com_subject_33551231_ new file mode 100644 index 00000000..8e13b9de --- /dev/null +++ b/test_data/https___music_douban_com_subject_33551231_ @@ -0,0 +1,1109 @@ + + + + + + + + + The Race For Space (豆瓣) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + +

+ The Race For Space +
+

+ +
+ +
+ + +
+ + + +
+
+ + + +
+
+ + + The Race For Space + + +
+ + +
+ +
+ + + + + + 表演者: + + Public Service Broadcasting + + + +
+ + + + 流派: 爵士 +
+ + + + + 专辑类型: 专辑 +
+ + + + + 发行时间: 2015-02-23 +
+ + + + + 出版者: Believe Sas +
+ + + + + 条形码: 3610159662676 +
+ + +
+
+ + + + + + + +
+
+ +
+ + +
+
+
+ 暂无评分 +
+
+
+ +
+
+ + + + + +
+ + + + + +
+ + + + + +
+ + + 评价: + + + + + + + +
+ + + + + +
+ + + + + + + + +
+ +
+ + + + + + + + + +
+ + + + + +
+ + +
+
+ +
+ + + +
+
+ + + + + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + +

+ 谁听这张唱片? +

+
+ + + + + + +
+ Q-bits +
+
+ Q-bits +
+
8月13日听过
+
+
+
+
+
+ + +
+ 石斑鱼 +
+
+ 石斑鱼 +
+
2021年12月3日想听
+
+
+
+
+
+ + +
+ Icarus +
+
+ Icarus +
+
2021年12月3日听过
+
+
+
+
+
+ + +
+ Yamoas +
+
+ Yamoas +
+
2021年12月3日想听
+
+
+
+
+
+ + + + + + +

> + 4人听过 +

+ +

> + 2人想听 +

+
+ + + + + + + + + + + +
+ + + + + + + + + + + + + + +

订阅关于The Race For Space的评论:
+ feed: rss 2.0

+ + + +
+
+ + + + +
+ + + + + +
+
+
+ + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +