diff --git a/catalog/common/models.py b/catalog/common/models.py
index 511c3c65..691d317a 100644
--- a/catalog/common/models.py
+++ b/catalog/common/models.py
@@ -166,7 +166,12 @@ class Item(PolymorphicModel):
@classmethod
def get_best_lookup_id(cls, lookup_ids):
""" get best available lookup id, ideally commonly used """
- best_id_types = [IdType.ISBN, IdType.CUBN, IdType.ASIN, IdType.IMDB, IdType.Feed, IdType.TMDB_TVSeason]
+ best_id_types = [
+ IdType.ISBN, IdType.CUBN, IdType.ASIN,
+ IdType.GTIN, IdType.ISRC, IdType.MusicBrainz,
+ IdType.Feed,
+ IdType.IMDB, IdType.TMDB_TVSeason
+ ]
for t in best_id_types:
if lookup_ids.get(t):
return t, lookup_ids[t]
diff --git a/catalog/common/sites.py b/catalog/common/sites.py
index 7acbb5c1..8c959158 100644
--- a/catalog/common/sites.py
+++ b/catalog/common/sites.py
@@ -1,3 +1,11 @@
+"""
+Site and SiteList
+
+A Site should inherit from AbstractSite.
+A Site should map to a unique set of URL patterns.
+A Site may scrape a URL and store the result in a ResourceContent.
+A ResourceContent persists as an ExternalResource, which may link to an Item.
+"""
from typing import *
import re
from .models import ExternalResource
diff --git a/catalog/music/tests.py b/catalog/music/tests.py
index 3354b8d9..d035382d 100644
--- a/catalog/music/tests.py
+++ b/catalog/music/tests.py
@@ -25,3 +25,37 @@ class SpotifyTestCase(TestCase):
self.assertEqual(site.resource.metadata['title'], 'The Race For Space')
self.assertIsInstance(site.resource.item, Album)
self.assertEqual(site.resource.item.barcode, '3610159662676')
+
+
+class DoubanMusicTestCase(TestCase):
+    """URL recognition and scraping checks for the Douban Music site."""
+    def test_parse(self):
+        album_id = '33551231'
+        album_url = 'https://music.douban.com/subject/33551231/'
+        registered = SiteList.get_site_by_id_type(IdType.DoubanMusic)
+        self.assertIsNotNone(registered)
+        self.assertEqual(registered.validate_url(album_url), True)
+        matched = SiteList.get_site_by_url(album_url)
+        self.assertEqual(matched.url, album_url)
+        self.assertEqual(matched.id_value, album_id)
+
+    @use_local_response
+    def test_scrape(self):
+        # The site must report ready only after get_resource_ready() ran.
+        site = SiteList.get_site_by_url('https://music.douban.com/subject/33551231/')
+        self.assertEqual(site.ready, False)
+        site.get_resource_ready()
+        self.assertEqual(site.ready, True)
+        self.assertEqual(site.resource.metadata['title'], 'The Race For Space')
+        self.assertIsInstance(site.resource.item, Album)
+        self.assertEqual(site.resource.item.barcode, '3610159662676')
+
+
+class MultiMusicSitesTestCase(TestCase):
+    @use_local_response
+    def test_albums(self):
+        """Both album URLs must resolve to the same item id."""
+        # Resolve the Douban page first, then the Spotify page.
+        douban = SiteList.get_site_by_url('https://music.douban.com/subject/33551231/').get_resource_ready()
+        spotify = SiteList.get_site_by_url('https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP').get_resource_ready()
+        self.assertEqual(douban.item.id, spotify.item.id)
diff --git a/catalog/sites/__init__.py b/catalog/sites/__init__.py
index 1ca1ed46..0ba885d1 100644
--- a/catalog/sites/__init__.py
+++ b/catalog/sites/__init__.py
@@ -2,6 +2,7 @@ from ..common.sites import SiteList
from .apple_podcast import ApplePodcast
from .douban_book import DoubanBook
from .douban_movie import DoubanMovie
+from .douban_music import DoubanMusic
from .douban_drama import DoubanDrama
from .goodreads import Goodreads
from .tmdb import TMDB_Movie
diff --git a/catalog/sites/douban_music.py b/catalog/sites/douban_music.py
new file mode 100644
index 00000000..1aa157f2
--- /dev/null
+++ b/catalog/sites/douban_music.py
@@ -0,0 +1,115 @@
+from catalog.common import *
+from catalog.models import *
+from .douban import DoubanDownloader
+import dateparser
+import logging
+
+
+_logger = logging.getLogger(__name__)
+
+
+@SiteList.register
+class DoubanMusic(AbstractSite):
+    """Site adapter that scrapes album pages from Douban Music."""
+
+    ID_TYPE = IdType.DoubanMusic
+    # Album URLs on music.douban.com and m.douban.com. Dots in both host
+    # names are escaped so that they only match a literal '.'.
+    URL_PATTERNS = [
+        r"\w+://music\.douban\.com/subject/(\d+)/{0,1}",
+        r"\w+://m\.douban\.com/music/subject/(\d+)/{0,1}",
+    ]
+    WIKI_PROPERTY_ID = ''
+    DEFAULT_MODEL = Album
+    # Labels in the info box copied verbatim into metadata['other_info'].
+    _OTHER_INFO_LABELS = ('又名', '专辑类型', '介质', 'ISRC', '条形码', '碟片数')
+
+    @classmethod
+    def id_to_url(cls, id_value):
+        """Return the music.douban.com URL for the given subject id."""
+        return "https://music.douban.com/subject/" + id_value + "/"
+
+    def scrape(self):
+        """Download the album page and extract it into a ResourceContent.
+
+        Returns a ResourceContent whose metadata holds title, artist, genre,
+        release_date, duration (always None), company, track_list, brief,
+        other_info and cover_image_url. GTIN and ISRC lookup ids are set
+        when the page lists a barcode or an ISRC.
+
+        Raises ParseError when no title is found. A failed cover download
+        is logged at debug level and otherwise ignored.
+        """
+        content = DoubanDownloader(self.url).download().html()
+        # XPath template for the label <span> of one row in the info box.
+        label_span = "//div[@id='info']//span[text()='%s:']"
+
+        def first_text(xpath):
+            """Return the stripped first match of `xpath`, or None."""
+            found = content.xpath(xpath)
+            return found[0].strip() if found else None
+
+        title = first_text("//h1/span/text()")
+        if not title:
+            raise ParseError(self, "title")
+
+        artists_elem = content.xpath("//div[@id='info']/span/span[@class='pl']/a/text()")
+        # Each artist name is capped at 200 characters.
+        artist = [a[:200] for a in artists_elem] if artists_elem else None
+
+        genre = first_text(label_span % '流派' + "/following::text()[1]")
+        company = first_text(label_span % '出版者' + "/following::text()[1]")
+
+        date_text = first_text(label_span % '发行时间' + "/following::text()[1]")
+        # dateparser.parse returns None for text it cannot interpret; treat
+        # that as "no release date" instead of failing on None.strftime.
+        parsed_date = dateparser.parse(date_text) if date_text else None
+        release_date = parsed_date.strftime('%Y-%m-%d') if parsed_date else None
+
+        tracks = content.xpath("//div[@class='track-list']/div[@class='indent']/div/text()")
+        track_list = '\n'.join(t.strip() for t in tracks) if tracks else None
+
+        # Use the 'all hidden' span when present, else the v:summary span.
+        brief_elem = content.xpath("//span[@class='all hidden']")
+        if not brief_elem:
+            brief_elem = content.xpath("//span[@property='v:summary']")
+        brief = '\n'.join(
+            e.strip() for e in brief_elem[0].xpath('./text()')) if brief_elem else None
+
+        other_info = {}
+        for label in self._OTHER_INFO_LABELS:
+            value = first_text(label_span % label + "/following-sibling::text()[1]")
+            if value is not None:
+                other_info[label] = value
+        # The barcode and the ISRC also serve as lookup ids below.
+        gtin = other_info.get('条形码')
+        isrc = other_info.get('ISRC')
+
+        img_url = first_text("//div[@id='mainpic']//img/@src")
+
+        pd = ResourceContent(metadata={
+            'title': title,
+            'artist': artist,
+            'genre': genre,
+            'release_date': release_date,
+            'duration': None,
+            'company': company,
+            'track_list': track_list,
+            'brief': brief,
+            'other_info': other_info,
+            'cover_image_url': img_url
+        })
+        if gtin:
+            pd.lookup_ids[IdType.GTIN] = gtin
+        if isrc:
+            pd.lookup_ids[IdType.ISRC] = isrc
+        cover_url = pd.metadata["cover_image_url"]
+        if cover_url:
+            imgdl = BasicImageDownloader(cover_url, self.url)
+            try:
+                pd.cover_image = imgdl.download().content
+                pd.cover_image_extention = imgdl.extention
+            except Exception:
+                # Best effort: a missing cover must not fail the scrape.
+                _logger.debug('failed to download cover for %s from %s', self.url, cover_url)
+        return pd
diff --git a/catalog/sites/goodreads.py b/catalog/sites/goodreads.py
index 6584be92..be3d4c26 100644
--- a/catalog/sites/goodreads.py
+++ b/catalog/sites/goodreads.py
@@ -1,4 +1,3 @@
-import re
from catalog.book.models import Edition, Work
from catalog.common import *
from lxml import html
diff --git a/catalog/sites/spotify.py b/catalog/sites/spotify.py
index 75914281..134ce47e 100644
--- a/catalog/sites/spotify.py
+++ b/catalog/sites/spotify.py
@@ -69,10 +69,9 @@ class Spotify(AbstractSite):
gtin = res_data['external_ids'].get('upc')
if res_data['external_ids'].get('ean'):
gtin = res_data['external_ids'].get('ean')
- # isrc = None
- # if res_data['external_ids'].get('isrc'):
- # isrc = res_data['external_ids'].get('isrc')
- # _logger.error('isrc for album? this should not happen')
+ isrc = None
+ if res_data['external_ids'].get('isrc'):
+ isrc = res_data['external_ids'].get('isrc')
pd = ResourceContent(metadata={
'title': title,
@@ -87,6 +86,8 @@ class Spotify(AbstractSite):
})
if gtin:
pd.lookup_ids[IdType.GTIN] = gtin
+ if isrc:
+ pd.lookup_ids[IdType.ISRC] = isrc
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
diff --git a/test_data/https___music_douban_com_subject_33551231_ b/test_data/https___music_douban_com_subject_33551231_
new file mode 100644
index 00000000..8e13b9de
--- /dev/null
+++ b/test_data/https___music_douban_com_subject_33551231_
@@ -0,0 +1,1109 @@
+
+
+
+
+
+
+
+
+ The Race For Space (豆瓣)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The Race For Space
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 表演者:
+
+ Public Service Broadcasting
+
+
+
+
+
+
+
+
流派: 爵士
+
+
+
+
+
+
专辑类型: 专辑
+
+
+
+
+
+
发行时间: 2015-02-23
+
+
+
+
+
+
出版者: Believe Sas
+
+
+
+
+
+
条形码: 3610159662676
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 谁听这张唱片?
+
+
+
+
+
+
+
+
+
+

+
+
+
+
+
+
+
+
+

+
+
+
石斑鱼
+
+
2021年12月3日想听
+
+
+
+
+
+
+
+
+

+
+
+
+
+
+
+
+
+

+
+
+
+
+
+
+
+
+
+
+
+
>
+ 4人听过
+
+
+
>
+ 2人想听
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
订阅关于The Race For Space的评论:
+ feed: rss 2.0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+-
+
+
+还没人写过短评呢
++-
+
+
+还没人写过短评呢
+