new data model: douban music
This commit is contained in:
parent
e389fc302d
commit
e5b958755c
8 changed files with 1278 additions and 6 deletions
|
@ -166,7 +166,12 @@ class Item(PolymorphicModel):
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_best_lookup_id(cls, lookup_ids):
|
def get_best_lookup_id(cls, lookup_ids):
|
||||||
""" get best available lookup id, ideally commonly used """
|
""" get best available lookup id, ideally commonly used """
|
||||||
best_id_types = [IdType.ISBN, IdType.CUBN, IdType.ASIN, IdType.IMDB, IdType.Feed, IdType.TMDB_TVSeason]
|
best_id_types = [
|
||||||
|
IdType.ISBN, IdType.CUBN, IdType.ASIN,
|
||||||
|
IdType.GTIN, IdType.ISRC, IdType.MusicBrainz,
|
||||||
|
IdType.Feed,
|
||||||
|
IdType.IMDB, IdType.TMDB_TVSeason
|
||||||
|
]
|
||||||
for t in best_id_types:
|
for t in best_id_types:
|
||||||
if lookup_ids.get(t):
|
if lookup_ids.get(t):
|
||||||
return t, lookup_ids[t]
|
return t, lookup_ids[t]
|
||||||
|
|
|
@ -1,3 +1,11 @@
|
||||||
|
"""
|
||||||
|
Site and SiteList
|
||||||
|
|
||||||
|
Site should inherit from AbstractSite
|
||||||
|
a Site should map to a unique set of url patterns.
|
||||||
|
a Site may scrape a url and store result in ResourceContent
|
||||||
|
ResourceContent persists as an ExternalResource which may link to an Item
|
||||||
|
"""
|
||||||
from typing import *
|
from typing import *
|
||||||
import re
|
import re
|
||||||
from .models import ExternalResource
|
from .models import ExternalResource
|
||||||
|
|
|
@ -25,3 +25,37 @@ class SpotifyTestCase(TestCase):
|
||||||
self.assertEqual(site.resource.metadata['title'], 'The Race For Space')
|
self.assertEqual(site.resource.metadata['title'], 'The Race For Space')
|
||||||
self.assertIsInstance(site.resource.item, Album)
|
self.assertIsInstance(site.resource.item, Album)
|
||||||
self.assertEqual(site.resource.item.barcode, '3610159662676')
|
self.assertEqual(site.resource.item.barcode, '3610159662676')
|
||||||
|
|
||||||
|
|
||||||
|
class DoubanMusicTestCase(TestCase):
    """Checks URL handling and scraping for the Douban Music site."""

    def test_parse(self):
        # One known album, addressed by id type, id value and canonical URL.
        expected_id_type = IdType.DoubanMusic
        expected_id_value = '33551231'
        album_url = 'https://music.douban.com/subject/33551231/'

        # The id type must resolve to a registered site that accepts the URL.
        site_for_type = SiteList.get_site_by_id_type(expected_id_type)
        self.assertIsNotNone(site_for_type)
        self.assertEqual(site_for_type.validate_url(album_url), True)

        # A lookup by URL must report the same URL and the id embedded in it.
        site_for_url = SiteList.get_site_by_url(album_url)
        self.assertEqual(site_for_url.url, album_url)
        self.assertEqual(site_for_url.id_value, expected_id_value)

    # NOTE(review): use_local_response presumably replays a saved page from
    # test_data instead of hitting the network - confirm against its definition.
    @use_local_response
    def test_scrape(self):
        album_site = SiteList.get_site_by_url(
            'https://music.douban.com/subject/33551231/'
        )

        # The site reports not-ready until the resource has been fetched.
        self.assertEqual(album_site.ready, False)
        album_site.get_resource_ready()
        self.assertEqual(album_site.ready, True)

        # Scraped metadata and the linked item must describe the expected album.
        self.assertEqual(
            album_site.resource.metadata['title'], 'The Race For Space'
        )
        self.assertIsInstance(album_site.resource.item, Album)
        self.assertEqual(album_site.resource.item.barcode, '3610159662676')
|
||||||
|
|
||||||
|
|
||||||
|
class MultiMusicSitesTestCase(TestCase):
    """Cross-site check: one album fetched through two different sites."""

    @use_local_response
    def test_albums(self):
        douban_url = 'https://music.douban.com/subject/33551231/'
        spotify_url = 'https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP'

        # Fetch the Douban page first, then the Spotify one (same order as
        # the resources are compared below).
        douban_resource = SiteList.get_site_by_url(douban_url).get_resource_ready()
        spotify_resource = SiteList.get_site_by_url(spotify_url).get_resource_ready()

        # Both resources are expected to be linked to one and the same item.
        self.assertEqual(douban_resource.item.id, spotify_resource.item.id)
|
||||||
|
|
|
@ -2,6 +2,7 @@ from ..common.sites import SiteList
|
||||||
from .apple_podcast import ApplePodcast
|
from .apple_podcast import ApplePodcast
|
||||||
from .douban_book import DoubanBook
|
from .douban_book import DoubanBook
|
||||||
from .douban_movie import DoubanMovie
|
from .douban_movie import DoubanMovie
|
||||||
|
from .douban_music import DoubanMusic
|
||||||
from .douban_drama import DoubanDrama
|
from .douban_drama import DoubanDrama
|
||||||
from .goodreads import Goodreads
|
from .goodreads import Goodreads
|
||||||
from .tmdb import TMDB_Movie
|
from .tmdb import TMDB_Movie
|
||||||
|
|
115
catalog/sites/douban_music.py
Normal file
115
catalog/sites/douban_music.py
Normal file
|
@ -0,0 +1,115 @@
|
||||||
|
from catalog.common import *
|
||||||
|
from catalog.models import *
|
||||||
|
from .douban import DoubanDownloader
|
||||||
|
import dateparser
|
||||||
|
import logging
|
||||||
|
|
||||||
|
|
||||||
|
_logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@SiteList.register
class DoubanMusic(AbstractSite):
    """Site for album pages on Douban Music.

    Recognises desktop (``music.douban.com/subject/<id>``) and mobile
    (``m.douban.com/music/subject/<id>``) URLs and scrapes the desktop page
    layout into a ``ResourceContent``.
    """

    ID_TYPE = IdType.DoubanMusic
    URL_PATTERNS = [
        r"\w+://music\.douban\.com/subject/(\d+)/{0,1}",
        # Dots are escaped so that look-alike hosts (e.g. "mXdouban.com")
        # are not accepted as Douban URLs.
        r"\w+://m\.douban\.com/music/subject/(\d+)/{0,1}",
    ]
    WIKI_PROPERTY_ID = ''
    DEFAULT_MODEL = Album

    # Labels of the "#info" box that are copied verbatim into
    # metadata['other_info'], in the order they are looked up.
    _OTHER_INFO_LABELS = ('又名', '专辑类型', '介质', 'ISRC', '条形码', '碟片数')

    @classmethod
    def id_to_url(cls, id_value):
        """Return the canonical desktop URL for the given Douban subject id."""
        return "https://music.douban.com/subject/" + id_value + "/"

    @staticmethod
    def _first_text(content, xpath):
        """Return the stripped first result of ``xpath``, or None if no match."""
        matches = content.xpath(xpath)
        return matches[0].strip() if matches else None

    def scrape(self):
        """Download the album page and extract its metadata.

        Returns:
            A ``ResourceContent`` whose metadata holds title, artist, genre,
            release date, company, track list, brief, other info and cover
            image URL. GTIN / ISRC lookup ids are set when the page lists a
            barcode / ISRC, and the cover image is attached when its download
            succeeds.

        Raises:
            ParseError: if the page has no (non-empty) title.
        """
        content = DoubanDownloader(self.url).download().html()

        title = self._first_text(content, "//h1/span/text()")
        if not title:
            raise ParseError(self, "title")

        artist_names = content.xpath(
            "//div[@id='info']/span/span[@class='pl']/a/text()")
        # Each artist name is capped at 200 characters.
        artist = [name[:200] for name in artist_names] if artist_names else None

        genre = self._first_text(
            content,
            "//div[@id='info']//span[text()='流派:']/following::text()[1]")

        date_text = self._first_text(
            content,
            "//div[@id='info']//span[text()='发行时间:']/following::text()[1]")
        # dateparser.parse() returns None for text it cannot interpret, so
        # guard before formatting instead of raising AttributeError.
        parsed_date = dateparser.parse(date_text) if date_text else None
        release_date = parsed_date.strftime('%Y-%m-%d') if parsed_date else None

        company = self._first_text(
            content,
            "//div[@id='info']//span[text()='出版者:']/following::text()[1]")

        track_nodes = content.xpath(
            "//div[@class='track-list']/div[@class='indent']/div/text()"
        )
        if track_nodes:
            track_list = '\n'.join(track.strip() for track in track_nodes)
        else:
            track_list = None

        # Prefer the full (initially hidden) description; fall back to the
        # summary span when the page has no expanded version.
        brief_elem = content.xpath("//span[@class='all hidden']")
        if not brief_elem:
            brief_elem = content.xpath("//span[@property='v:summary']")
        if brief_elem:
            brief = '\n'.join(
                text.strip() for text in brief_elem[0].xpath('./text()'))
        else:
            brief = None

        other_info = {}
        for label in self._OTHER_INFO_LABELS:
            value = self._first_text(
                content,
                "//div[@id='info']//span[text()='" + label
                + ":']/following-sibling::text()[1]")
            # An empty string is still recorded; only a missing label is skipped.
            if value is not None:
                other_info[label] = value
        # The barcode and ISRC double as lookup ids for matching other sites.
        gtin = other_info.get('条形码')
        isrc = other_info.get('ISRC')

        img_url = self._first_text(content, "//div[@id='mainpic']//img/@src")

        pd = ResourceContent(metadata={
            'title': title,
            'artist': artist,
            'genre': genre,
            'release_date': release_date,
            'duration': None,
            'company': company,
            'track_list': track_list,
            'brief': brief,
            'other_info': other_info,
            'cover_image_url': img_url
        })
        if gtin:
            pd.lookup_ids[IdType.GTIN] = gtin
        if isrc:
            pd.lookup_ids[IdType.ISRC] = isrc
        if pd.metadata["cover_image_url"]:
            imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
            try:
                pd.cover_image = imgdl.download().content
                pd.cover_image_extention = imgdl.extention
            except Exception:
                # Best effort: a missing cover must not fail the whole scrape.
                _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
        return pd
|
|
@ -1,4 +1,3 @@
|
||||||
import re
|
|
||||||
from catalog.book.models import Edition, Work
|
from catalog.book.models import Edition, Work
|
||||||
from catalog.common import *
|
from catalog.common import *
|
||||||
from lxml import html
|
from lxml import html
|
||||||
|
|
|
@ -69,10 +69,9 @@ class Spotify(AbstractSite):
|
||||||
gtin = res_data['external_ids'].get('upc')
|
gtin = res_data['external_ids'].get('upc')
|
||||||
if res_data['external_ids'].get('ean'):
|
if res_data['external_ids'].get('ean'):
|
||||||
gtin = res_data['external_ids'].get('ean')
|
gtin = res_data['external_ids'].get('ean')
|
||||||
# isrc = None
|
isrc = None
|
||||||
# if res_data['external_ids'].get('isrc'):
|
if res_data['external_ids'].get('isrc'):
|
||||||
# isrc = res_data['external_ids'].get('isrc')
|
isrc = res_data['external_ids'].get('isrc')
|
||||||
# _logger.error('isrc for album? this should not happen')
|
|
||||||
|
|
||||||
pd = ResourceContent(metadata={
|
pd = ResourceContent(metadata={
|
||||||
'title': title,
|
'title': title,
|
||||||
|
@ -87,6 +86,8 @@ class Spotify(AbstractSite):
|
||||||
})
|
})
|
||||||
if gtin:
|
if gtin:
|
||||||
pd.lookup_ids[IdType.GTIN] = gtin
|
pd.lookup_ids[IdType.GTIN] = gtin
|
||||||
|
if isrc:
|
||||||
|
pd.lookup_ids[IdType.ISRC] = isrc
|
||||||
if pd.metadata["cover_image_url"]:
|
if pd.metadata["cover_image_url"]:
|
||||||
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
|
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
|
||||||
try:
|
try:
|
||||||
|
|
1109
test_data/https___music_douban_com_subject_33551231_
Normal file
1109
test_data/https___music_douban_com_subject_33551231_
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Reference in a new issue