bandcamp
This commit is contained in:
parent
c05aa65e3f
commit
6113115fb6
5 changed files with 124 additions and 0 deletions
|
@ -39,6 +39,10 @@ class AbstractSite:
|
|||
u = next(iter([re.match(p, url) for p in self.URL_PATTERNS if re.match(p, url)]), None)
|
||||
return u is not None
|
||||
|
||||
@classmethod
def validate_url_fallback(self, url: str):
    # Base-class default: no fallback matching. Subclasses override this to
    # recognise URLs that URL_PATTERNS alone cannot match (e.g. custom domains).
    return False
|
||||
|
||||
@classmethod
def id_to_url(self, id_value):
    # Placeholder implementation; concrete site subclasses override this to
    # build the real canonical URL from their id_value.
    return 'https://undefined/' + id_value
|
||||
|
@ -152,6 +156,8 @@ class SiteManager:
|
|||
@classmethod
def get_site_by_url(cls, url: str):
    """Return a site instance for *url*, or None if no registered site matches.

    Tries each registered class's strict ``validate_url`` first, then falls
    back to ``validate_url_fallback``.

    Fixed: the original rebound ``cls`` to the match result, so on the
    fallback path ``cls`` was already None and ``cls.registry.values()``
    raised AttributeError; the matched class now lives in a local variable.
    """
    site_cls = next(filter(lambda p: p.validate_url(url), cls.registry.values()), None)
    if site_cls is None:
        site_cls = next(filter(lambda p: p.validate_url_fallback(url), cls.registry.values()), None)
    return site_cls(url) if site_cls else None
|
||||
|
||||
@classmethod
|
||||
|
|
|
@ -23,6 +23,7 @@ class Album(Item):
|
|||
'company',
|
||||
'track_list',
|
||||
'brief',
|
||||
'bandcamp_album_id',
|
||||
]
|
||||
# Album release date, stored in the item's JSON data (label is Chinese for "release date").
release_date = jsondata.DateField(_('发行日期'), auto_now=False, auto_now_add=False, null=True, blank=True)
# Album duration (label is Chinese for "duration"); unit not shown here —
# TODO confirm whether this is seconds or milliseconds.
duration = jsondata.IntegerField(_("时长"), null=True, blank=True)
|
||||
|
@ -33,4 +34,5 @@ class Album(Item):
|
|||
# Free-form descriptive fields stored in the item's JSON data.
other_title = jsondata.CharField(blank=True, default='', max_length=500)
album_type = jsondata.CharField(blank=True, default='', max_length=500)
media = jsondata.CharField(blank=True, default='', max_length=500)
bandcamp_album_id = jsondata.CharField(blank=True, default='', max_length=500)
# Fixed: disc_count holds an integer — the original's default='' (a str) and
# max_length=500 do not apply to an IntegerField; use null for "unknown".
disc_count = jsondata.IntegerField(blank=True, default=None, null=True)
|
||||
|
|
|
@ -59,3 +59,28 @@ class MultiMusicSitesTestCase(TestCase):
|
|||
p1 = SiteManager.get_site_by_url(url1).get_resource_ready()
|
||||
p2 = SiteManager.get_site_by_url(url2).get_resource_ready()
|
||||
self.assertEqual(p1.item.id, p2.item.id)
|
||||
|
||||
|
||||
class BandcampTestCase(TestCase):
    """Tests for the Bandcamp site adapter: URL recognition and page scraping."""

    def test_parse(self):
        # Bandcamp ids are host + album path, without scheme or query string.
        t_id_type = IdType.Bandcamp
        t_id_value = 'intlanthem.bandcamp.com/album/in-these-times'
        t_url = 'https://intlanthem.bandcamp.com/album/in-these-times?from=hpbcw'
        t_url2 = 'https://intlanthem.bandcamp.com/album/in-these-times'
        site = SiteManager.get_site_by_id_type(t_id_type)
        self.assertIsNotNone(site)
        # Idiom: assertTrue/assertFalse instead of assertEqual(x, True/False).
        self.assertTrue(site.validate_url(t_url))
        site = SiteManager.get_site_by_url(t_url)
        # The query string must be stripped from the canonical URL.
        self.assertEqual(site.url, t_url2)
        self.assertEqual(site.id_value, t_id_value)

    # @use_local_response
    def test_scrape(self):
        # NOTE(review): hits the live site unless @use_local_response is enabled.
        t_url = 'https://intlanthem.bandcamp.com/album/in-these-times?from=hpbcw'
        site = SiteManager.get_site_by_url(t_url)
        self.assertFalse(site.ready)
        site.get_resource_ready()
        self.assertTrue(site.ready)
        self.assertEqual(site.resource.metadata['title'], 'In These Times')
        self.assertEqual(site.resource.metadata['artist'], ['Makaya McCraven'])
        self.assertIsInstance(site.resource.item, Album)
|
||||
|
|
|
@ -12,4 +12,5 @@ from .imdb import IMDB
|
|||
from .spotify import Spotify
|
||||
from .igdb import IGDB
|
||||
from .steam import Steam
|
||||
from .bandcamp import Bandcamp
|
||||
from .bangumi import Bangumi
|
||||
|
|
90
catalog/sites/bandcamp.py
Normal file
90
catalog/sites/bandcamp.py
Normal file
|
@ -0,0 +1,90 @@
|
|||
from catalog.common import *
|
||||
from catalog.models import *
|
||||
import logging
|
||||
import urllib.parse
|
||||
import dateparser
|
||||
import re
|
||||
import json
|
||||
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@SiteManager.register
class Bandcamp(AbstractSite):
    """Site adapter for bandcamp.com album pages.

    Albums live both on ``*.bandcamp.com`` subdomains (matched by
    URL_PATTERNS) and on custom artist domains, which are detected by DNS
    lookups in :meth:`validate_url_fallback`.
    """
    SITE_NAME = SiteName.Bandcamp
    ID_TYPE = IdType.Bandcamp
    # Fixed: dots escaped — the original bare '.' matched any character, so
    # e.g. "fooXbandcampXcom" hostnames would have been accepted.
    URL_PATTERNS = [
        r"https://([a-z0-9\-]+\.bandcamp\.com/album/[^?#/]+)"
    ]
    URL_PATTERN_FALLBACK = r"https://([a-z0-9\-\.]+/album/[^?#/]+)"
    WIKI_PROPERTY_ID = ''
    DEFAULT_MODEL = Album

    @classmethod
    def id_to_url(self, id_value):
        # id_value already contains host + path (no scheme).
        return f"https://{id_value}"

    @classmethod
    def validate_url_fallback(self, url):
        """Detect albums hosted on custom domains that point at Bandcamp.

        A domain qualifies when its CNAME targets ``dom.bandcamp.com.`` or
        it resolves to Bandcamp's known frontend A record.
        """
        if re.match(self.URL_PATTERN_FALLBACK, url) is None:
            return False
        hostname = urllib.parse.urlparse(url).netloc
        try:
            # dnspython. Fixed: `dns` was referenced below but never imported
            # at module level, so every DNS check died with a NameError that
            # the bare `except Exception` silently swallowed.
            import dns.resolver
        except ImportError:
            return False
        try:
            for rdata in dns.resolver.query(hostname, 'CNAME'):
                if str(rdata.target) == 'dom.bandcamp.com.':
                    return True
        except Exception:
            # Best-effort: DNS failures just mean "not recognised".
            pass
        try:
            for rdata in dns.resolver.query(hostname, 'A'):
                # Bandcamp's known frontend IP.
                if str(rdata.address) == '35.241.62.186':
                    return True
        except Exception:
            pass
        # Fixed: the original fell off the end and returned None here.
        return False

    def scrape(self):
        """Fetch the album page and extract metadata into a ResourceContent.

        Raises ValueError when the page lacks the expected title/artist nodes.
        """
        content = BasicDownloader(self.url).download().html()
        try:
            title = content.xpath("//h2[@class='trackTitle']/text()")[0].strip()
            artist = [content.xpath("//div[@id='name-section']/h3/span/a/text()")[0].strip()]
        except IndexError:
            raise ValueError("given url contains no valid info")

        genre = []  # TODO: parse tags
        track_list = []
        release_nodes = content.xpath("//div[@class='tralbumData tralbum-credits']/text()")
        # Credit line reads like "released June 1, 2022" — strip the leading verb.
        release_date = dateparser.parse(re.sub(r'releas\w+ ', '', release_nodes[0].strip())).strftime('%Y-%m-%d') if release_nodes else None
        duration = None
        company = None
        brief_nodes = content.xpath("//div[@class='tralbumData tralbum-about']/text()")
        brief = "".join(brief_nodes) if brief_nodes else None
        # Robustness: some albums have no cover art — don't crash on a missing node.
        cover_nodes = content.xpath("//div[@id='tralbumArt']/a/@href")
        cover_url = cover_nodes[0].strip() if cover_nodes else None
        bandcamp_page_data = json.loads(content.xpath(
            "//meta[@name='bc-page-properties']/@content")[0].strip())
        bandcamp_album_id = bandcamp_page_data['item_id']

        data = {
            'title': title,
            'artist': artist,
            'genre': genre,
            'track_list': track_list,
            'release_date': release_date,
            'duration': duration,
            'company': company,
            'brief': brief,
            'bandcamp_album_id': bandcamp_album_id,
            'cover_image_url': cover_url,
        }
        pd = ResourceContent(metadata=data)
        if data["cover_image_url"]:
            imgdl = BasicImageDownloader(data["cover_image_url"], self.url)
            try:
                pd.cover_image = imgdl.download().content
                # NOTE: 'extention' spelling matches the downloader's API.
                pd.cover_image_extention = imgdl.extention
            except Exception:
                _logger.debug(f'failed to download cover for {self.url} from {data["cover_image_url"]}')
        return pd
|
Loading…
Add table
Reference in a new issue