This commit is contained in:
Your Name 2022-12-16 08:34:33 -05:00
parent c05aa65e3f
commit 6113115fb6
5 changed files with 124 additions and 0 deletions

View file

@ -39,6 +39,10 @@ class AbstractSite:
u = next(iter([re.match(p, url) for p in self.URL_PATTERNS if re.match(p, url)]), None)
return u is not None
@classmethod
def validate_url_fallback(self, url: str):
    """Fallback URL check; this base implementation rejects every URL.

    Subclasses may override it to recognise URLs that their regular
    URL patterns do not cover.
    """
    return False
@classmethod
def id_to_url(self, id_value):
    """Build a URL from *id_value*; the base class only yields a placeholder."""
    placeholder_prefix = 'https://undefined/'
    return placeholder_prefix + id_value
@ -152,6 +156,8 @@ class SiteManager:
@classmethod
def get_site_by_url(cls, url: str):
    """Return a site instance for *url*, or ``None`` if no site accepts it.

    Every registered site class is first asked ``validate_url``; only when
    none matches are the classes asked ``validate_url_fallback``.

    Args:
        url: The URL to look up.

    Returns:
        An instance of the first matching registered class, built as
        ``site_cls(url)``, or ``None`` when nothing matches.
    """
    # Use a separate name for the match: rebinding ``cls`` to ``None`` would
    # make the fallback lookup below fail on ``None.registry``.
    site_cls = next(filter(lambda p: p.validate_url(url), cls.registry.values()), None)
    if site_cls is None:
        site_cls = next(filter(lambda p: p.validate_url_fallback(url), cls.registry.values()), None)
    return site_cls(url) if site_cls else None
@classmethod

View file

@ -23,6 +23,7 @@ class Album(Item):
'company',
'track_list',
'brief',
'bandcamp_album_id',
]
# Release date of the album; the translatable label '发行日期' means "release date".
release_date = jsondata.DateField(_('发行日期'), auto_now=False, auto_now_add=False, null=True, blank=True)
# Duration of the album; the label '时长' means "duration". The unit is not stated here.
duration = jsondata.IntegerField(_("时长"), null=True, blank=True)
@ -33,4 +34,5 @@ class Album(Item):
# Optional free-text attributes; each defaults to an empty string.
other_title = jsondata.CharField(blank=True, default='', max_length=500)
album_type = jsondata.CharField(blank=True, default='', max_length=500)
media = jsondata.CharField(blank=True, default='', max_length=500)
# Identifier of the album on Bandcamp, stored as text.
bandcamp_album_id = jsondata.CharField(blank=True, default='', max_length=500)
# NOTE(review): default='' and max_length=500 look copied from the CharField
# declarations above; confirm an empty-string default is intended for an
# integer field.
disc_count = jsondata.IntegerField(blank=True, default='', max_length=500)

View file

@ -59,3 +59,28 @@ class MultiMusicSitesTestCase(TestCase):
p1 = SiteManager.get_site_by_url(url1).get_resource_ready()
p2 = SiteManager.get_site_by_url(url2).get_resource_ready()
self.assertEqual(p1.item.id, p2.item.id)
class BandcampTestCase(TestCase):
    """Checks URL recognition and scraping for a Bandcamp album page."""

    def test_parse(self):
        # The query string is expected to be dropped from both url and id.
        url_with_query = 'https://intlanthem.bandcamp.com/album/in-these-times?from=hpbcw'
        expected_url = 'https://intlanthem.bandcamp.com/album/in-these-times'
        expected_id = 'intlanthem.bandcamp.com/album/in-these-times'
        id_type = IdType.Bandcamp

        registered = SiteManager.get_site_by_id_type(id_type)
        self.assertIsNotNone(registered)
        self.assertEqual(registered.validate_url(url_with_query), True)

        resolved = SiteManager.get_site_by_url(url_with_query)
        self.assertEqual(resolved.url, expected_url)
        self.assertEqual(resolved.id_value, expected_id)

    # NOTE(review): the decorator below is disabled, so this test presumably
    # fetches the live page -- confirm that is intended.
    # @use_local_response
    def test_scrape(self):
        album_url = 'https://intlanthem.bandcamp.com/album/in-these-times?from=hpbcw'
        bandcamp = SiteManager.get_site_by_url(album_url)

        # The site starts out not ready and becomes ready after fetching.
        self.assertEqual(bandcamp.ready, False)
        bandcamp.get_resource_ready()
        self.assertEqual(bandcamp.ready, True)

        self.assertEqual(bandcamp.resource.metadata['title'], 'In These Times')
        self.assertEqual(bandcamp.resource.metadata['artist'], ['Makaya McCraven'])
        self.assertIsInstance(bandcamp.resource.item, Album)

View file

@ -12,4 +12,5 @@ from .imdb import IMDB
from .spotify import Spotify
from .igdb import IGDB
from .steam import Steam
from .bandcamp import Bandcamp
from .bangumi import Bangumi

90
catalog/sites/bandcamp.py Normal file
View file

@ -0,0 +1,90 @@
from catalog.common import *
from catalog.models import *
import logging
import urllib.parse
import dateparser
import re
import json
_logger = logging.getLogger(__name__)
@SiteManager.register
class Bandcamp(AbstractSite):
    """Site adapter for Bandcamp album pages.

    The id value is the album URL without its scheme, for example
    ``artist.bandcamp.com/album/some-album``. Albums hosted on other
    domains are only recognised through ``validate_url_fallback``.
    """

    SITE_NAME = SiteName.Bandcamp
    ID_TYPE = IdType.Bandcamp
    URL_PATTERNS = [
        # Dots are escaped so that only real ``*.bandcamp.com`` hosts match;
        # an unescaped ``.`` would also accept e.g. ``xbandcampycom``.
        r"https://([a-z0-9\-]+\.bandcamp\.com/album/[^?#/]+)"
    ]
    URL_PATTERN_FALLBACK = r"https://([a-z0-9\-\.]+/album/[^?#/]+)"
    WIKI_PROPERTY_ID = ''
    DEFAULT_MODEL = Album

    @classmethod
    def id_to_url(self, id_value):
        """Return the https URL for *id_value* (the URL without its scheme)."""
        return f"https://{id_value}"

    @classmethod
    def validate_url_fallback(self, url):
        """Check via DNS whether an ``/album/`` URL on another domain is Bandcamp.

        The URL must match ``URL_PATTERN_FALLBACK``. Its host is then accepted
        when a CNAME record targets ``dom.bandcamp.com.`` or an A record has
        the address ``35.241.62.186``.

        Args:
            url: The URL to check.

        Returns:
            ``True`` if one of the DNS checks matches, otherwise ``False``.
            Lookup failures are logged at debug level and count as no match.
        """
        if re.match(self.URL_PATTERN_FALLBACK, url) is None:
            return False
        hostname = urllib.parse.urlparse(url).netloc
        # (record type, rdata attribute to compare, expected value)
        checks = (
            ('CNAME', 'target', 'dom.bandcamp.com.'),
            ('A', 'address', '35.241.62.186'),
        )
        # NOTE(review): ``dns`` is not among this module's visible imports.
        # Unless a star import provides it, every lookup raises NameError,
        # which is caught and logged below -- confirm ``import dns.resolver``
        # is present.
        for record_type, attr, expected in checks:
            try:
                answers = dns.resolver.query(hostname, record_type)
                for rdata in answers:
                    if str(getattr(rdata, attr)) == expected:
                        return True
            except Exception:
                # Best effort: a failed lookup means "not recognised", but
                # leave a trace instead of swallowing the error silently.
                _logger.debug(
                    "DNS %s lookup failed for %s", record_type, hostname, exc_info=True
                )
        # Always return a bool rather than falling through with None.
        return False

    def scrape(self):
        """Download the album page and extract its metadata and cover image.

        Returns:
            A ``ResourceContent`` whose metadata holds title, artist, genre,
            track_list, release_date, duration, company, brief,
            bandcamp_album_id and cover_image_url. The cover image bytes are
            attached when the download succeeds.

        Raises:
            ValueError: If the page has no album title or artist.
            IndexError: If the ``bc-page-properties`` meta tag is missing.
        """
        content = BasicDownloader(self.url).download().html()
        try:
            title = content.xpath("//h2[@class='trackTitle']/text()")[0].strip()
            artist = [content.xpath("//div[@id='name-section']/h3/span/a/text()")[0].strip()]
        except IndexError:
            raise ValueError("given url contains no valid info")

        genre = []  # TODO: parse tags
        track_list = []

        release_nodes = content.xpath("//div[@class='tralbumData tralbum-credits']/text()")
        release_date = None
        if release_nodes:
            # Drop the leading "released "/"releases " word before parsing.
            parsed_date = dateparser.parse(re.sub(r'releas\w+ ', '', release_nodes[0].strip()))
            # dateparser.parse returns None for text it cannot interpret.
            if parsed_date is not None:
                release_date = parsed_date.strftime('%Y-%m-%d')

        duration = None
        company = None

        brief_nodes = content.xpath("//div[@class='tralbumData tralbum-about']/text()")
        brief = "".join(brief_nodes) if brief_nodes else None

        # The cover is optional (see the check before downloading below), so
        # a page without artwork must not raise here.
        cover_nodes = content.xpath("//div[@id='tralbumArt']/a/@href")
        cover_url = cover_nodes[0].strip() if cover_nodes else None

        bandcamp_page_data = json.loads(content.xpath(
            "//meta[@name='bc-page-properties']/@content")[0].strip())
        bandcamp_album_id = bandcamp_page_data['item_id']

        data = {
            'title': title,
            'artist': artist,
            'genre': genre,
            'track_list': track_list,
            'release_date': release_date,
            'duration': duration,
            'company': company,
            'brief': brief,
            'bandcamp_album_id': bandcamp_album_id,
            'cover_image_url': cover_url,
        }
        pd = ResourceContent(metadata=data)
        if data["cover_image_url"]:
            imgdl = BasicImageDownloader(data["cover_image_url"], self.url)
            try:
                pd.cover_image = imgdl.download().content
                pd.cover_image_extention = imgdl.extention
            except Exception:
                # A missing cover should not fail the whole scrape.
                _logger.debug(f'failed to download cover for {self.url} from {data["cover_image_url"]}')
        return pd