This commit is contained in:
Your Name 2022-12-16 08:34:33 -05:00
parent c05aa65e3f
commit 6113115fb6
5 changed files with 124 additions and 0 deletions

View file

@ -39,6 +39,10 @@ class AbstractSite:
u = next(iter([re.match(p, url) for p in self.URL_PATTERNS if re.match(p, url)]), None) u = next(iter([re.match(p, url) for p in self.URL_PATTERNS if re.match(p, url)]), None)
return u is not None return u is not None
@classmethod
def validate_url_fallback(self, url: str) -> bool:
    """Second-chance URL check; this base implementation never matches.

    A site class can override this to recognise URLs that its regular
    ``URL_PATTERNS`` do not cover.

    Args:
        url: The URL to test.

    Returns:
        Always ``False`` here.
    """
    return False
@classmethod
def id_to_url(self, id_value):
    """Build a URL for *id_value* under the placeholder ``undefined`` host.

    Args:
        id_value: The ID string to append to the placeholder prefix.

    Returns:
        ``'https://undefined/'`` followed by *id_value*.
    """
    placeholder_prefix = 'https://undefined/'
    return placeholder_prefix + id_value
@ -152,6 +156,8 @@ class SiteManager:
@classmethod
def get_site_by_url(cls, url: str):
    """Return a site instance for *url*, or ``None`` if no site accepts it.

    Every registered site class is first asked via ``validate_url``. Only
    when none of them matches are the classes asked again via
    ``validate_url_fallback``.

    Args:
        url: The URL to resolve to a site.

    Returns:
        An instance of the first matching site class, constructed with
        *url*, or ``None`` when nothing matches.
    """
    candidates = cls.registry.values()
    # Keep the match in its own name: rebinding ``cls`` to the lookup result
    # would leave ``cls`` as None on a miss, and the fallback lookup below
    # would then fail on ``None.registry``.
    site_cls = next(filter(lambda p: p.validate_url(url), candidates), None)
    if site_cls is None:
        site_cls = next(
            filter(lambda p: p.validate_url_fallback(url), candidates), None
        )
    return site_cls(url) if site_cls else None
@classmethod @classmethod

View file

@ -23,6 +23,7 @@ class Album(Item):
'company', 'company',
'track_list', 'track_list',
'brief', 'brief',
'bandcamp_album_id',
] ]
release_date = jsondata.DateField(_('发行日期'), auto_now=False, auto_now_add=False, null=True, blank=True) release_date = jsondata.DateField(_('发行日期'), auto_now=False, auto_now_add=False, null=True, blank=True)
duration = jsondata.IntegerField(_("时长"), null=True, blank=True) duration = jsondata.IntegerField(_("时长"), null=True, blank=True)
@ -33,4 +34,5 @@ class Album(Item):
other_title = jsondata.CharField(blank=True, default='', max_length=500) other_title = jsondata.CharField(blank=True, default='', max_length=500)
album_type = jsondata.CharField(blank=True, default='', max_length=500) album_type = jsondata.CharField(blank=True, default='', max_length=500)
media = jsondata.CharField(blank=True, default='', max_length=500) media = jsondata.CharField(blank=True, default='', max_length=500)
bandcamp_album_id = jsondata.CharField(blank=True, default='', max_length=500)
disc_count = jsondata.IntegerField(blank=True, default='', max_length=500) disc_count = jsondata.IntegerField(blank=True, default='', max_length=500)

View file

@ -59,3 +59,28 @@ class MultiMusicSitesTestCase(TestCase):
p1 = SiteManager.get_site_by_url(url1).get_resource_ready() p1 = SiteManager.get_site_by_url(url1).get_resource_ready()
p2 = SiteManager.get_site_by_url(url2).get_resource_ready() p2 = SiteManager.get_site_by_url(url2).get_resource_ready()
self.assertEqual(p1.item.id, p2.item.id) self.assertEqual(p1.item.id, p2.item.id)
class BandcampTestCase(TestCase):
    """Checks URL recognition and scraping for the Bandcamp site."""

    def test_parse(self):
        """A URL with a query string resolves to the bare album URL and ID."""
        expected_id = 'intlanthem.bandcamp.com/album/in-these-times'
        expected_url = 'https://intlanthem.bandcamp.com/album/in-these-times'
        url_with_query = 'https://intlanthem.bandcamp.com/album/in-these-times?from=hpbcw'

        registered = SiteManager.get_site_by_id_type(IdType.Bandcamp)
        self.assertIsNotNone(registered)
        self.assertEqual(registered.validate_url(url_with_query), True)

        resolved = SiteManager.get_site_by_url(url_with_query)
        self.assertEqual(resolved.url, expected_url)
        self.assertEqual(resolved.id_value, expected_id)

    # NOTE(review): the decorator below is disabled; presumably this test
    # then fetches the live page instead of a stored response -- confirm.
    # @use_local_response
    def test_scrape(self):
        """Fetching the resource marks the site ready and yields an Album."""
        url_with_query = 'https://intlanthem.bandcamp.com/album/in-these-times?from=hpbcw'
        resolved = SiteManager.get_site_by_url(url_with_query)

        self.assertEqual(resolved.ready, False)
        resolved.get_resource_ready()
        self.assertEqual(resolved.ready, True)

        self.assertEqual(resolved.resource.metadata['title'], 'In These Times')
        self.assertEqual(resolved.resource.metadata['artist'], ['Makaya McCraven'])
        self.assertIsInstance(resolved.resource.item, Album)

View file

@ -12,4 +12,5 @@ from .imdb import IMDB
from .spotify import Spotify from .spotify import Spotify
from .igdb import IGDB from .igdb import IGDB
from .steam import Steam from .steam import Steam
from .bandcamp import Bandcamp
from .bangumi import Bangumi from .bangumi import Bangumi

90
catalog/sites/bandcamp.py Normal file
View file

@ -0,0 +1,90 @@
from catalog.common import *
from catalog.models import *
import logging
import urllib.parse
import dateparser
import re
import json
_logger = logging.getLogger(__name__)
@SiteManager.register
class Bandcamp(AbstractSite):
    """Site adapter that recognises and scrapes Bandcamp album pages.

    Album URLs on ``*.bandcamp.com`` are matched by ``URL_PATTERNS``. Albums
    served from another hostname are handled by ``validate_url_fallback``,
    which checks the URL shape and then the hostname's DNS records.
    """

    SITE_NAME = SiteName.Bandcamp
    ID_TYPE = IdType.Bandcamp
    URL_PATTERNS = [
        # Dots are escaped so that only a real ``.bandcamp.com`` host matches.
        r"https://([a-z0-9\-]+\.bandcamp\.com/album/[^?#/]+)"
    ]
    # Looser shape used by validate_url_fallback: any host with an /album/ path.
    URL_PATTERN_FALLBACK = r"https://([a-z0-9\-\.]+/album/[^?#/]+)"
    WIKI_PROPERTY_ID = ''
    DEFAULT_MODEL = Album

    # DNS answers that validate_url_fallback accepts as proof that a hostname
    # is served by Bandcamp.
    # NOTE(review): the A-record address is hard-coded and may go stale.
    _CUSTOM_DOMAIN_CNAME = 'dom.bandcamp.com.'
    _CUSTOM_DOMAIN_A = '35.241.62.186'

    @classmethod
    def id_to_url(self, id_value):
        """Return the ``https://`` URL for *id_value*."""
        return f"https://{id_value}"

    @classmethod
    def validate_url_fallback(self, url):
        """Check via DNS whether *url* is an album on a Bandcamp-served host.

        The URL must match ``URL_PATTERN_FALLBACK``. Its hostname is then
        accepted if a CNAME answer equals ``_CUSTOM_DOMAIN_CNAME`` or an A
        answer equals ``_CUSTOM_DOMAIN_A``.

        Args:
            url: The URL to test.

        Returns:
            ``True`` if a DNS answer matches, ``False`` otherwise (including
            when a lookup fails).
        """
        if re.match(self.URL_PATTERN_FALLBACK, url) is None:
            return False
        hostname = urllib.parse.urlparse(url).netloc
        checks = (
            ('CNAME', 'target', self._CUSTOM_DOMAIN_CNAME),
            ('A', 'address', self._CUSTOM_DOMAIN_A),
        )
        # NOTE(review): `dns` is not imported explicitly in this module; it
        # presumably arrives through the star-imports at the top of the file.
        # If it does not, the NameError is caught below -- confirm.
        for record_type, attr, expected in checks:
            try:
                answers = dns.resolver.query(hostname, record_type)
                if any(str(getattr(rdata, attr)) == expected for rdata in answers):
                    return True
            except Exception:
                # Best effort: a failed lookup only means "not recognised",
                # but leave a trace instead of swallowing it silently.
                _logger.debug(
                    'DNS %s lookup failed for %s', record_type, hostname,
                    exc_info=True,
                )
        # Always return a real bool rather than falling off the end with None.
        return False

    def scrape(self):
        """Download the album page at ``self.url`` and extract its metadata.

        Returns:
            A ``ResourceContent`` whose metadata holds the title, artist,
            release date, description, Bandcamp item id and cover URL. The
            cover image bytes are attached when the download succeeds.

        Raises:
            ValueError: If the page has no title or artist element.
        """
        content = BasicDownloader(self.url).download().html()
        try:
            title = content.xpath("//h2[@class='trackTitle']/text()")[0].strip()
            artist = [content.xpath("//div[@id='name-section']/h3/span/a/text()")[0].strip()]
        except IndexError as err:
            raise ValueError("given url contains no valid info") from err

        genre = []  # TODO: parse tags
        track_list = []

        release_date = None
        release_nodes = content.xpath("//div[@class='tralbumData tralbum-credits']/text()")
        if release_nodes:
            # Drop the leading "released "/"releases " word before parsing.
            date_text = re.sub(r'releas\w+ ', '', release_nodes[0].strip())
            parsed_date = dateparser.parse(date_text)
            # dateparser.parse returns None for text it cannot interpret.
            if parsed_date is not None:
                release_date = parsed_date.strftime('%Y-%m-%d')

        duration = None
        company = None

        brief_nodes = content.xpath("//div[@class='tralbumData tralbum-about']/text()")
        brief = "".join(brief_nodes) if brief_nodes else None

        # A page without cover art must not abort the scrape; the download
        # step below already skips a falsy cover URL.
        cover_nodes = content.xpath("//div[@id='tralbumArt']/a/@href")
        cover_url = cover_nodes[0].strip() if cover_nodes else None

        bandcamp_page_data = json.loads(content.xpath(
            "//meta[@name='bc-page-properties']/@content")[0].strip())
        bandcamp_album_id = bandcamp_page_data['item_id']

        data = {
            'title': title,
            'artist': artist,
            'genre': genre,
            'track_list': track_list,
            'release_date': release_date,
            'duration': duration,
            'company': company,
            'brief': brief,
            'bandcamp_album_id': bandcamp_album_id,
            'cover_image_url': cover_url,
        }
        pd = ResourceContent(metadata=data)
        if data["cover_image_url"]:
            imgdl = BasicImageDownloader(data["cover_image_url"], self.url)
            try:
                pd.cover_image = imgdl.download().content
                pd.cover_image_extention = imgdl.extention
            except Exception:
                # The cover is optional: keep the metadata if the image fails.
                _logger.debug(f'failed to download cover for {self.url} from {data["cover_image_url"]}')
        return pd