diff --git a/catalog/common/sites.py b/catalog/common/sites.py
index d873b120..926beeef 100644
--- a/catalog/common/sites.py
+++ b/catalog/common/sites.py
@@ -39,6 +39,11 @@ class AbstractSite:
         u = next(iter([re.match(p, url) for p in self.URL_PATTERNS if re.match(p, url)]), None)
         return u is not None
 
+    @classmethod
+    def validate_url_fallback(self, url: str):
+        """Second-chance URL check, tried only when no site's validate_url() accepts the URL."""
+        return False
+
     @classmethod
     def id_to_url(self, id_value):
         return 'https://undefined/' + id_value
@@ -152,6 +157,9 @@ class SiteManager:
     @classmethod
     def get_site_by_url(cls, url: str):
         cls = next(filter(lambda p: p.validate_url(url), cls.registry.values()), None)
+        if cls is None:
+            # `cls` was just rebound to None, so the registry must be reached through the class name
+            cls = next(filter(lambda p: p.validate_url_fallback(url), SiteManager.registry.values()), None)
         return cls(url) if cls else None
 
     @classmethod
diff --git a/catalog/music/models.py b/catalog/music/models.py
index 924e033b..b4993eae 100644
--- a/catalog/music/models.py
+++ b/catalog/music/models.py
@@ -23,6 +23,7 @@ class Album(Item):
         'company',
         'track_list',
         'brief',
+        'bandcamp_album_id',
     ]
     release_date = jsondata.DateField(_('发行日期'), auto_now=False, auto_now_add=False, null=True, blank=True)
     duration = jsondata.IntegerField(_("时长"), null=True, blank=True)
@@ -33,4 +34,5 @@ class Album(Item):
     other_title = jsondata.CharField(blank=True, default='', max_length=500)
     album_type = jsondata.CharField(blank=True, default='', max_length=500)
     media = jsondata.CharField(blank=True, default='', max_length=500)
+    bandcamp_album_id = jsondata.CharField(blank=True, default='', max_length=500)
     disc_count = jsondata.IntegerField(blank=True, default='', max_length=500)
diff --git a/catalog/music/tests.py b/catalog/music/tests.py
index a171acb7..cbf044c8 100644
--- a/catalog/music/tests.py
+++ b/catalog/music/tests.py
@@ -59,3 +59,31 @@ class MultiMusicSitesTestCase(TestCase):
         p1 = SiteManager.get_site_by_url(url1).get_resource_ready()
         p2 = SiteManager.get_site_by_url(url2).get_resource_ready()
         self.assertEqual(p1.item.id, p2.item.id)
+
+
+class BandcampTestCase(TestCase):
+    def test_parse(self):
+        t_id_type = IdType.Bandcamp
+        t_id_value = 'intlanthem.bandcamp.com/album/in-these-times'
+        t_url = 'https://intlanthem.bandcamp.com/album/in-these-times?from=hpbcw'
+        t_url2 = 'https://intlanthem.bandcamp.com/album/in-these-times'
+        site = SiteManager.get_site_by_id_type(t_id_type)
+        self.assertIsNotNone(site)
+        self.assertEqual(site.validate_url(t_url), True)
+        site = SiteManager.get_site_by_url(t_url)
+        self.assertEqual(site.url, t_url2)
+        self.assertEqual(site.id_value, t_id_value)
+
+    def test_unmatched_url_returns_none(self):
+        self.assertIsNone(SiteManager.get_site_by_url('https://example.invalid/not-a-known-site'))
+
+    # @use_local_response
+    def test_scrape(self):
+        t_url = 'https://intlanthem.bandcamp.com/album/in-these-times?from=hpbcw'
+        site = SiteManager.get_site_by_url(t_url)
+        self.assertEqual(site.ready, False)
+        site.get_resource_ready()
+        self.assertEqual(site.ready, True)
+        self.assertEqual(site.resource.metadata['title'], 'In These Times')
+        self.assertEqual(site.resource.metadata['artist'], ['Makaya McCraven'])
+        self.assertIsInstance(site.resource.item, Album)
diff --git a/catalog/sites/__init__.py b/catalog/sites/__init__.py
index fbce5de5..a05e9c9c 100644
--- a/catalog/sites/__init__.py
+++ b/catalog/sites/__init__.py
@@ -12,4 +12,5 @@ from .imdb import IMDB
 from .spotify import Spotify
 from .igdb import IGDB
 from .steam import Steam
+from .bandcamp import Bandcamp
 from .bangumi import Bangumi
diff --git a/catalog/sites/bandcamp.py b/catalog/sites/bandcamp.py
new file mode 100644
index 00000000..394dafa8
--- /dev/null
+++ b/catalog/sites/bandcamp.py
@@ -0,0 +1,109 @@
+from catalog.common import *
+from catalog.models import *
+import logging
+import urllib.parse
+import dateparser
+import dns.resolver
+import re
+import json
+
+
+_logger = logging.getLogger(__name__)
+
+
+@SiteManager.register
+class Bandcamp(AbstractSite):
+    """Bandcamp album pages, on *.bandcamp.com or on a custom domain whose DNS points at Bandcamp."""
+
+    SITE_NAME = SiteName.Bandcamp
+    ID_TYPE = IdType.Bandcamp
+    URL_PATTERNS = [
+        r"https://([a-z0-9\-]+\.bandcamp\.com/album/[^?#/]+)"
+    ]
+    URL_PATTERN_FALLBACK = r"https://([a-z0-9\-\.]+/album/[^?#/]+)"
+    WIKI_PROPERTY_ID = ''
+    DEFAULT_MODEL = Album
+
+    @classmethod
+    def id_to_url(self, id_value):
+        """Build the album URL from an id of the form ``<host>/album/<slug>``."""
+        return f"https://{id_value}"
+
+    @classmethod
+    def validate_url_fallback(self, url):
+        """Return True if ``url`` is shaped like an album page and its host resolves to Bandcamp.
+
+        The host is looked up via DNS and compared against the CNAME target and
+        the A record hard-coded below. Lookup failures are logged and treated as
+        "not Bandcamp" so that URL detection never raises.
+        """
+        if re.match(self.URL_PATTERN_FALLBACK, url) is None:
+            return False
+        parsed_url = urllib.parse.urlparse(url)
+        hostname = parsed_url.netloc
+        try:
+            answers = dns.resolver.query(hostname, 'CNAME')
+            for rdata in answers:
+                if str(rdata.target) == 'dom.bandcamp.com.':
+                    return True
+        except Exception as e:
+            _logger.debug(f'CNAME lookup failed for {hostname}: {e}')
+        try:
+            answers = dns.resolver.query(hostname, 'A')
+            for rdata in answers:
+                if str(rdata.address) == '35.241.62.186':
+                    return True
+        except Exception as e:
+            _logger.debug(f'A lookup failed for {hostname}: {e}')
+        return False
+
+    def scrape(self):
+        """Download the album page and return its metadata (plus cover image, if any) as ResourceContent.
+
+        Raises ValueError when the page has no album title or artist.
+        """
+        content = BasicDownloader(self.url).download().html()
+        try:
+            title = content.xpath("//h2[@class='trackTitle']/text()")[0].strip()
+            artist = [content.xpath("//div[@id='name-section']/h3/span/a/text()")[0].strip()]
+        except IndexError:
+            raise ValueError("given url contains no valid info")
+
+        genre = []  # TODO: parse tags
+        track_list = []
+        release_nodes = content.xpath("//div[@class='tralbumData tralbum-credits']/text()")
+        # dateparser.parse() returns None for text it cannot interpret
+        release_dt = dateparser.parse(re.sub(r'releas\w+ ', '', release_nodes[0].strip())) if release_nodes else None
+        release_date = release_dt.strftime('%Y-%m-%d') if release_dt else None
+        duration = None
+        company = None
+        brief_nodes = content.xpath("//div[@class='tralbumData tralbum-about']/text()")
+        brief = "".join(brief_nodes) if brief_nodes else None
+        # the cover link may be absent; the cover is optional (see the check before the download below)
+        cover_nodes = content.xpath("//div[@id='tralbumArt']/a/@href")
+        cover_url = cover_nodes[0].strip() if cover_nodes else None
+        bandcamp_page_data = json.loads(content.xpath(
+            "//meta[@name='bc-page-properties']/@content")[0].strip())
+        bandcamp_album_id = bandcamp_page_data['item_id']
+
+        data = {
+            'title': title,
+            'artist': artist,
+            'genre': genre,
+            'track_list': track_list,
+            'release_date': release_date,
+            'duration': duration,
+            'company': company,
+            'brief': brief,
+            'bandcamp_album_id': bandcamp_album_id,
+            'cover_image_url': cover_url,
+        }
+        pd = ResourceContent(metadata=data)
+        if data["cover_image_url"]:
+            imgdl = BasicImageDownloader(data["cover_image_url"], self.url)
+            try:
+                pd.cover_image = imgdl.download().content
+                pd.cover_image_extention = imgdl.extention
+            except Exception:
+                _logger.debug(f'failed to download cover for {self.url} from {data["cover_image_url"]}')
+        return pd