2023-08-10 11:27:31 -04:00
|
|
|
import json
|
2022-12-16 08:34:33 -05:00
|
|
|
import logging
|
2023-08-10 11:27:31 -04:00
|
|
|
import re
|
2022-12-16 08:34:33 -05:00
|
|
|
import urllib.parse
|
2023-08-10 11:27:31 -04:00
|
|
|
|
2022-12-16 08:34:33 -05:00
|
|
|
import dateparser
|
2023-08-11 11:55:42 -04:00
|
|
|
import dns.resolver
|
2022-12-16 08:34:33 -05:00
|
|
|
|
2023-08-10 11:27:31 -04:00
|
|
|
from catalog.common import *
|
|
|
|
from catalog.models import *
|
2022-12-16 08:34:33 -05:00
|
|
|
|
|
|
|
# Module-level logger named after this module.
_logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
@SiteManager.register
class Bandcamp(AbstractSite):
    """Site adapter that scrapes album metadata from Bandcamp album pages.

    Album URLs are recognised directly on ``<name>.bandcamp.com`` hosts via
    ``URL_PATTERNS``. Album URLs on other hosts are accepted by
    ``validate_url_fallback`` only when the host's DNS records point at
    Bandcamp.
    """

    SITE_NAME = SiteName.Bandcamp
    ID_TYPE = IdType.Bandcamp
    # The dots are escaped so that only literal ``<name>.bandcamp.com`` hosts
    # match; an unescaped ``.`` would accept any character in that position.
    URL_PATTERNS = [r"https://([a-z0-9\-]+\.bandcamp\.com/album/[^?#/]+).*"]
    # Any host with an ``/album/<slug>`` path; such URLs are only accepted
    # after the DNS check in ``validate_url_fallback``.
    URL_PATTERN_FALLBACK = r"https://([a-z0-9\-\.]+/album/[^?#/]+).*"
    WIKI_PROPERTY_ID = ""
    DEFAULT_MODEL = Album

    # DNS answers that are treated as "this custom domain is served by
    # Bandcamp" (CNAME target and A-record address respectively).
    _CNAME_TARGET = "dom.bandcamp.com."
    _A_ADDRESS = "35.241.62.186"

    @classmethod
    def id_to_url(cls, id_value):
        """Return ``https://`` followed by *id_value*.

        NOTE(review): *id_value* is presumably the ``<host>/album/<slug>``
        group captured by the URL patterns above -- confirm against callers.
        """
        return f"https://{id_value}"

    @staticmethod
    def _dns_record_matches(hostname, record_type, attribute, expected):
        """Return True if a DNS answer for *hostname* carries *expected*.

        Looks up *record_type* records for *hostname* and compares the
        stringified, lower-cased *attribute* of every answer with *expected*.
        Any failure (lookup error, missing attribute, ...) is logged at debug
        level and reported as "no match" so validation stays best-effort.
        """
        # ``resolve`` supersedes the deprecated ``query`` in dnspython 2.x;
        # fall back to ``query`` so older installations keep working.
        lookup = getattr(dns.resolver, "resolve", None) or dns.resolver.query
        try:
            answers = lookup(hostname, record_type)
            return any(
                str(getattr(rdata, attribute)).lower() == expected
                for rdata in answers  # type:ignore
            )
        except Exception as exc:
            _logger.debug(
                "%s lookup for %s failed: %s", record_type, hostname, exc
            )
            return False

    @classmethod
    def validate_url_fallback(cls, url):
        """Return True if *url* is an album URL on a Bandcamp-backed domain.

        The URL must match ``URL_PATTERN_FALLBACK``; its host is then
        accepted when it has a CNAME record pointing at ``_CNAME_TARGET`` or
        an A record equal to ``_A_ADDRESS``. DNS failures never raise; they
        simply make the check return False.
        """
        if re.match(cls.URL_PATTERN_FALLBACK, url) is None:
            return False
        hostname = urllib.parse.urlparse(url).netloc
        # The A lookup only runs when the CNAME check did not match.
        return cls._dns_record_matches(
            hostname, "CNAME", "target", cls._CNAME_TARGET
        ) or cls._dns_record_matches(
            hostname, "A", "address", cls._A_ADDRESS
        )

    def scrape(self):
        """Download ``self.url`` and build a ``ResourceContent`` for the album.

        The metadata holds title, artist, release date, description, cover
        URL and Bandcamp's ``item_id``. Genre, track list, duration and
        company are not parsed yet and are emitted as empty placeholders.
        When a cover URL is present the image is downloaded too; a failed
        download is logged and otherwise ignored.

        Raises:
            ValueError: if looking up the title or artist raises IndexError.
        """
        content = BasicDownloader(self.url).download().html()
        # NOTE(review): query_str presumably raises IndexError when the XPath
        # matches nothing (that is what is caught here) -- confirm in the
        # base class.
        try:
            title = self.query_str(content, "//h2[@class='trackTitle']/text()")
            artist = [
                self.query_str(content, "//div[@id='name-section']/h3/span/a/text()")
            ]
        except IndexError as err:
            raise ValueError("given url contains no valid info") from err

        genre = []  # TODO: parse tags
        track_list = ""
        # The release date is optional: any failure while locating or parsing
        # the credits text leaves it as None.
        try:
            # Drop the leading "releas.. " word so only the date text remains.
            release_str = re.sub(
                r"releas\w+ ",
                "",
                self.query_str(
                    content, "//div[@class='tralbumData tralbum-credits']/text()"
                ),
            )
            release_datetime = dateparser.parse(release_str) if release_str else None
            release_date = (
                release_datetime.strftime("%Y-%m-%d") if release_datetime else None
            )
        except Exception:
            release_date = None
        duration = None
        company = None
        brief_nodes = content.xpath("//div[@class='tralbumData tralbum-about']/text()")
        brief = "".join(brief_nodes) if brief_nodes else None  # type:ignore
        cover_url = self.query_str(content, "//div[@id='tralbumArt']/a/@href")
        # The page embeds a JSON blob whose "item_id" is stored as the
        # album id.
        bandcamp_page_data = json.loads(
            self.query_str(content, "//meta[@name='bc-page-properties']/@content")
        )
        bandcamp_album_id = bandcamp_page_data["item_id"]

        data = {
            "title": title,
            "artist": artist,
            "genre": genre,
            "track_list": track_list,
            "release_date": release_date,
            "duration": duration,
            "company": company,
            "brief": brief,
            "bandcamp_album_id": bandcamp_album_id,
            "cover_image_url": cover_url,
        }
        pd = ResourceContent(metadata=data)
        if data["cover_image_url"]:
            imgdl = BasicImageDownloader(data["cover_image_url"], self.url)
            try:
                pd.cover_image = imgdl.download().content
                pd.cover_image_extention = imgdl.extention
            except Exception:
                # Best effort: the metadata is still returned without a
                # cover; the traceback goes to the debug log.
                _logger.debug(
                    "failed to download cover for %s from %s",
                    self.url,
                    data["cover_image_url"],
                    exc_info=True,
                )
        return pd
|