lib.itmens/catalog/sites/bandcamp.py

102 lines
3.3 KiB
Python
Raw Normal View History

2022-12-16 08:34:33 -05:00
from catalog.common import *
from catalog.models import *
import logging
import urllib.parse
import dateparser
import re
import json
# Module-level logger, named after this module via __name__.
_logger = logging.getLogger(__name__)
@SiteManager.register
class Bandcamp(AbstractSite):
    """Site adapter that recognises Bandcamp album URLs and scrapes them.

    The site-specific ID is the URL without its scheme, i.e. the
    ``host/album/slug`` part captured by the URL patterns below;
    ``id_to_url`` turns it back into a URL by prefixing ``https://``.
    """

    SITE_NAME = SiteName.Bandcamp
    ID_TYPE = IdType.Bandcamp
    # Dots are escaped so that only real ``*.bandcamp.com`` hosts match; an
    # unescaped "." would also accept hosts such as ``evil-bandcamp.com``.
    URL_PATTERNS = [r"https://([a-z0-9\-]+\.bandcamp\.com/album/[^?#/]+)"]
    # Looser pattern for albums served from a custom domain; a match is only
    # trusted after the DNS check in ``validate_url_fallback``.
    URL_PATTERN_FALLBACK = r"https://([a-z0-9\-\.]+/album/[^?#/]+)"
    WIKI_PROPERTY_ID = ""
    DEFAULT_MODEL = Album

    @classmethod
    def id_to_url(cls, id_value):
        """Return the album URL for ``id_value`` (a ``host/album/slug`` string)."""
        return f"https://{id_value}"

    @classmethod
    def validate_url_fallback(cls, url):
        """Return True if ``url`` looks like a Bandcamp album on a custom domain.

        The URL must match ``URL_PATTERN_FALLBACK`` and its host must either
        have a CNAME record equal to ``dom.bandcamp.com.`` or an A record
        equal to ``35.241.62.186``. Any lookup failure counts as "no match".

        Returns:
            bool: True when one of the DNS checks succeeds, otherwise False.
        """
        if re.match(cls.URL_PATTERN_FALLBACK, url) is None:
            return False
        hostname = urllib.parse.urlparse(url).netloc
        # (record type, rdata attribute to read, expected string value).
        # presumably these identify Bandcamp's custom-domain endpoint -- TODO confirm
        dns_checks = (
            ("CNAME", "target", "dom.bandcamp.com."),
            ("A", "address", "35.241.62.186"),
        )
        for record_type, attr, expected in dns_checks:
            try:
                # NOTE(review): ``dns`` is not among this module's visible
                # imports; unless a star import provides it, this raises
                # NameError and the fallback can never succeed. Confirm that
                # ``dns.resolver`` is actually in scope.
                answers = dns.resolver.query(hostname, record_type)
                if any(str(getattr(rdata, attr)) == expected for rdata in answers):
                    return True
            except Exception:
                # Best effort: a failed lookup must not abort URL validation,
                # but record why it failed instead of hiding it completely.
                _logger.debug(
                    "%s lookup for %s failed", record_type, hostname, exc_info=True
                )
        return False

    def scrape(self):
        """Download the album page at ``self.url`` and extract its metadata.

        Returns:
            ResourceContent: built from the scraped metadata dict, with
            ``cover_image`` / ``cover_image_extention`` set when the cover
            could be downloaded.

        Raises:
            ValueError: if the page has no album title or artist name.
        """
        content = BasicDownloader(self.url).download().html()
        try:
            title = content.xpath("//h2[@class='trackTitle']/text()")[0].strip()
            artist = [
                content.xpath("//div[@id='name-section']/h3/span/a/text()")[0].strip()
            ]
        except IndexError as err:
            raise ValueError("given url contains no valid info") from err

        # The first credits text node is expected to read like
        # "released <date>"; strip the leading word and parse the rest.
        release_date = None
        release_nodes = content.xpath(
            "//div[@class='tralbumData tralbum-credits']/text()"
        )
        if release_nodes:
            parsed_date = dateparser.parse(
                re.sub(r"releas\w+ ", "", release_nodes[0].strip())
            )
            # dateparser.parse returns None for text it cannot interpret.
            if parsed_date is not None:
                release_date = parsed_date.strftime("%Y-%m-%d")

        brief_nodes = content.xpath("//div[@class='tralbumData tralbum-about']/text()")
        brief = "".join(brief_nodes) if brief_nodes else None

        # A page without a cover link yields no cover instead of an IndexError;
        # the download step below already tolerates an empty cover URL.
        cover_nodes = content.xpath("//div[@id='tralbumArt']/a/@href")
        cover_url = cover_nodes[0].strip() if cover_nodes else None

        # The page embeds a JSON blob in a <meta> tag; "item_id" is stored as
        # the Bandcamp album id. Still raises if the tag or key is missing.
        bandcamp_page_data = json.loads(
            content.xpath("//meta[@name='bc-page-properties']/@content")[0].strip()
        )
        bandcamp_album_id = bandcamp_page_data["item_id"]

        data = {
            "title": title,
            "artist": artist,
            "genre": [],  # TODO: parse tags
            "track_list": "",
            "release_date": release_date,
            "duration": None,
            "company": None,
            "brief": brief,
            "bandcamp_album_id": bandcamp_album_id,
            "cover_image_url": cover_url,
        }
        resource = ResourceContent(metadata=data)
        if cover_url:
            imgdl = BasicImageDownloader(cover_url, self.url)
            try:
                resource.cover_image = imgdl.download().content
                resource.cover_image_extention = imgdl.extention
            except Exception:
                # Cover art is optional: keep the metadata even if it fails.
                _logger.debug(
                    "failed to download cover for %s from %s", self.url, cover_url
                )
        return resource