# catalog/sites/bandcamp.py
import json
import logging
import re
import urllib.parse
import dateparser
import dns.resolver
import httpx
from loguru import logger
from lxml import html
from catalog.common import *
from catalog.models import *
2024-07-13 00:16:47 -04:00
from common.models.lang import detect_language
# Module-level stdlib logger; not referenced in this file's visible code
# (search errors below go through loguru's `logger` instead).
_logger = logging.getLogger(__name__)
@SiteManager.register
class Bandcamp(AbstractSite):
    """Site adapter for Bandcamp album pages.

    Handles both canonical ``*.bandcamp.com/album/...`` URLs and albums hosted
    on artists' custom domains (recognized via DNS in
    :meth:`validate_url_fallback`).
    """

    SITE_NAME = SiteName.Bandcamp
    ID_TYPE = IdType.Bandcamp
    # Dots escaped so the pattern only matches real ``*.bandcamp.com`` hosts
    # (the original unescaped ``.`` matched any character).
    URL_PATTERNS = [r"https://([a-z0-9\-]+\.bandcamp\.com/album/[^?#/]+).*"]
    # Any-host album URL; actual Bandcamp ownership is verified via DNS below.
    URL_PATTERN_FALLBACK = r"https://([a-z0-9\-\.]+/album/[^?#/]+).*"

    WIKI_PROPERTY_ID = ""
    DEFAULT_MODEL = Album

    @classmethod
    def id_to_url(cls, id_value):
        # The stored id is "<host>/album/<slug>", so the URL is just the scheme prefix.
        return f"https://{id_value}"

    @classmethod
    def validate_url_fallback(cls, url):
        """Return True if *url* is an album page on a Bandcamp custom domain.

        A host is accepted when it has a CNAME to ``dom.bandcamp.com.`` or an
        A record pointing at Bandcamp's known IP. Any DNS failure is treated
        as "not Bandcamp" (best-effort, never raises).
        """
        if re.match(cls.URL_PATTERN_FALLBACK, url) is None:
            return False
        hostname = urllib.parse.urlparse(url).netloc
        try:
            # dnspython >= 2.0: resolve() replaces the deprecated query().
            for rdata in dns.resolver.resolve(hostname, "CNAME"):
                if str(rdata.target) == "dom.bandcamp.com.":  # type:ignore
                    return True
        except Exception:
            pass  # no CNAME or lookup failed; fall through to the A-record check
        try:
            for rdata in dns.resolver.resolve(hostname, "A"):
                if str(rdata.address) == "35.241.62.186":  # type:ignore
                    return True
        except Exception:
            pass
        return False

    def scrape(self):
        """Download the album page and extract metadata.

        Returns:
            ResourceContent: metadata dict with localized title/description,
            artist, release date, cover URL and Bandcamp's numeric album id.

        Raises:
            ValueError: when the page lacks the expected title/artist nodes.
        """
        content = BasicDownloader2(self.url).download().html()
        try:
            title = self.query_str(content, "//h2[@class='trackTitle']/text()")
            artist = [
                self.query_str(content, "//div[@id='name-section']/h3/span/a/text()")
            ]
        except IndexError:
            raise ValueError("given url contains no valid info")
        genre = []  # TODO: parse tags
        track_list = ""
        # Release date is best-effort: the credits node reads like
        # "released April 1, 2020" / "releases ..." — strip the verb, then
        # let dateparser handle the free-form remainder.
        try:
            release_str = re.sub(
                r"releas\w+ ",
                "",
                self.query_str(
                    content, "//div[@class='tralbumData tralbum-credits']/text()"
                ),
            )
            release_datetime = dateparser.parse(release_str) if release_str else None
            release_date = (
                release_datetime.strftime("%Y-%m-%d") if release_datetime else None
            )
        except Exception:
            release_date = None
        duration = None
        company = None
        brief_nodes = content.xpath("//div[@class='tralbumData tralbum-about']/text()")
        brief = "".join(brief_nodes) if brief_nodes else ""  # type:ignore
        cover_url = self.query_str(content, "//div[@id='tralbumArt']/a/@href")
        # Bandcamp embeds page metadata (incl. numeric item id) as JSON in a meta tag.
        bandcamp_page_data = json.loads(
            self.query_str(content, "//meta[@name='bc-page-properties']/@content")
        )
        bandcamp_album_id = bandcamp_page_data["item_id"]
        localized_title = [{"lang": detect_language(title), "text": title}]
        localized_desc = (
            [{"lang": detect_language(brief), "text": brief}] if brief else []
        )
        data = {
            "localized_title": localized_title,
            "localized_description": localized_desc,
            "title": title,
            "artist": artist,
            "genre": genre,
            "track_list": track_list,
            "release_date": release_date,
            "duration": duration,
            "company": company,
            "brief": brief,
            "bandcamp_album_id": bandcamp_album_id,
            "cover_image_url": cover_url,
        }
        pd = ResourceContent(metadata=data)
        return pd

    @classmethod
    async def search_task(
        cls, q: str, page: int, category: str
    ) -> list[ExternalSearchResultItem]:
        """Search Bandcamp albums for *q*, mapping our 5-per-page paging onto
        Bandcamp's 18-per-page results.

        Only the ``music`` category is supported; other categories return [].
        Network/parse errors are logged and yield an empty (or partial) list.
        """
        if category != "music":
            return []
        SEARCH_PAGE_SIZE = 5
        # Map our page number onto Bandcamp's 18-item pages: fetch page `p`,
        # then slice `SEARCH_PAGE_SIZE` items starting at `offset`.
        # NOTE: a window straddling two Bandcamp pages is truncated, since
        # only one remote page is fetched.
        p = (page - 1) * SEARCH_PAGE_SIZE // 18 + 1
        offset = (page - 1) * SEARCH_PAGE_SIZE % 18
        results = []
        search_url = f"https://bandcamp.com/search?from=results&item_type=a&page={p}&q={urllib.parse.quote_plus(q)}"
        async with httpx.AsyncClient() as client:
            try:
                r = await client.get(search_url, timeout=2)
                h = html.fromstring(r.content.decode("utf-8"))
                albums = h.xpath('//li[@class="searchresult data-search"]')
                for c in albums:  # type:ignore
                    el_cover = c.xpath('.//div[@class="art"]/img/@src')
                    cover = el_cover[0] if el_cover else ""
                    el_title = c.xpath('.//div[@class="heading"]//text()')
                    title = "".join(el_title).strip() if el_title else "Unknown Title"
                    # was '..//div[...]': that climbs to the parent <ul> and
                    # matches every result's URL, so each item got the first
                    # result's link; '.' scopes the lookup to this <li>.
                    el_url = c.xpath('.//div[@class="itemurl"]/a/@href')
                    url = el_url[0] if el_url else ""
                    el_authors = c.xpath('.//div[@class="subhead"]//text()')
                    subtitle = ", ".join(el_authors) if el_authors else ""
                    results.append(
                        ExternalSearchResultItem(
                            ItemCategory.Music,
                            SiteName.Bandcamp,
                            url,
                            title,
                            subtitle,
                            "",
                            cover,
                        )
                    )
            except Exception as e:
                # NOTE(review): loguru's logger does not consume stdlib-style
                # `extra=`; these fields may be silently dropped — verify.
                logger.error(
                    "Bandcamp search error", extra={"query": q, "exception": e}
                )
        return results[offset : offset + SEARCH_PAGE_SIZE]