bandcamp scrape and preview ;

manage.py scrape <url> ;
make ^C work when DEBUG
This commit is contained in:
Your Name 2021-11-26 23:41:47 -05:00
parent a6e24d6303
commit c5ad660184
11 changed files with 142 additions and 10 deletions

View file

@ -154,9 +154,9 @@ class MultiSelect(forms.SelectMultiple):
class Media:
css = {
'all': ('lib/css/multiple-select.min.css',)
'all': ('https://cdn.jsdelivr.net/npm/multiple-select@1.5.2/dist/multiple-select.min.css',)
}
js = ('lib/js/multiple-select.min.js',)
js = ('https://cdn.jsdelivr.net/npm/multiple-select@1.5.2/dist/multiple-select.min.js',)
class HstoreField(forms.CharField):

View file

@ -0,0 +1,29 @@
from django.core.management.base import BaseCommand
from common.scraper import scraper_registry
import pprint
class Command(BaseCommand):
    """Management command: scrape a single URL and pretty-print the scraped data.

    The command itself only writes to stdout; it does not save the result.
    """

    help = 'Scrape an item from URL (but not save it)'

    def add_arguments(self, parser):
        """Register the single positional ``url`` argument."""
        parser.add_argument('url', type=str, help='URL to scrape')

    def handle(self, *args, **options):
        """Select a scraper for the URL, run it and print the resulting data.

        A scraper is selected when its registry key occurs as a substring of
        the URL; the first matching key wins. An error line is written (and
        nothing is fetched) when no scraper matches or when the matched
        scraper cannot derive an effective URL from the input.
        """
        url = str(options['url'])

        # First registry key contained in the URL, or None when nothing matches.
        matched_host = next((host for host in scraper_registry if host in url), None)
        if matched_host is None:
            self.stdout.write(self.style.ERROR(f'Unable to match a scraper for {url}'))
            return

        scraper = scraper_registry[matched_host]
        effective_url = scraper.get_effective_url(url)
        if effective_url is None:
            # The host matched but the URL is not in a form the scraper
            # supports; stop here rather than handing None to scrape().
            self.stdout.write(self.style.ERROR(
                f'{scraper.__name__} does not recognize {url}'))
            return

        self.stdout.write(f'Fetching {effective_url} via {scraper.__name__}')
        # The image payload is not needed for a preview; only the data is shown.
        data, _img = scraper.scrape(effective_url)
        self.stdout.write(self.style.SUCCESS('Done.'))
        pprint.pp(data)

View file

@ -27,6 +27,7 @@ class SourceSiteEnum(models.TextChoices):
GOODREADS = "goodreads", _("goodreads")
TMDB = "tmdb", _("The Movie Database")
GOOGLEBOOKS = "googlebooks", _("Google Books")
BANDCAMP = "bandcamp", _("BandCamp")
class Entity(models.Model):

View file

@ -1854,3 +1854,5 @@ class GoogleBooksScraper(AbstractScraper):
self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
return data, raw_img
from common.scrapers.bandcamp import BandcampAlbumScraper

View file

@ -0,0 +1,71 @@
import re
import dateparser
import json
from lxml import html
from common.models import SourceSiteEnum
from common.scraper import AbstractScraper
from music.models import Album
from music.forms import AlbumForm
class BandcampAlbumScraper(AbstractScraper):
    """Scraper for Bandcamp album pages (``https://<name>.bandcamp.com/album/<slug>``)."""

    site_name = SourceSiteEnum.BANDCAMP.value
    # Host fragment identifying Bandcamp URLs
    # (presumably the scraper registry key -- confirm in AbstractScraper).
    host = '.bandcamp.com/'
    data_class = Album
    form_class = AlbumForm

    # Matches the album URL up to, but not including, any query string or fragment.
    regex = re.compile(r"https://[\w-]+\.bandcamp\.com/album/[^?#]+")

    def scrape(self, url, response=None):
        """Scrape album metadata and cover art from a Bandcamp album page.

        Args:
            url: Album page URL; must contain a match for ``regex``.
            response: Optional already-fetched response whose ``content`` bytes
                are decoded as UTF-8 and parsed. When omitted, the page is
                obtained via ``self.download_page``.

        Returns:
            A ``(data, raw_img)`` tuple: the dict of scraped fields and the
            image payload returned by ``self.download_image``.

        Raises:
            ValueError: If ``url`` is not a recognised album URL, or the page
                lacks the title, artist, cover art or page-properties nodes.

        Also stores the results on ``self.raw_data``, ``self.raw_img`` and
        ``self.img_ext``.
        """
        effective_url = self.get_effective_url(url)
        if effective_url is None:
            raise ValueError("not valid url")

        if response is not None:
            content = html.fromstring(response.content.decode('utf-8'))
        else:
            content = self.download_page(url, {})

        try:
            title = content.xpath("//h2[@class='trackTitle']/text()")[0].strip()
            artist = [content.xpath("//div[@id='name-section']/h3/span/a/text()")[0].strip()]
        except IndexError as err:
            raise ValueError("given url contains no valid info") from err

        genre = []  # TODO: parse tags
        track_list = []

        # The first credits text node holds the release line; strip the leading
        # "releas..." word so that only the date text is handed to dateparser.
        release_nodes = content.xpath("//div[@class='tralbumData tralbum-credits']/text()")
        release_date = dateparser.parse(re.sub(r'releas\w+ ', '', release_nodes[0].strip())) if release_nodes else None

        duration = None
        company = None

        brief_nodes = content.xpath("//div[@class='tralbumData tralbum-about']/text()")
        brief = "".join(brief_nodes) if brief_nodes else None

        # Cover art and the page-properties meta tag are required further down;
        # report their absence the same way as a missing title or artist
        # instead of leaking a bare IndexError.
        try:
            cover_url = content.xpath("//div[@id='tralbumArt']/a/@href")[0].strip()
            page_properties = content.xpath(
                "//meta[@name='bc-page-properties']/@content")[0].strip()
        except IndexError as err:
            raise ValueError("given url contains no valid info") from err

        bandcamp_page_data = json.loads(page_properties)
        other_info = {'bandcamp_album_id': bandcamp_page_data['item_id']}

        raw_img, ext = self.download_image(cover_url, url)

        data = {
            'title': title,
            'artist': artist,
            'genre': genre,
            'track_list': track_list,
            'release_date': release_date,
            'duration': duration,
            'company': company,
            'brief': brief,
            'other_info': other_info,
            'source_site': self.site_name,
            'source_url': effective_url,
            'cover_url': cover_url,
        }
        self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
        return data, raw_img

    @classmethod
    def get_effective_url(cls, raw_url):
        """Return the album URL found in ``raw_url``, or ``None`` if there is none.

        The result is the first match of ``regex``, so any query string or
        fragment is dropped. ``None`` and empty input yield ``None``.
        """
        if not raw_url:
            return None
        match = cls.regex.search(raw_url)
        return match.group(0) if match else None

View file

@ -1143,6 +1143,17 @@ select::placeholder {
border-color: #4285F4;
}
/* Bandcamp source label: white text on a #28A0C1 background. */
.source-label.source-label__bandcamp {
color: white;
background-color: #28A0C1;
display: inline-block;
}
/* Inner span of the Bandcamp label: 4px margin on the left and right. */
.source-label.source-label__bandcamp span {
display: inline-block;
margin: 0 4px;
}
.main-section-wrapper {
padding: 32px 48px 32px 36px;
background-color: #f7f7f7;

File diff suppressed because one or more lines are too long

View file

@ -15,6 +15,8 @@ $goodreads-color-primary: #372213
$goodreads-color-secondary: #F4F1EA
$tmdb-color-primary: #91CCA3
$tmdb-color-secondary: #1FB4E2
$bandcamp-color-primary: #28A0C1
$bandcamp-color-secondary: white
.source-label
display: inline
@ -78,4 +80,13 @@ $tmdb-color-secondary: #1FB4E2
&.source-label__googlebooks
color: white
background-color: #4285F4
border-color: #4285F4
border-color: #4285F4
// Bandcamp source label: secondary colour for text, primary colour for background.
&.source-label__bandcamp
color: $bandcamp-color-secondary
background-color: $bandcamp-color-primary
// transform: skewX(-30deg)
display: inline-block
// Inner span of the Bandcamp label: 4px margin on the left and right.
&.source-label__bandcamp span
// transform: skewX(30deg)
display: inline-block
margin: 0 4px

View file

@ -49,7 +49,12 @@ class Album(Entity):
return self.title
def get_embed_link(self):
    """Return the embeddable player URL for this album, or ``None``.

    Spotify albums use the ``open.spotify.com/embed/`` form of their source
    URL. Bandcamp albums use the EmbeddedPlayer URL built from the album id
    stored under ``other_info['bandcamp_album_id']``. Any other source site,
    or a Bandcamp album without a stored id, yields ``None``.
    """
    if self.source_site == SourceSiteEnum.SPOTIFY.value:
        return self.source_url.replace("open.spotify.com/", "open.spotify.com/embed/")
    if self.source_site == SourceSiteEnum.BANDCAMP.value:
        # other_info may be empty or lack the id; without an id there is no
        # player to embed, so return None instead of raising KeyError.
        album_id = (self.other_info or {}).get('bandcamp_album_id')
        if album_id is None:
            return None
        return f"https://bandcamp.com/EmbeddedPlayer/album={album_id}/size=large/bgcol=ffffff/linkcol=19A2CA/artwork=small/transparent=true/"
    return None
def get_absolute_url(self):
return reverse("music:retrieve_album", args=[self.id])

View file

@ -350,7 +350,7 @@
{% endif %}
</div>
{% if album.source_site == "spotify" %}
{% if album.source_site == "spotify" or album.source_site == "bandcamp" %}
<iframe src="{{ album.get_embed_link }}" height="320" frameborder="0" allowtransparency="true" allow="encrypted-media"></iframe>
{% endif %}
</div>

View file

@ -394,7 +394,9 @@ sync_task_manager = SyncTaskManger()
# sync_task_manager.start()
signal.signal(signal.SIGTERM, sync_task_manager.stop)
if sys.platform.startswith('linux'):
signal.signal(signal.SIGHUP, sync_task_manager.stop)
signal.signal(signal.SIGINT, sync_task_manager.stop)
if not settings.DEBUG:
# TODO: registering these handlers seems to prevent ^C from working properly, so they are skipped when DEBUG is on
signal.signal(signal.SIGTERM, sync_task_manager.stop)
if sys.platform.startswith('linux'):
signal.signal(signal.SIGHUP, sync_task_manager.stop)
signal.signal(signal.SIGINT, sync_task_manager.stop)