Bandcamp scrape and preview;
add `manage.py scrape <url>`; make ^C work when DEBUG
This commit is contained in:
parent
a6e24d6303
commit
c5ad660184
11 changed files with 142 additions and 10 deletions
|
@ -154,9 +154,9 @@ class MultiSelect(forms.SelectMultiple):
|
|||
|
||||
class Media:
|
||||
css = {
|
||||
'all': ('lib/css/multiple-select.min.css',)
|
||||
'all': ('https://cdn.jsdelivr.net/npm/multiple-select@1.5.2/dist/multiple-select.min.css',)
|
||||
}
|
||||
js = ('lib/js/multiple-select.min.js',)
|
||||
js = ('https://cdn.jsdelivr.net/npm/multiple-select@1.5.2/dist/multiple-select.min.js',)
|
||||
|
||||
|
||||
class HstoreField(forms.CharField):
|
||||
|
|
29
common/management/commands/scrape.py
Normal file
29
common/management/commands/scrape.py
Normal file
|
@ -0,0 +1,29 @@
|
|||
from django.core.management.base import BaseCommand
|
||||
from common.scraper import scraper_registry
|
||||
import pprint
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Management command that scrapes a single URL and prints the result.

    The scraped data is only pretty-printed to the console; this command
    does not save anything.
    """

    help = 'Scrape an item from URL (but not save it)'

    def add_arguments(self, parser):
        """Register the single positional ``url`` argument."""
        parser.add_argument('url', type=str, help='URL to scrape')

    def handle(self, *args, **options):
        """Select a scraper for the URL, run it, and pretty-print the data.

        A scraper is selected when its registry key occurs as a substring of
        the URL; the first matching key in registry iteration order wins.
        An error line is written (and nothing is fetched) when no scraper
        matches or when the matched scraper cannot derive an effective URL.
        """
        url = str(options['url'])

        # First registry key that appears anywhere inside the URL, else None.
        matched_host = next(
            (host for host in scraper_registry if host in url), None)
        if matched_host is None:
            self.stdout.write(self.style.ERROR(f'Unable to match a scraper for {url}'))
            return

        scraper = scraper_registry[matched_host]
        effective_url = scraper.get_effective_url(url)
        if effective_url is None:
            # The host matched but the URL is not a page this scraper
            # understands; stop here instead of announcing "Fetching None".
            self.stdout.write(self.style.ERROR(
                f'{scraper.__name__} does not recognize {url}'))
            return

        self.stdout.write(f'Fetching {effective_url} via {scraper.__name__}')
        # The second element (raw image) is not needed for a preview.
        data, _img = scraper.scrape(effective_url)
        self.stdout.write(self.style.SUCCESS('Done.'))
        pprint.pp(data)
|
|
@ -27,6 +27,7 @@ class SourceSiteEnum(models.TextChoices):
|
|||
GOODREADS = "goodreads", _("goodreads")
|
||||
TMDB = "tmdb", _("The Movie Database")
|
||||
GOOGLEBOOKS = "googlebooks", _("Google Books")
|
||||
BANDCAMP = "bandcamp", _("BandCamp")
|
||||
|
||||
|
||||
class Entity(models.Model):
|
||||
|
|
|
@ -1854,3 +1854,5 @@ class GoogleBooksScraper(AbstractScraper):
|
|||
|
||||
self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
|
||||
return data, raw_img
|
||||
|
||||
from common.scrapers.bandcamp import BandcampAlbumScraper
|
71
common/scrapers/bandcamp.py
Normal file
71
common/scrapers/bandcamp.py
Normal file
|
@ -0,0 +1,71 @@
|
|||
import re
|
||||
import dateparser
|
||||
import json
|
||||
from lxml import html
|
||||
from common.models import SourceSiteEnum
|
||||
from common.scraper import AbstractScraper
|
||||
from music.models import Album
|
||||
from music.forms import AlbumForm
|
||||
|
||||
|
||||
class BandcampAlbumScraper(AbstractScraper):
    """Scraper for album pages of the form ``https://<name>.bandcamp.com/album/<slug>``.

    Album metadata is read from the page HTML via XPath; the Bandcamp item id
    found in the ``bc-page-properties`` meta tag is stored in ``other_info``.
    """

    site_name = SourceSiteEnum.BANDCAMP.value
    # URL fragment identifying Bandcamp-hosted pages. This is not an API
    # endpoint: pages are fetched and parsed as HTML.
    host = '.bandcamp.com/'
    data_class = Album
    form_class = AlbumForm

    # Matches the album URL up to (not including) any query string or fragment.
    regex = re.compile(r"https://[\w-]+\.bandcamp\.com/album/[^?#]+")

    def scrape(self, url, response=None):
        """Extract album data and cover image from a Bandcamp album page.

        NOTE(review): the ``scrape`` management command invokes this on the
        scraper class rather than an instance; presumably ``AbstractScraper``
        rebinds it accordingly -- confirm.

        Args:
            url: Address of the album page.
            response: Optional already-fetched response; when given, its
                ``content`` is decoded as UTF-8 and parsed instead of
                downloading the page.

        Returns:
            A ``(data, raw_img)`` tuple: the dict of scraped fields and the
            downloaded cover image.

        Raises:
            ValueError: If ``url`` is not a Bandcamp album URL, or the page
                lacks the title, artist, cover art or page-properties
                elements.
        """
        effective_url = self.get_effective_url(url)
        if effective_url is None:
            raise ValueError("not valid url")

        if response is not None:
            content = html.fromstring(response.content.decode('utf-8'))
        else:
            content = self.download_page(url, {})

        try:
            title = content.xpath("//h2[@class='trackTitle']/text()")[0].strip()
            artist = [content.xpath("//div[@id='name-section']/h3/span/a/text()")[0].strip()]
        except IndexError as err:
            raise ValueError("given url contains no valid info") from err

        genre = []  # TODO: parse tags
        track_list = []

        # The first credits text node carries the release date after a
        # leading "releas..." word, which is removed before date parsing.
        release_nodes = content.xpath("//div[@class='tralbumData tralbum-credits']/text()")
        if release_nodes:
            release_date = dateparser.parse(
                re.sub(r'releas\w+ ', '', release_nodes[0].strip()))
        else:
            release_date = None

        duration = None
        company = None

        brief_nodes = content.xpath("//div[@class='tralbumData tralbum-about']/text()")
        brief = "".join(brief_nodes) if brief_nodes else None

        # Report a page without cover art or page properties the same way as
        # one without title/artist, instead of leaking a bare IndexError.
        try:
            cover_url = content.xpath("//div[@id='tralbumArt']/a/@href")[0].strip()
            page_properties = content.xpath(
                "//meta[@name='bc-page-properties']/@content")[0].strip()
        except IndexError as err:
            raise ValueError("given url contains no valid info") from err

        bandcamp_page_data = json.loads(page_properties)
        other_info = {'bandcamp_album_id': bandcamp_page_data['item_id']}

        raw_img, ext = self.download_image(cover_url, url)

        data = {
            'title': title,
            'artist': artist,
            'genre': genre,
            'track_list': track_list,
            'release_date': release_date,
            'duration': duration,
            'company': company,
            'brief': brief,
            'other_info': other_info,
            'source_site': self.site_name,
            'source_url': effective_url,
            'cover_url': cover_url,
        }

        self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
        return data, raw_img

    @classmethod
    def get_effective_url(cls, raw_url):
        """Return the canonical album URL found in ``raw_url``, or None.

        The result is the first match of ``cls.regex``, i.e. the URL with any
        query string or fragment dropped.
        """
        match = cls.regex.search(raw_url)
        return match.group(0) if match else None
|
|
@ -1143,6 +1143,17 @@ select::placeholder {
|
|||
border-color: #4285F4;
|
||||
}
|
||||
|
||||
.source-label.source-label__bandcamp {
|
||||
color: white;
|
||||
background-color: #28A0C1;
|
||||
display: inline-block;
|
||||
}
|
||||
|
||||
.source-label.source-label__bandcamp span {
|
||||
display: inline-block;
|
||||
margin: 0 4px;
|
||||
}
|
||||
|
||||
.main-section-wrapper {
|
||||
padding: 32px 48px 32px 36px;
|
||||
background-color: #f7f7f7;
|
||||
|
|
2
common/static/css/boofilsic.min.css
vendored
2
common/static/css/boofilsic.min.css
vendored
File diff suppressed because one or more lines are too long
|
@ -15,6 +15,8 @@ $goodreads-color-primary: #372213
|
|||
$goodreads-color-secondary: #F4F1EA
|
||||
$tmdb-color-primary: #91CCA3
|
||||
$tmdb-color-secondary: #1FB4E2
|
||||
$bandcamp-color-primary: #28A0C1
|
||||
$bandcamp-color-secondary: white
|
||||
|
||||
.source-label
|
||||
display: inline
|
||||
|
@ -78,4 +80,13 @@ $tmdb-color-secondary: #1FB4E2
|
|||
&.source-label__googlebooks
|
||||
color: white
|
||||
background-color: #4285F4
|
||||
border-color: #4285F4
|
||||
border-color: #4285F4
|
||||
&.source-label__bandcamp
|
||||
color: $bandcamp-color-secondary
|
||||
background-color: $bandcamp-color-primary
|
||||
// transform: skewX(-30deg)
|
||||
display: inline-block
|
||||
&.source-label__bandcamp span
|
||||
// transform: skewX(30deg)
|
||||
display: inline-block
|
||||
margin: 0 4px
|
||||
|
|
|
@ -49,7 +49,12 @@ class Album(Entity):
|
|||
return self.title
|
||||
|
||||
def get_embed_link(self):
    """Return the embeddable player URL for this album, or None.

    Spotify albums map to the ``open.spotify.com/embed/`` form of their
    source URL. Bandcamp albums use the EmbeddedPlayer URL built from the
    ``bandcamp_album_id`` entry of ``other_info``. Any other source site,
    or a Bandcamp album without a stored id, yields None.
    """
    if self.source_site == SourceSiteEnum.SPOTIFY.value:
        return self.source_url.replace("open.spotify.com/", "open.spotify.com/embed/")
    if self.source_site == SourceSiteEnum.BANDCAMP.value:
        # assumes other_info is a mapping or None -- TODO confirm field type.
        # A missing id yields None rather than raising while a page renders.
        album_id = (self.other_info or {}).get('bandcamp_album_id')
        if album_id is None:
            return None
        return f"https://bandcamp.com/EmbeddedPlayer/album={album_id}/size=large/bgcol=ffffff/linkcol=19A2CA/artwork=small/transparent=true/"
    return None
|
||||
|
||||
def get_absolute_url(self):
|
||||
return reverse("music:retrieve_album", args=[self.id])
|
||||
|
|
|
@ -350,7 +350,7 @@
|
|||
{% endif %}
|
||||
</div>
|
||||
|
||||
{% if album.source_site == "spotify" %}
|
||||
{% if album.source_site == "spotify" or album.source_site == "bandcamp" %}
|
||||
<iframe src="{{ album.get_embed_link }}" height="320" frameborder="0" allowtransparency="true" allow="encrypted-media"></iframe>
|
||||
{% endif %}
|
||||
</div>
|
||||
|
|
10
sync/jobs.py
10
sync/jobs.py
|
@ -394,7 +394,9 @@ sync_task_manager = SyncTaskManger()
|
|||
|
||||
# sync_task_manager.start()
|
||||
|
||||
signal.signal(signal.SIGTERM, sync_task_manager.stop)
|
||||
if sys.platform.startswith('linux'):
|
||||
signal.signal(signal.SIGHUP, sync_task_manager.stop)
|
||||
signal.signal(signal.SIGINT, sync_task_manager.stop)
|
||||
if not settings.DEBUG:
|
||||
# TODO: it seems this prevents ^C from working properly
|
||||
signal.signal(signal.SIGTERM, sync_task_manager.stop)
|
||||
if sys.platform.startswith('linux'):
|
||||
signal.signal(signal.SIGHUP, sync_task_manager.stop)
|
||||
signal.signal(signal.SIGINT, sync_task_manager.stop)
|
||||
|
|
Loading…
Add table
Reference in a new issue