supports m.douban.com and customized bandcamp domain

This commit is contained in:
Your Name 2022-02-05 11:02:16 -05:00
parent 93968ed2d3
commit 5635b06e73
4 changed files with 40 additions and 19 deletions

View file

@ -1,5 +1,5 @@
from django.core.management.base import BaseCommand
from common.scraper import scraper_registry
from common.scraper import get_scraper_by_url, get_normalized_url
import pprint
@ -11,17 +11,13 @@ class Command(BaseCommand):
def handle(self, *args, **options):
url = str(options['url'])
matched_host = None
for host in scraper_registry:
if host in url:
matched_host = host
break
url = get_normalized_url(url)
scraper = get_scraper_by_url(url)
if matched_host is None:
if scraper is None:
self.stdout.write(self.style.ERROR(f'Unable to match a scraper for {url}'))
return
scraper = scraper_registry[matched_host]
effective_url = scraper.get_effective_url(url)
self.stdout.write(f'Fetching {effective_url} via {scraper.__name__}')
data, img = scraper.scrape(effective_url)

View file

@ -6,6 +6,8 @@ import re
import dateparser
import datetime
import filetype
import dns.resolver
import urllib.parse
from lxml import html
from django.core.files.uploadedfile import SimpleUploadedFile
from common.models import SourceSiteEnum
@ -42,6 +44,11 @@ logger = logging.getLogger(__name__)
scraper_registry = {}
def get_normalized_url(raw_url):
    """Rewrite a mobile douban URL (``m.douban.com/<type>/...``) to its
    desktop equivalent (``<type>.douban.com/...``).

    URLs that do not match the mobile douban pattern are returned unchanged.
    """
    # Dots are escaped so only the literal host "m.douban.com" matches;
    # the unescaped form would also rewrite lookalikes such as "mxdouban.com".
    return re.sub(r'//m\.douban\.com/(\w+)/', r'//\1.douban.com/', raw_url)
def log_url(func):
"""
Catch exceptions and log then pass the exceptions.
@ -218,3 +225,27 @@ from common.scrapers.imdb import ImdbMovieScraper
from common.scrapers.spotify import SpotifyAlbumScraper, SpotifyTrackScraper
from common.scrapers.douban import DoubanAlbumScraper, DoubanBookScraper, DoubanGameScraper, DoubanMovieScraper
from common.scrapers.bangumi import BangumiScraper
def get_scraper_by_url(url):
    """Return the scraper class registered for *url*'s hostname, or None.

    Lookup order:
    1. Exact hostname match against ``scraper_registry``.
    2. DNS heuristics for customized Bandcamp domains: a CNAME pointing at
       ``dom.bandcamp.com.`` or an A record resolving to Bandcamp's known
       IP both map to ``BandcampAlbumScraper``.

    DNS failures (NXDOMAIN, timeouts, no such record type) are treated as
    "no match" rather than raised, so this function never throws for an
    unresolvable hostname.
    """
    hostname = urllib.parse.urlparse(url).netloc
    # Direct dict lookup instead of scanning every registered host.
    if hostname in scraper_registry:
        return scraper_registry[hostname]
    # TODO move this logic to scraper class
    try:
        for rdata in dns.resolver.query(hostname, 'CNAME'):
            if str(rdata.target) == 'dom.bandcamp.com.':
                return BandcampAlbumScraper
    except Exception:
        # Best-effort: any resolver error just means "not a bandcamp CNAME".
        pass
    try:
        for rdata in dns.resolver.query(hostname, 'A'):
            if str(rdata.address) == '35.241.62.186':
                return BandcampAlbumScraper
    except Exception:
        # Best-effort: any resolver error just means "not a bandcamp IP".
        pass
    return None

View file

@ -15,7 +15,7 @@ class BandcampAlbumScraper(AbstractScraper):
data_class = Album
form_class = AlbumForm
regex = re.compile(r"https://[\w-]+\.bandcamp\.com/album/[^?#]+")
regex = re.compile(r"https://[a-zA-Z0-0\-\.]+/album/[^?#]+")
def scrape(self, url, response=None):
effective_url = self.get_effective_url(url)

View file

@ -19,7 +19,7 @@ from mastodon.decorators import mastodon_request_included
from users.views import home as user_home
from common.models import MarkStatusEnum
from common.utils import PageLinksGenerator
from common.scraper import scraper_registry
from common.scraper import get_scraper_by_url, get_normalized_url
from common.config import *
from common.searcher import ExternalSources
from management.models import Announcement
@ -383,18 +383,12 @@ def jump_or_scrape(request, url):
if this_site in url:
return redirect(url)
# match url to registered sites
matched_host = None
for host in scraper_registry:
if host in url:
matched_host = host
break
if matched_host is None:
url = get_normalized_url(url)
scraper = get_scraper_by_url(url)
if scraper is None:
# invalid url
return render(request, 'common/error.html', {'msg': _("链接非法,查询失败")})
else:
scraper = scraper_registry[matched_host]
try:
# raise ObjectDoesNotExist
effective_url = scraper.get_effective_url(url)