diff --git a/common/management/commands/scrape.py b/common/management/commands/scrape.py
index 000218c9..d48898e6 100644
--- a/common/management/commands/scrape.py
+++ b/common/management/commands/scrape.py
@@ -1,5 +1,5 @@
 from django.core.management.base import BaseCommand
-from common.scraper import scraper_registry
+from common.scraper import get_scraper_by_url, get_normalized_url
 import pprint
 
 
@@ -11,17 +11,13 @@ class Command(BaseCommand):
 
     def handle(self, *args, **options):
         url = str(options['url'])
-        matched_host = None
-        for host in scraper_registry:
-            if host in url:
-                matched_host = host
-                break
+        url = get_normalized_url(url)
+        scraper = get_scraper_by_url(url)
 
-        if matched_host is None:
+        if scraper is None:
             self.stdout.write(self.style.ERROR(f'Unable to match a scraper for {url}'))
             return
 
-        scraper = scraper_registry[matched_host]
         effective_url = scraper.get_effective_url(url)
         self.stdout.write(f'Fetching {effective_url} via {scraper.__name__}')
         data, img = scraper.scrape(effective_url)
diff --git a/common/scraper.py b/common/scraper.py
index 53e26f45..47e2b906 100644
--- a/common/scraper.py
+++ b/common/scraper.py
@@ -6,6 +6,8 @@ import re
 import dateparser
 import datetime
 import filetype
+import dns.resolver
+import urllib.parse
 from lxml import html
 from django.core.files.uploadedfile import SimpleUploadedFile
 from common.models import SourceSiteEnum
@@ -42,6 +44,11 @@ logger = logging.getLogger(__name__)
 scraper_registry = {}
 
 
+def get_normalized_url(raw_url):
+    url = re.sub(r'//m\.douban\.com/(\w+)/', r'//\1.douban.com/', raw_url)
+    return url
+
+
 def log_url(func):
     """
     Catch exceptions and log then pass the exceptions.
@@ -218,3 +225,27 @@ from common.scrapers.imdb import ImdbMovieScraper
 from common.scrapers.spotify import SpotifyAlbumScraper, SpotifyTrackScraper
 from common.scrapers.douban import DoubanAlbumScraper, DoubanBookScraper, DoubanGameScraper, DoubanMovieScraper
 from common.scrapers.bangumi import BangumiScraper
+
+
+def get_scraper_by_url(url):
+    parsed_url = urllib.parse.urlparse(url)
+    hostname = parsed_url.netloc
+    for host in scraper_registry:
+        if host == hostname:
+            return scraper_registry[host]
+    # TODO move this logic to scraper class
+    try:
+        answers = dns.resolver.query(hostname, 'CNAME')
+        for rdata in answers:
+            if str(rdata.target) == 'dom.bandcamp.com.':
+                return BandcampAlbumScraper
+    except Exception as e:
+        pass
+    try:
+        answers = dns.resolver.query(hostname, 'A')
+        for rdata in answers:
+            if str(rdata.address) == '35.241.62.186':
+                return BandcampAlbumScraper
+    except Exception as e:
+        pass
+    return None
diff --git a/common/scrapers/bandcamp.py b/common/scrapers/bandcamp.py
index 0457c77a..ab187cf9 100644
--- a/common/scrapers/bandcamp.py
+++ b/common/scrapers/bandcamp.py
@@ -15,7 +15,7 @@ class BandcampAlbumScraper(AbstractScraper):
    data_class = Album
    form_class = AlbumForm
 
-    regex = re.compile(r"https://[\w-]+\.bandcamp\.com/album/[^?#]+")
+    regex = re.compile(r"https://[a-zA-Z0-9\-\.]+/album/[^?#]+")
 
     def scrape(self, url, response=None):
         effective_url = self.get_effective_url(url)
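Note (illustration only, not part of the patch): the loosened BandcampAlbumScraper.regex above accepts custom artist domains in addition to *.bandcamp.com subdomains, since the hostname is no longer anchored to bandcamp.com. Both URLs below are hypothetical:

    >>> import re
    >>> regex = re.compile(r"https://[a-zA-Z0-9\-\.]+/album/[^?#]+")
    >>> bool(regex.match('https://artist.bandcamp.com/album/demo'))
    True
    >>> bool(regex.match('https://music.example.com/album/demo'))
    True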
diff --git a/common/views.py b/common/views.py
index ce813fba..885be39c 100644
--- a/common/views.py
+++ b/common/views.py
@@ -19,7 +19,7 @@ from mastodon.decorators import mastodon_request_included
 from users.views import home as user_home
 from common.models import MarkStatusEnum
 from common.utils import PageLinksGenerator
-from common.scraper import scraper_registry
+from common.scraper import get_scraper_by_url, get_normalized_url
 from common.config import *
 from common.searcher import ExternalSources
 from management.models import Announcement
@@ -383,18 +383,12 @@ def jump_or_scrape(request, url):
     if this_site in url:
         return redirect(url)
 
-    # match url to registerd sites
-    matched_host = None
-    for host in scraper_registry:
-        if host in url:
-            matched_host = host
-            break
-
-    if matched_host is None:
+    url = get_normalized_url(url)
+    scraper = get_scraper_by_url(url)
+    if scraper is None:
         # invalid url
         return render(request, 'common/error.html', {'msg': _("链接非法,查询失败")})
     else:
-        scraper = scraper_registry[matched_host]
         try:
             # raise ObjectDoesNotExist
             effective_url = scraper.get_effective_url(url)
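To see how get_scraper_by_url falls back to DNS for Bandcamp custom domains, here is a minimal offline sketch (not a test included in the patch), assuming a configured Django environment with dnspython installed. The domain music.example.com is hypothetical, and dns.resolver.query is mocked so no network lookup happens:

    from unittest import mock

    from common.scraper import get_scraper_by_url, get_normalized_url
    from common.scrapers.bandcamp import BandcampAlbumScraper

    class FakeCname:
        def __init__(self, target):
            self.target = target

    # music.example.com is not a key in scraper_registry, so the CNAME
    # fallback runs; the fake answer points at Bandcamp's frontend,
    # which selects BandcampAlbumScraper.
    with mock.patch('dns.resolver.query', return_value=[FakeCname('dom.bandcamp.com.')]):
        url = get_normalized_url('https://music.example.com/album/demo')
        assert get_scraper_by_url(url) is BandcampAlbumScraper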