supports m.douban.com and customized bandcamp domain
parent 93968ed2d3
commit 5635b06e73
4 changed files with 40 additions and 19 deletions
@@ -1,5 +1,5 @@
 from django.core.management.base import BaseCommand
-from common.scraper import scraper_registry
+from common.scraper import get_scraper_by_url, get_normalized_url
 import pprint
 
@@ -11,17 +11,13 @@ class Command(BaseCommand):
 
     def handle(self, *args, **options):
         url = str(options['url'])
-        matched_host = None
-        for host in scraper_registry:
-            if host in url:
-                matched_host = host
-                break
+        url = get_normalized_url(url)
+        scraper = get_scraper_by_url(url)
 
-        if matched_host is None:
+        if scraper is None:
             self.stdout.write(self.style.ERROR(f'Unable to match a scraper for {url}'))
             return
 
-        scraper = scraper_registry[matched_host]
         effective_url = scraper.get_effective_url(url)
         self.stdout.write(f'Fetching {effective_url} via {scraper.__name__}')
         data, img = scraper.scrape(effective_url)
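
A side effect worth noting: the old `if host in url` substring check could match a registered host anywhere in the URL string, including a query parameter, while the new flow compares against the parsed hostname exactly. A minimal illustration of the difference (registry contents and URL are invented for the example):

import urllib.parse

scraper_registry = {'music.douban.com': object()}  # stand-in for the real registry

url = 'https://example.com/?ref=music.douban.com'

# old check: substring match hits the query string -> false positive
print(any(host in url for host in scraper_registry))  # True

# new check: exact hostname comparison -> no false positive
hostname = urllib.parse.urlparse(url).netloc
print(any(host == hostname for host in scraper_registry))  # False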
@@ -6,6 +6,8 @@ import re
 import dateparser
 import datetime
 import filetype
+import dns.resolver
+import urllib.parse
 from lxml import html
 from django.core.files.uploadedfile import SimpleUploadedFile
 from common.models import SourceSiteEnum
 
@@ -42,6 +44,11 @@ logger = logging.getLogger(__name__)
 scraper_registry = {}
 
 
+def get_normalized_url(raw_url):
+    url = re.sub(r'//m.douban.com/(\w+)/', r'//\1.douban.com/', raw_url)
+    return url
+
+
 def log_url(func):
     """
     Catch exceptions and log then pass the exceptions.
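
The new get_normalized_url maps mobile Douban URLs onto the desktop subdomains the existing scrapers are registered under. A standalone sketch of the same substitution (the sample URL is illustrative):

import re

def get_normalized_url(raw_url):
    # //m.douban.com/<app>/... -> //<app>.douban.com/...
    return re.sub(r'//m.douban.com/(\w+)/', r'//\1.douban.com/', raw_url)

print(get_normalized_url('https://m.douban.com/movie/subject/1234567/'))
# -> https://movie.douban.com/subject/1234567/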
@@ -218,3 +225,27 @@ from common.scrapers.imdb import ImdbMovieScraper
 from common.scrapers.spotify import SpotifyAlbumScraper, SpotifyTrackScraper
 from common.scrapers.douban import DoubanAlbumScraper, DoubanBookScraper, DoubanGameScraper, DoubanMovieScraper
 from common.scrapers.bangumi import BangumiScraper
+
+
+def get_scraper_by_url(url):
+    parsed_url = urllib.parse.urlparse(url)
+    hostname = parsed_url.netloc
+    for host in scraper_registry:
+        if host == hostname:
+            return scraper_registry[host]
+    # TODO move this logic to scraper class
+    try:
+        answers = dns.resolver.query(hostname, 'CNAME')
+        for rdata in answers:
+            if str(rdata.target) == 'dom.bandcamp.com.':
+                return BandcampAlbumScraper
+    except Exception as e:
+        pass
+    try:
+        answers = dns.resolver.query(hostname, 'A')
+        for rdata in answers:
+            if str(rdata.address) == '35.241.62.186':
+                return BandcampAlbumScraper
+    except Exception as e:
+        pass
+    return None
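
get_scraper_by_url first tries an exact hostname hit in the registry, then falls back to DNS to recognize Bandcamp pages served from customized domains, which (per the checks in this commit) are CNAMEs to dom.bandcamp.com. or A records pointing at a Bandcamp frontend IP. A self-contained sketch of that fallback with dnspython (the hostname is invented; newer dnspython versions spell dns.resolver.query as dns.resolver.resolve):

import dns.resolver

def is_custom_bandcamp_domain(hostname):
    # customized Bandcamp domains are CNAMEs to dom.bandcamp.com.
    try:
        for rdata in dns.resolver.resolve(hostname, 'CNAME'):
            if str(rdata.target) == 'dom.bandcamp.com.':
                return True
    except Exception:
        pass
    # some are bare A records pointing at Bandcamp's frontend IP
    try:
        for rdata in dns.resolver.resolve(hostname, 'A'):
            if str(rdata.address) == '35.241.62.186':
                return True
    except Exception:
        pass
    return False

print(is_custom_bandcamp_domain('music.example-label.com'))  # illustrative hostname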
@@ -15,7 +15,7 @@ class BandcampAlbumScraper(AbstractScraper):
     data_class = Album
     form_class = AlbumForm
 
-    regex = re.compile(r"https://[\w-]+\.bandcamp\.com/album/[^?#]+")
+    regex = re.compile(r"https://[a-zA-Z0-9\-\.]+/album/[^?#]+")
 
     def scrape(self, url, response=None):
         effective_url = self.get_effective_url(url)
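
With the scraper now selected by DNS lookup rather than by hostname pattern, the album regex is widened from *.bandcamp.com to any host. A quick check of what the new pattern accepts (sample URLs invented):

import re

regex = re.compile(r"https://[a-zA-Z0-9\-\.]+/album/[^?#]+")

print(bool(regex.match('https://artist.bandcamp.com/album/some-album')))  # True
print(bool(regex.match('https://music.example.com/album/some-album')))    # True (custom domain)
print(bool(regex.match('https://music.example.com/track/some-track')))    # False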
@@ -19,7 +19,7 @@ from mastodon.decorators import mastodon_request_included
 from users.views import home as user_home
 from common.models import MarkStatusEnum
 from common.utils import PageLinksGenerator
-from common.scraper import scraper_registry
+from common.scraper import get_scraper_by_url, get_normalized_url
 from common.config import *
 from common.searcher import ExternalSources
 from management.models import Announcement
 
@@ -383,18 +383,12 @@ def jump_or_scrape(request, url):
     if this_site in url:
         return redirect(url)
 
-    # match url to registered sites
-    matched_host = None
-    for host in scraper_registry:
-        if host in url:
-            matched_host = host
-            break
-
-    if matched_host is None:
+    url = get_normalized_url(url)
+    scraper = get_scraper_by_url(url)
+    if scraper is None:
         # invalid url
         return render(request, 'common/error.html', {'msg': _("链接非法,查询失败")})  # "Invalid link, query failed"
-    else:
-        scraper = scraper_registry[matched_host]
+
     try:
         # raise ObjectDoesNotExist
         effective_url = scraper.get_effective_url(url)