lib.itmens/common/scraper.py

import requests
import functools
import random
import logging
import re
import dateparser
import datetime
import filetype
import dns.resolver
import urllib.parse
from lxml import html
from django.core.files.uploadedfile import SimpleUploadedFile
from common.models import SourceSiteEnum
from django.conf import settings
from django.core.exceptions import ValidationError

RE_NUMBERS = re.compile(r"\d+\d*")
RE_WHITESPACES = re.compile(r"\s+")

DEFAULT_REQUEST_HEADERS = {
    'Host': '',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:70.0) Gecko/20100101 Firefox/70.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    # `br` is omitted because depending on the brotli library is a hassle
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'DNT': '1',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'no-cache',
}

# luminati proxy port; the account credentials come from settings
PORT = 22225
logger = logging.getLogger(__name__)

# registry of all implemented scrapers, keyed as {host: scraper_class}
scraper_registry = {}


def get_normalized_url(raw_url):
    url = re.sub(r'//m.douban.com/(\w+)/', r'//\1.douban.com/', raw_url)
    return url


def log_url(func):
    """
    Catch exceptions raised by the decorated function, log them, then re-raise.

    The first positional argument (excluding cls/self) of the decorated function
    must be the url.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # log the url and stack trace
            logger.error(f"Scrape Failed URL: {args[1]}\n{e}")
            if settings.DEBUG:
                logger.error("Exceptions during scraping:", exc_info=e)
            raise e
    return wrapper


def parse_date(raw_str):
    return dateparser.parse(
        raw_str,
        settings={
            "RELATIVE_BASE": datetime.datetime(1900, 1, 1)
        }
    )
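
# Illustrative behaviour (assumption, not part of the original source):
#   parse_date("2021-10-06")  # -> datetime.datetime(2021, 10, 6, 0, 0)
# RELATIVE_BASE anchors relative or incomplete inputs to 1900-01-01 instead of "now".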
2020-05-12 14:05:12 +08:00
2021-12-09 22:00:09 -05:00
2020-11-22 14:11:59 +01:00
class AbstractScraper:
    """
    Base class for entity scrapers. "Entities" are the models defined in
    models.py, such as Book, Movie, etc.
    """

    # subclasses must specify the following class variables
    # site_name: the general site, e.g. amazon/douban
    site_name = None
    # host: the hostname (a str, or a list of str)
    host = None
    # corresponding data class
    data_class = None
    # corresponding form class
    form_class = None
    # regex used to extract the effective url
    regex = None

    # scraped raw image
    raw_img = None
    # scraped raw data
    raw_data = {}

    def __init_subclass__(cls, **kwargs):
        # validate and register every concrete subclass at definition time
        super().__init_subclass__(**kwargs)
        assert cls.site_name is not None, "class variable `site_name` must be specified"
        assert bool(cls.host), "class variable `host` must be specified"
        assert cls.data_class is not None, "class variable `data_class` must be specified"
        assert cls.form_class is not None, "class variable `form_class` must be specified"
        assert cls.regex is not None, "class variable `regex` must be specified"
        assert isinstance(cls.host, str) or (isinstance(cls.host, list) and isinstance(
            cls.host[0], str)), "`host` must be type str or list"
        assert cls.site_name in SourceSiteEnum, "`site_name` must be one of `SourceSiteEnum` values"
        assert hasattr(cls, 'scrape') and callable(
            cls.scrape), "scraper must have method `.scrape()`"

        # decorate the scrape method so failures are logged with their url
        cls.scrape = classmethod(log_url(cls.scrape))

        # register the scraper under its host(s)
        if isinstance(cls.host, list):
            for host in cls.host:
                scraper_registry[host] = cls
        else:
            scraper_registry[cls.host] = cls

    def scrape(self, url):
        """
        Scrape data specified by the model schema from the given url and return it.

        Subclass implementations of this method are decorated as classmethods
        by `__init_subclass__`.
        Return (data_dict, image).
        Should also set `raw_data` and `raw_img`.
        """
        raise NotImplementedError("Subclass should implement this method")

    @classmethod
    def get_effective_url(cls, raw_url):
        """
        The return value should be identical to the `source_url` saved in the DB.
        """
        url = cls.regex.findall(raw_url.replace('http:', 'https:'))  # force all http to be https
        if not url:
            raise ValueError("not a valid url")
        return url[0]

    @classmethod
    def download_page(cls, url, headers):
        url = cls.get_effective_url(url)

        session_id = random.random()
        proxy_url = ('http://%s-country-cn-session-%s:%s@zproxy.lum-superproxy.io:%d' %
                     (settings.LUMINATI_USERNAME, session_id, settings.LUMINATI_PASSWORD, PORT))
        proxies = {
            'http': proxy_url,
            'https': proxy_url,
        }
        if settings.LUMINATI_USERNAME is None:
            proxies = None

        r = requests.get(url, proxies=proxies,
                         headers=headers, timeout=settings.SCRAPING_TIMEOUT)

        if r.status_code != 200:
            raise RuntimeError(f"download page failed, status code {r.status_code}")
        # with open('temp.html', 'w', encoding='utf-8') as fp:
        #     fp.write(r.content.decode('utf-8'))
        return html.fromstring(r.content.decode('utf-8'))
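
    # Illustrative usage (assumption, not in the original source): the return
    # value is an lxml HTML element tree, so a subclass's scrape() can query it
    # with XPath, e.g.
    #   content = cls.download_page(url, DEFAULT_REQUEST_HEADERS)
    #   title = content.xpath("//h1/text()")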

    @classmethod
    def download_image(cls, url, item_url=None):
        if url is None:
            return None, None
        raw_img = None
        session_id = random.random()
        proxy_url = ('http://%s-country-cn-session-%s:%s@zproxy.lum-superproxy.io:%d' %
                     (settings.LUMINATI_USERNAME, session_id, settings.LUMINATI_PASSWORD, PORT))
        proxies = {
            'http': proxy_url,
            'https': proxy_url,
        }
        if settings.LUMINATI_USERNAME is None:
            proxies = None
        if url:
            img_response = requests.get(
                url,
                headers={
                    'accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
                    'accept-encoding': 'gzip, deflate',
                    'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,fr-FR;q=0.6,fr;q=0.5,zh-TW;q=0.4',
                    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36 Edg/81.0.416.72',
                    'cache-control': 'no-cache',
                    'dnt': '1',
                },
                proxies=proxies,
                timeout=settings.SCRAPING_TIMEOUT,
            )
            if img_response.status_code == 200:
                raw_img = img_response.content
                content_type = img_response.headers.get('Content-Type')
                ext = filetype.get_type(mime=content_type.partition(';')[0].strip()).extension
            else:
                ext = None
        return raw_img, ext

    @classmethod
    def save(cls, request_user):
        entity_cover = {
            'cover': SimpleUploadedFile('temp.' + cls.img_ext, cls.raw_img)
        } if cls.img_ext is not None else None
        form = cls.form_class(cls.raw_data, entity_cover)
        if form.is_valid():
            form.instance.last_editor = request_user
            form.save()
            cls.instance = form.instance
        else:
            logger.error(str(form.errors))
            raise ValidationError("Form invalid.")
        return form
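
# A minimal sketch of a concrete scraper, for illustration only (the names
# below are hypothetical; the real implementations live in common/scrapers/
# and are imported below so they register themselves):
#
#   class ExampleBookScraper(AbstractScraper):
#       site_name = SourceSiteEnum.DOUBAN.value    # any SourceSiteEnum value
#       host = 'book.example.com'
#       data_class = Book                          # the model class
#       form_class = BookForm                      # its ModelForm
#       regex = re.compile(r"https://book\.example\.com/subject/\d+/")
#
#       def scrape(cls, url):                      # decorated into a classmethod
#           content = cls.download_page(url, DEFAULT_REQUEST_HEADERS)
#           data = {'title': content.xpath("//h1/text()")[0]}
#           raw_img, ext = cls.download_image(content.xpath("//img/@src")[0], url)
#           cls.raw_data, cls.raw_img, cls.img_ext = data, raw_img, ext
#           return data, raw_img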

# import the concrete scrapers at the bottom so they register themselves
# in `scraper_registry` via AbstractScraper.__init_subclass__
from common.scrapers.bandcamp import BandcampAlbumScraper
from common.scrapers.goodreads import GoodreadsScraper
from common.scrapers.google import GoogleBooksScraper
from common.scrapers.tmdb import TmdbMovieScraper
from common.scrapers.steam import SteamGameScraper
from common.scrapers.imdb import ImdbMovieScraper
from common.scrapers.spotify import SpotifyAlbumScraper, SpotifyTrackScraper
from common.scrapers.douban import DoubanAlbumScraper, DoubanBookScraper, DoubanGameScraper, DoubanMovieScraper
from common.scrapers.bangumi import BangumiScraper


def get_scraper_by_url(url):
    parsed_url = urllib.parse.urlparse(url)
    hostname = parsed_url.netloc
    for host in scraper_registry:
        if host == hostname:
            return scraper_registry[host]
    # TODO move this logic to scraper class
    try:
        answers = dns.resolver.query(hostname, 'CNAME')
        for rdata in answers:
            if str(rdata.target) == 'dom.bandcamp.com.':
                return BandcampAlbumScraper
    except Exception:
        pass
    try:
        answers = dns.resolver.query(hostname, 'A')
        for rdata in answers:
            if str(rdata.address) == '35.241.62.186':
                return BandcampAlbumScraper
    except Exception:
        pass
    return None
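
# Illustrative end-to-end flow (assumption, not part of the original module):
#   scraper = get_scraper_by_url(url)
#   if scraper is not None:
#       effective_url = scraper.get_effective_url(url)
#       data, img = scraper.scrape(effective_url)  # also sets raw_data / raw_img
#       scraper.save(request_user)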