lib.itmens/common/scraper.py

import requests
import functools
import random
import logging
import re
import dateparser
import datetime
import filetype
import dns.resolver
import urllib.parse
from lxml import html
from django.core.files.uploadedfile import SimpleUploadedFile
from common.models import SourceSiteEnum
from django.conf import settings
from django.core.exceptions import ValidationError

RE_NUMBERS = re.compile(r"\d+\d*")
RE_WHITESPACES = re.compile(r"\s+")

DEFAULT_REQUEST_HEADERS = {
    'Host': '',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:70.0) Gecko/20100101 Firefox/70.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    # `br` is omitted because depending on the brotli library is a hassle
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'DNT': '1',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'no-cache',
}

# luminati proxy port; the account credentials come from settings
PORT = 22225
logger = logging.getLogger(__name__)

# registry of all implemented scrapers, keyed as {host: scraper_class}
scraper_registry = {}


def get_normalized_url(raw_url):
    url = re.sub(r'//m.douban.com/(\w+)/', r'//\1.douban.com/', raw_url)
    return url


def log_url(func):
    """
    Catch exceptions raised by the decorated function, log them, then re-raise.

    The first positional argument (excluding cls/self) of the decorated function
    must be the url.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # log the url and stack trace
            logger.error(f"Scrape Failed URL: {args[1]}\n{e}")
            if settings.DEBUG:
                logger.error("Exceptions during scraping:", exc_info=e)
            raise e
    return wrapper


def parse_date(raw_str):
    return dateparser.parse(
        raw_str,
        settings={
            "RELATIVE_BASE": datetime.datetime(1900, 1, 1)
        }
    )
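
# Illustrative behaviour (assumption, not part of the original source):
#   parse_date("2021-10-06")  # -> datetime.datetime(2021, 10, 6, 0, 0)
# RELATIVE_BASE anchors relative or incomplete inputs to 1900-01-01 instead of "now".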
2020-05-12 14:05:12 +08:00
2021-12-09 22:00:09 -05:00
2020-11-22 14:11:59 +01:00
class AbstractScraper:
    """
    Base class for entity scrapers. "Entities" are the models defined in
    models.py, such as Book, Movie, etc.
    """

    # subclasses must specify the following class variables
    # site_name: the general site, e.g. amazon/douban
    site_name = None
    # host: the hostname (a str, or a list of str)
    host = None
    # corresponding data class
    data_class = None
    # corresponding form class
    form_class = None
    # regex used to extract the effective url
    regex = None

    # scraped raw image
    raw_img = None
    # scraped raw data
    raw_data = {}

    def __init_subclass__(cls, **kwargs):
        # validate and register every concrete subclass at definition time
        super().__init_subclass__(**kwargs)
        assert cls.site_name is not None, "class variable `site_name` must be specified"
        assert bool(cls.host), "class variable `host` must be specified"
        assert cls.data_class is not None, "class variable `data_class` must be specified"
        assert cls.form_class is not None, "class variable `form_class` must be specified"
        assert cls.regex is not None, "class variable `regex` must be specified"
        assert isinstance(cls.host, str) or (isinstance(cls.host, list) and isinstance(
            cls.host[0], str)), "`host` must be type str or list"
        assert cls.site_name in SourceSiteEnum, "`site_name` must be one of `SourceSiteEnum` values"
        assert hasattr(cls, 'scrape') and callable(
            cls.scrape), "scraper must have method `.scrape()`"

        # decorate the scrape method so failures are logged with their url
        cls.scrape = classmethod(log_url(cls.scrape))

        # register the scraper under its host(s)
        if isinstance(cls.host, list):
            for host in cls.host:
                scraper_registry[host] = cls
        else:
            scraper_registry[cls.host] = cls

    def scrape(self, url):
        """
        Scrape data specified by the model schema from the given url and return it.

        Subclass implementations of this method are decorated as classmethods
        by `__init_subclass__`.
        Return (data_dict, image).
        Should also set `raw_data` and `raw_img`.
        """
        raise NotImplementedError("Subclass should implement this method")

    @classmethod
    def get_effective_url(cls, raw_url):
        """
        The return value should be identical to the `source_url` saved in the DB.
        """
        url = cls.regex.findall(raw_url.replace('http:', 'https:'))  # force all http to be https
        if not url:
            raise ValueError("not a valid url")
        return url[0]

    @classmethod
    def download_page(cls, url, headers):
        url = cls.get_effective_url(url)

        session_id = random.random()
        proxy_url = ('http://%s-country-cn-session-%s:%s@zproxy.lum-superproxy.io:%d' %
                     (settings.LUMINATI_USERNAME, session_id, settings.LUMINATI_PASSWORD, PORT))
        proxies = {
            'http': proxy_url,
            'https': proxy_url,
        }
        if settings.LUMINATI_USERNAME is None:
            proxies = None

        r = requests.get(url, proxies=proxies,
                         headers=headers, timeout=settings.SCRAPING_TIMEOUT)

        if r.status_code != 200:
            raise RuntimeError(f"download page failed, status code {r.status_code}")
        # with open('temp.html', 'w', encoding='utf-8') as fp:
        #     fp.write(r.content.decode('utf-8'))
        return html.fromstring(r.content.decode('utf-8'))
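
    # Illustrative usage (assumption, not in the original source): the return
    # value is an lxml HTML element tree, so a subclass's scrape() can query it
    # with XPath, e.g.
    #   content = cls.download_page(url, DEFAULT_REQUEST_HEADERS)
    #   title = content.xpath("//h1/text()")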

    @classmethod
    def download_image(cls, url, item_url=None):
        if url is None:
            return None, None
        raw_img = None
        session_id = random.random()
        proxy_url = ('http://%s-country-cn-session-%s:%s@zproxy.lum-superproxy.io:%d' %
                     (settings.LUMINATI_USERNAME, session_id, settings.LUMINATI_PASSWORD, PORT))
        proxies = {
            'http': proxy_url,
            'https': proxy_url,
        }
        if settings.LUMINATI_USERNAME is None:
            proxies = None
        if url:
            img_response = requests.get(
                url,
                headers={
                    'accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
                    'accept-encoding': 'gzip, deflate',
                    'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,fr-FR;q=0.6,fr;q=0.5,zh-TW;q=0.4',
                    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36 Edg/81.0.416.72',
                    'cache-control': 'no-cache',
                    'dnt': '1',
                },
                proxies=proxies,
                timeout=settings.SCRAPING_TIMEOUT,
            )
            if img_response.status_code == 200:
                raw_img = img_response.content
                content_type = img_response.headers.get('Content-Type')
                ext = filetype.get_type(mime=content_type.partition(';')[0].strip()).extension
            else:
                ext = None
        return raw_img, ext

    @classmethod
    def save(cls, request_user):
        entity_cover = {
            'cover': SimpleUploadedFile('temp.' + cls.img_ext, cls.raw_img)
        } if cls.img_ext is not None else None
        form = cls.form_class(cls.raw_data, entity_cover)
        if form.is_valid():
            form.instance.last_editor = request_user
            form.save()
            cls.instance = form.instance
        else:
            logger.error(str(form.errors))
            raise ValidationError("Form invalid.")
        return form
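
# A minimal sketch of a concrete scraper, for illustration only (the names
# below are hypothetical; the real implementations live in common/scrapers/
# and are imported below so they register themselves):
#
#   class ExampleBookScraper(AbstractScraper):
#       site_name = SourceSiteEnum.DOUBAN.value    # any SourceSiteEnum value
#       host = 'book.example.com'
#       data_class = Book                          # the model class
#       form_class = BookForm                      # its ModelForm
#       regex = re.compile(r"https://book\.example\.com/subject/\d+/")
#
#       def scrape(cls, url):                      # decorated into a classmethod
#           content = cls.download_page(url, DEFAULT_REQUEST_HEADERS)
#           data = {'title': content.xpath("//h1/text()")[0]}
#           raw_img, ext = cls.download_image(content.xpath("//img/@src")[0], url)
#           cls.raw_data, cls.raw_img, cls.img_ext = data, raw_img, ext
#           return data, raw_img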

# import the concrete scrapers at the bottom so they register themselves
# in `scraper_registry` via AbstractScraper.__init_subclass__
from common.scrapers.bandcamp import BandcampAlbumScraper
from common.scrapers.goodreads import GoodreadsScraper
from common.scrapers.google import GoogleBooksScraper
from common.scrapers.tmdb import TmdbMovieScraper
from common.scrapers.steam import SteamGameScraper
from common.scrapers.imdb import ImdbMovieScraper
from common.scrapers.spotify import SpotifyAlbumScraper, SpotifyTrackScraper
from common.scrapers.douban import DoubanAlbumScraper, DoubanBookScraper, DoubanGameScraper, DoubanMovieScraper
from common.scrapers.bangumi import BangumiScraper


def get_scraper_by_url(url):
    parsed_url = urllib.parse.urlparse(url)
    hostname = parsed_url.netloc
    for host in scraper_registry:
        if host == hostname:
            return scraper_registry[host]
    # TODO move this logic to scraper class
    try:
        answers = dns.resolver.query(hostname, 'CNAME')
        for rdata in answers:
            if str(rdata.target) == 'dom.bandcamp.com.':
                return BandcampAlbumScraper
    except Exception:
        pass
    try:
        answers = dns.resolver.query(hostname, 'A')
        for rdata in answers:
            if str(rdata.address) == '35.241.62.186':
                return BandcampAlbumScraper
    except Exception:
        pass
    return None
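
# Illustrative end-to-end flow (assumption, not part of the original module):
#   scraper = get_scraper_by_url(url)
#   if scraper is not None:
#       effective_url = scraper.get_effective_url(url)
#       data, img = scraper.scrape(effective_url)  # also sets raw_data / raw_img
#       scraper.save(request_user)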