lib.itmens/common/scraper.py
import requests
import functools
import random
import logging
import re
import dateparser
import datetime
import time
import filetype
from lxml import html
from threading import Thread
from boofilsic.settings import LUMINATI_USERNAME, LUMINATI_PASSWORD, DEBUG, IMDB_API_KEY, SCRAPERAPI_KEY
from boofilsic.settings import SPOTIFY_CREDENTIAL
from django.utils import timezone
from django.utils.translation import ugettext_lazy as _
from django.core.exceptions import ObjectDoesNotExist, ValidationError
from django.core.files.uploadedfile import SimpleUploadedFile
from common.models import SourceSiteEnum
from movies.models import Movie, MovieGenreEnum
from movies.forms import MovieForm
from books.models import Book
from books.forms import BookForm
from music.models import Album, Song
from music.forms import AlbumForm, SongForm
from games.models import Game
from games.forms import GameForm
RE_NUMBERS = re.compile(r"\d+")
RE_WHITESPACES = re.compile(r"\s+")
DEFAULT_REQUEST_HEADERS = {
'Host': '',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:70.0) Gecko/20100101 Firefox/70.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    # `br` omitted: decoding brotli responses would require an extra dependency
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
'DNT': '1',
'Upgrade-Insecure-Requests': '1',
'Cache-Control': 'no-cache',
}
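# Each scraper copies these headers and fills in its own Host, e.g.:
#   headers = DEFAULT_REQUEST_HEADERS.copy()
#   headers['Host'] = 'book.douban.com'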
# in seconds
TIMEOUT = 60
# luminati proxy port
PORT = 22225
logger = logging.getLogger(__name__)
# registry of all implemented scrapers: {host: scraper_class}
scraper_registry = {}
def log_url(func):
"""
Catch exceptions and log then pass the exceptions.
First postion argument (except cls/self) of decorated function must be the url.
"""
@functools.wraps(func)
def wrapper(*args, **kwargs):
try:
return func(*args, **kwargs)
except Exception as e:
# log the url and trace stack
logger.error(f"Scrape Failed URL: {args[1]}")
logger.error("Expections during scraping:", exc_info=e)
raise e
return wrapper
def parse_date(raw_str):
return dateparser.parse(
raw_str,
settings={
"RELATIVE_BASE": datetime.datetime(1900, 1, 1)
}
)
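# A doctest-style sketch of the RELATIVE_BASE behaviour above (exact output
# depends on the installed dateparser version): missing fields are filled
# from 1900-01-01 instead of from today's date.
#   >>> parse_date("2003")
#   datetime.datetime(2003, 1, 1, 0, 0)
#   >>> parse_date("May 2001")
#   datetime.datetime(2001, 5, 1, 0, 0)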
class AbstractScraper:
"""
Scrape entities. The entities means those defined in the models.py file,
like Book, Movie......
"""
    # subclasses must specify the class variables below
    # the general site name, e.g. amazon/douban
    site_name = None
    # technically the hostname
    host = None
# corresponding data class
data_class = None
# corresponding form class
form_class = None
# used to extract effective url
regex = None
# scraped raw image
raw_img = None
# scraped raw data
raw_data = {}
def __init_subclass__(cls, **kwargs):
        # runs once for every subclass, at class-creation time
super().__init_subclass__(**kwargs)
assert cls.site_name is not None, "class variable `site_name` must be specified"
assert bool(cls.host), "class variable `host` must be specified"
assert cls.data_class is not None, "class variable `data_class` must be specified"
assert cls.form_class is not None, "class variable `form_class` must be specified"
assert cls.regex is not None, "class variable `regex` must be specified"
assert isinstance(cls.host, str) or (isinstance(cls.host, list) and isinstance(
cls.host[0], str)), "`host` must be type str or list"
        assert cls.site_name in SourceSiteEnum, "`site_name` must be one of the `SourceSiteEnum` values"
        assert hasattr(cls, 'scrape') and callable(
            cls.scrape), "scraper must have method `.scrape()`"
# decorate the scrape method
cls.scrape = classmethod(log_url(cls.scrape))
# register scraper
if isinstance(cls.host, list):
for host in cls.host:
scraper_registry[host] = cls
else:
scraper_registry[cls.host] = cls
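    # A hypothetical minimal subclass, for illustration only; merely defining
    # it registers it, because __init_subclass__ runs at class-creation time:
    #
    #   class ExampleScraper(AbstractScraper):
    #       site_name = SourceSiteEnum.DOUBAN.value
    #       host = 'example.com'
    #       data_class = Book
    #       form_class = BookForm
    #       regex = re.compile(r"https://example\.com/item/\d+")
    #
    #       def scrape(self, url):
    #           ...
    #
    #   assert scraper_registry['example.com'] is ExampleScraper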
def scrape(self, url):
"""
Scrape/request model schema specified data from given url and return it.
Implementations of subclasses to this method would be decorated as class method.
return (data_dict, image)
Should set the `raw_data` and the `raw_img`
"""
raise NotImplementedError("Subclass should implement this method")
@classmethod
def get_effective_url(cls, raw_url):
"""
The return value should be identical with that saved in DB as `source_url`
"""
url = cls.regex.findall(raw_url)
if not url:
raise ValueError("not valid url")
return url[0]
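    # For example, with DoubanBookScraper.regex below, a search-result link
    # 'https://book.douban.com/subject/1234/?from=search' canonicalizes to
    # 'https://book.douban.com/subject/1234/' (only the matched part is kept).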
@classmethod
def download_page(cls, url, headers):
url = cls.get_effective_url(url)
session_id = random.random()
proxy_url = ('http://%s-country-cn-session-%s:%s@zproxy.lum-superproxy.io:%d' %
(LUMINATI_USERNAME, session_id, LUMINATI_PASSWORD, PORT))
proxies = {
'http': proxy_url,
'https': proxy_url,
}
# if DEBUG:
# proxies = None
r = requests.get(url, proxies=proxies,
headers=headers, timeout=TIMEOUT)
if r.status_code != 200:
raise RuntimeError(f"download page failed, status code {r.status_code}")
# with open('temp.html', 'w', encoding='utf-8') as fp:
# fp.write(r.content.decode('utf-8'))
return html.fromstring(r.content.decode('utf-8'))
@classmethod
def download_image(cls, url):
        if url is None:
            return None, None
raw_img = None
session_id = random.random()
proxy_url = ('http://%s-country-cn-session-%s:%s@zproxy.lum-superproxy.io:%d' %
(LUMINATI_USERNAME, session_id, LUMINATI_PASSWORD, PORT))
proxies = {
'http': proxy_url,
'https': proxy_url,
}
# if DEBUG:
# proxies = None
if url:
img_response = requests.get(
url,
headers={
'accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
'accept-encoding': 'gzip, deflate',
'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,fr-FR;q=0.6,fr;q=0.5,zh-TW;q=0.4',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36 Edg/81.0.416.72',
'cache-control': 'no-cache',
'dnt': '1',
},
proxies=proxies,
timeout=TIMEOUT,
)
if img_response.status_code == 200:
raw_img = img_response.content
                content_type = img_response.headers.get('Content-Type', '')
                kind = filetype.get_type(mime=content_type.partition(';')[0].strip())
                # filetype returns None for unknown mime types
                ext = kind.extension if kind else None
else:
ext = None
return raw_img, ext
@classmethod
def save(cls, request_user):
entity_cover = {
'cover': SimpleUploadedFile('temp.' + cls.img_ext, cls.raw_img)
}
form = cls.form_class(cls.raw_data, entity_cover)
if form.is_valid():
form.instance.last_editor = request_user
form.save()
cls.instance = form.instance
else:
logger.error(str(form.errors))
raise ValidationError("Form invalid.")
return form
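# A sketch of the intended call flow, with hypothetical `url` and `user`
# values; the real call sites live in the view layer:
#
#   url = 'https://book.douban.com/subject/1234/'
#   scraper = scraper_registry[url.split('/')[2]]   # keyed by hostname
#   data, raw_img = scraper.scrape(url)             # also sets raw_data/raw_img/img_ext
#   form = scraper.save(request_user=user)          # validates and persists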
class DoubanScrapperMixin:
@classmethod
def download_page(cls, url, headers):
url = cls.get_effective_url(url)
scraper_api_endpoint = f'http://api.scraperapi.com?api_key={SCRAPERAPI_KEY}&url={url}'
r = requests.get(scraper_api_endpoint, timeout=TIMEOUT)
if r.status_code != 200:
raise RuntimeError(f"download page failed, status code {r.status_code}")
# with open('temp.html', 'w', encoding='utf-8') as fp:
# fp.write(r.content.decode('utf-8'))
return html.fromstring(r.content.decode('utf-8'))
class DoubanBookScraper(DoubanScrapperMixin, AbstractScraper):
site_name = SourceSiteEnum.DOUBAN.value
host = "book.douban.com"
data_class = Book
form_class = BookForm
regex = re.compile(r"https://book\.douban\.com/subject/\d+/{0,1}")
def scrape(self, url):
headers = DEFAULT_REQUEST_HEADERS.copy()
headers['Host'] = self.host
content = self.download_page(url, headers)
# parsing starts here
try:
title = content.xpath("/html/body//h1/span/text()")[0].strip()
except IndexError:
raise ValueError("given url contains no book info")
subtitle_elem = content.xpath(
"//div[@id='info']//span[text()='副标题:']/following::text()")
subtitle = subtitle_elem[0].strip() if subtitle_elem else None
orig_title_elem = content.xpath(
"//div[@id='info']//span[text()='原作名:']/following::text()")
orig_title = orig_title_elem[0].strip() if orig_title_elem else None
language_elem = content.xpath(
"//div[@id='info']//span[text()='语言:']/following::text()")
language = language_elem[0].strip() if language_elem else None
pub_house_elem = content.xpath(
"//div[@id='info']//span[text()='出版社:']/following::text()")
pub_house = pub_house_elem[0].strip() if pub_house_elem else None
pub_date_elem = content.xpath(
"//div[@id='info']//span[text()='出版年:']/following::text()")
pub_date = pub_date_elem[0].strip() if pub_date_elem else ''
year_month_day = RE_NUMBERS.findall(pub_date)
if len(year_month_day) in (2, 3):
pub_year = int(year_month_day[0])
pub_month = int(year_month_day[1])
elif len(year_month_day) == 1:
pub_year = int(year_month_day[0])
pub_month = None
else:
pub_year = None
pub_month = None
if pub_year and pub_month and pub_year < pub_month:
pub_year, pub_month = pub_month, pub_year
        pub_year = None if pub_year is not None and pub_year not in range(
            0, 3000) else pub_year
        # note: range(1, 13) so that December is accepted
        pub_month = None if pub_month is not None and pub_month not in range(
            1, 13) else pub_month
binding_elem = content.xpath(
"//div[@id='info']//span[text()='装帧:']/following::text()")
binding = binding_elem[0].strip() if binding_elem else None
price_elem = content.xpath(
"//div[@id='info']//span[text()='定价:']/following::text()")
price = price_elem[0].strip() if price_elem else None
pages_elem = content.xpath(
"//div[@id='info']//span[text()='页数:']/following::text()")
pages = pages_elem[0].strip() if pages_elem else None
if pages is not None:
pages = int(RE_NUMBERS.findall(pages)[
0]) if RE_NUMBERS.findall(pages) else None
isbn_elem = content.xpath(
"//div[@id='info']//span[text()='ISBN:']/following::text()")
isbn = isbn_elem[0].strip() if isbn_elem else None
brief_elem = content.xpath(
"//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()")
brief = '\n'.join(p.strip()
for p in brief_elem) if brief_elem else None
contents = None
try:
contents_elem = content.xpath(
"//h2/span[text()='目录']/../following-sibling::div[1]")[0]
            # if the id of the next sibling contains `dir`, that sibling holds the full contents
if "dir" in contents_elem.getnext().xpath("@id")[0]:
contents_elem = contents_elem.getnext()
contents = '\n'.join(p.strip() for p in contents_elem.xpath(
"text()")[:-2]) if contents_elem else None
else:
contents = '\n'.join(p.strip() for p in contents_elem.xpath(
"text()")) if contents_elem else None
        except Exception:
pass
img_url_elem = content.xpath("//*[@id='mainpic']/a/img/@src")
img_url = img_url_elem[0].strip() if img_url_elem else None
raw_img, ext = self.download_image(img_url)
# there are two html formats for authors and translators
authors_elem = content.xpath("""//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/
preceding-sibling::a[preceding-sibling::span[text()='作者:']]/text()""")
if not authors_elem:
authors_elem = content.xpath(
"""//div[@id='info']//span[text()=' 作者']/following-sibling::a/text()""")
if authors_elem:
authors = []
for author in authors_elem:
authors.append(RE_WHITESPACES.sub(' ', author.strip()))
else:
authors = None
translators_elem = content.xpath("""//div[@id='info']//span[text()='译者:']/following-sibling::br[1]/
preceding-sibling::a[preceding-sibling::span[text()='译者:']]/text()""")
if not translators_elem:
translators_elem = content.xpath(
"""//div[@id='info']//span[text()=' 译者']/following-sibling::a/text()""")
if translators_elem:
translators = []
for translator in translators_elem:
translators.append(RE_WHITESPACES.sub(' ', translator.strip()))
else:
translators = None
other = {}
cncode_elem = content.xpath(
"//div[@id='info']//span[text()='统一书号:']/following::text()")
if cncode_elem:
other['统一书号'] = cncode_elem[0].strip()
series_elem = content.xpath(
"//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()")
if series_elem:
other['丛书'] = series_elem[0].strip()
imprint_elem = content.xpath(
"//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()")
if imprint_elem:
other['出品方'] = imprint_elem[0].strip()
data = {
'title': title,
'subtitle': subtitle,
'orig_title': orig_title,
'author': authors,
'translator': translators,
'language': language,
'pub_house': pub_house,
'pub_year': pub_year,
'pub_month': pub_month,
'binding': binding,
'price': price,
'pages': pages,
'isbn': isbn,
'brief': brief,
'contents': contents,
'other_info': other,
'source_site': self.site_name,
'source_url': self.get_effective_url(url),
}
self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
return data, raw_img
class DoubanMovieScraper(DoubanScrapperMixin, AbstractScraper):
site_name = SourceSiteEnum.DOUBAN.value
host = 'movie.douban.com'
data_class = Movie
form_class = MovieForm
regex = re.compile(r"https://movie\.douban\.com/subject/\d+/{0,1}")
def scrape(self, url):
headers = DEFAULT_REQUEST_HEADERS.copy()
headers['Host'] = self.host
content = self.download_page(url, headers)
# parsing starts here
try:
raw_title = content.xpath(
"//span[@property='v:itemreviewed']/text()")[0].strip()
except IndexError:
raise ValueError("given url contains no movie info")
orig_title = content.xpath(
"//img[@rel='v:image']/@alt")[0].strip()
title = raw_title.split(orig_title)[0].strip()
        # fall back to the original title when there is no Chinese title
if title == '':
title = orig_title
if title == orig_title:
orig_title = None
other_title_elem = content.xpath(
"//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]")
other_title = other_title_elem[0].strip().split(
' / ') if other_title_elem else None
imdb_elem = content.xpath(
"//div[@id='info']//span[text()='IMDb链接:']/following-sibling::a[1]/text()")
imdb_code = imdb_elem[0].strip() if imdb_elem else None
director_elem = content.xpath(
"//div[@id='info']//span[text()='导演']/following-sibling::span[1]/a/text()")
director = director_elem if director_elem else None
playwright_elem = content.xpath(
"//div[@id='info']//span[text()='编剧']/following-sibling::span[1]/a/text()")
playwright = playwright_elem if playwright_elem else None
actor_elem = content.xpath(
"//div[@id='info']//span[text()='主演']/following-sibling::span[1]/a/text()")
actor = actor_elem if actor_elem else None
        # build a genre translator mapping display labels to enum values
        genre_translator = {}
        attrs = [attr for attr in dir(MovieGenreEnum) if '__' not in attr]
        for attr in attrs:
            genre_translator[getattr(MovieGenreEnum, attr).label] = getattr(
                MovieGenreEnum, attr).value
genre_elem = content.xpath("//span[@property='v:genre']/text()")
if genre_elem:
genre = []
for g in genre_elem:
genre.append(genre_translator[g])
else:
genre = None
showtime_elem = content.xpath(
"//span[@property='v:initialReleaseDate']/text()")
if showtime_elem:
showtime = []
            for st in showtime_elem:
                # e.g. '2019-10-11(中国大陆)'; a local variable named `time`
                # would shadow the imported module, so use `release_time`
                parts = st.split('(')
                release_time = parts[0]
                region = parts[1][0:-1] if len(parts) > 1 else ''
                showtime.append({release_time: region})
else:
showtime = None
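        # the result is a list like [{'2019-10-11': '中国大陆'}, {'2019-10-12': ''}],
        # or None when no release date is listed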
site_elem = content.xpath(
"//div[@id='info']//span[text()='官方网站:']/following-sibling::a[1]/@href")
site = site_elem[0].strip() if site_elem else None
area_elem = content.xpath(
"//div[@id='info']//span[text()='制片国家/地区:']/following-sibling::text()[1]")
if area_elem:
area = [a.strip() for a in area_elem[0].split(' / ')]
else:
area = None
language_elem = content.xpath(
"//div[@id='info']//span[text()='语言:']/following-sibling::text()[1]")
if language_elem:
language = [a.strip() for a in language_elem[0].split(' / ')]
else:
language = None
year_elem = content.xpath("//span[@class='year']/text()")
year = int(year_elem[0][1:-1]) if year_elem else None
duration_elem = content.xpath("//span[@property='v:runtime']/text()")
other_duration_elem = content.xpath(
"//span[@property='v:runtime']/following-sibling::text()[1]")
if duration_elem:
duration = duration_elem[0].strip()
if other_duration_elem:
duration += other_duration_elem[0].rstrip()
else:
duration = None
season_elem = content.xpath(
"//*[@id='season']/option[@selected='selected']/text()")
if not season_elem:
season_elem = content.xpath(
"//div[@id='info']//span[text()='季数:']/following-sibling::text()[1]")
season = int(season_elem[0].strip()) if season_elem else None
else:
season = int(season_elem[0].strip())
episodes_elem = content.xpath(
"//div[@id='info']//span[text()='集数:']/following-sibling::text()[1]")
episodes = int(episodes_elem[0].strip()) if episodes_elem else None
single_episode_length_elem = content.xpath(
"//div[@id='info']//span[text()='单集片长:']/following-sibling::text()[1]")
single_episode_length = single_episode_length_elem[0].strip(
) if single_episode_length_elem else None
        # a non-empty `episodes` implies a series
        is_series = bool(episodes)
brief_elem = content.xpath("//span[@class='all hidden']")
if not brief_elem:
brief_elem = content.xpath("//span[@property='v:summary']")
brief = '\n'.join([e.strip() for e in brief_elem[0].xpath(
'./text()')]) if brief_elem else None
img_url_elem = content.xpath("//img[@rel='v:image']/@src")
img_url = img_url_elem[0].strip() if img_url_elem else None
raw_img, ext = self.download_image(img_url)
data = {
'title': title,
'orig_title': orig_title,
'other_title': other_title,
'imdb_code': imdb_code,
'director': director,
'playwright': playwright,
'actor': actor,
'genre': genre,
'showtime': showtime,
'site': site,
'area': area,
'language': language,
'year': year,
'duration': duration,
'season': season,
'episodes': episodes,
'single_episode_length': single_episode_length,
'brief': brief,
'is_series': is_series,
'source_site': self.site_name,
'source_url': self.get_effective_url(url),
}
self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
return data, raw_img
class DoubanAlbumScraper(DoubanScrapperMixin, AbstractScraper):
site_name = SourceSiteEnum.DOUBAN.value
host = 'music.douban.com'
data_class = Album
form_class = AlbumForm
regex = re.compile(r"https://music\.douban\.com/subject/\d+/{0,1}")
def scrape(self, url):
headers = DEFAULT_REQUEST_HEADERS.copy()
headers['Host'] = self.host
content = self.download_page(url, headers)
# parsing starts here
try:
title = content.xpath("//h1/span/text()")[0].strip()
except IndexError:
raise ValueError("given url contains no album info")
if not title:
raise ValueError("given url contains no album info")
artists_elem = content.xpath("""//div[@id='info']/span/span[@class='pl']/a/text()""")
artist = None if not artists_elem else artists_elem
genre_elem = content.xpath(
"//div[@id='info']//span[text()='流派:']/following::text()[1]")
genre = genre_elem[0].strip() if genre_elem else None
date_elem = content.xpath(
"//div[@id='info']//span[text()='发行时间:']/following::text()[1]")
release_date = parse_date(date_elem[0].strip()) if date_elem else None
company_elem = content.xpath(
"//div[@id='info']//span[text()='出版者:']/following::text()[1]")
company = company_elem[0].strip() if company_elem else None
track_list_elem = content.xpath(
"//div[@class='track-list']/div[@class='indent']/div/text()"
)
if track_list_elem:
track_list = '\n'.join([track.strip() for track in track_list_elem])
else:
track_list = None
brief_elem = content.xpath("//span[@class='all hidden']")
if not brief_elem:
brief_elem = content.xpath("//span[@property='v:summary']")
brief = '\n'.join([e.strip() for e in brief_elem[0].xpath(
'./text()')]) if brief_elem else None
other_info = {}
other_elem = content.xpath(
"//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]")
if other_elem:
other_info['又名'] = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='专辑类型:']/following-sibling::text()[1]")
if other_elem:
other_info['专辑类型'] = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='介质:']/following-sibling::text()[1]")
if other_elem:
other_info['介质'] = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='ISRC:']/following-sibling::text()[1]")
if other_elem:
other_info['ISRC'] = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='条形码:']/following-sibling::text()[1]")
if other_elem:
other_info['条形码'] = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='碟片数:']/following-sibling::text()[1]")
if other_elem:
other_info['碟片数'] = other_elem[0].strip()
img_url_elem = content.xpath("//div[@id='mainpic']//img/@src")
img_url = img_url_elem[0].strip() if img_url_elem else None
raw_img, ext = self.download_image(img_url)
data = {
'title': title,
'artist': artist,
'genre': genre,
'release_date': release_date,
'duration': None,
'company': company,
'track_list': track_list,
'brief': brief,
'other_info': other_info,
'source_site': self.site_name,
'source_url': self.get_effective_url(url),
}
self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
return data, raw_img
spotify_token = None
spotify_token_expire_time = time.time()
class SpotifyTrackScraper(AbstractScraper):
site_name = SourceSiteEnum.SPOTIFY.value
host = 'https://open.spotify.com/track/'
data_class = Song
form_class = SongForm
regex = re.compile(r"(?<=https://open\.spotify\.com/track/)[a-zA-Z0-9]+")
def scrape(self, url):
"""
Request from API, not really scraping
"""
global spotify_token, spotify_token_expire_time
if spotify_token is None or is_spotify_token_expired():
invoke_spotify_token()
effective_url = self.get_effective_url(url)
if effective_url is None:
raise ValueError("not valid url")
api_url = self.get_api_url(effective_url)
headers = {
'Authorization': f"Bearer {spotify_token}"
}
r = requests.get(api_url, headers=headers)
res_data = r.json()
artist = []
for artist_dict in res_data['artists']:
artist.append(artist_dict['name'])
if not artist:
artist = None
title = res_data['name']
release_date = parse_date(res_data['album']['release_date'])
duration = res_data['duration_ms']
if res_data['external_ids'].get('isrc'):
isrc = res_data['external_ids']['isrc']
else:
isrc = None
raw_img, ext = self.download_image(res_data['album']['images'][0]['url'])
data = {
'title': title,
'artist': artist,
'genre': None,
'release_date': release_date,
'duration': duration,
'isrc': isrc,
'album': None,
'brief': None,
'other_info': None,
'source_site': self.site_name,
'source_url': effective_url,
}
self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
return data, raw_img
@classmethod
def get_effective_url(cls, raw_url):
code = cls.regex.findall(raw_url)
if code:
return f"https://open.spotify.com/track/{code[0]}"
else:
return None
@classmethod
def get_api_url(cls, url):
return "https://api.spotify.com/v1/tracks/" + cls.regex.findall(url)[0]
class SpotifyAlbumScraper(AbstractScraper):
site_name = SourceSiteEnum.SPOTIFY.value
    # note: for Spotify, `host` is a full URL prefix rather than a bare hostname
host = 'https://open.spotify.com/album/'
data_class = Album
form_class = AlbumForm
regex = re.compile(r"(?<=https://open\.spotify\.com/album/)[a-zA-Z0-9]+")
def scrape(self, url):
"""
Request from API, not really scraping
"""
global spotify_token, spotify_token_expire_time
if spotify_token is None or is_spotify_token_expired():
invoke_spotify_token()
effective_url = self.get_effective_url(url)
if effective_url is None:
raise ValueError("not valid url")
api_url = self.get_api_url(effective_url)
headers = {
'Authorization': f"Bearer {spotify_token}"
}
r = requests.get(api_url, headers=headers)
res_data = r.json()
artist = []
for artist_dict in res_data['artists']:
artist.append(artist_dict['name'])
title = res_data['name']
genre = ', '.join(res_data['genres'])
company = []
for com in res_data['copyrights']:
company.append(com['text'])
duration = 0
track_list = []
track_urls = []
        # a disc number > 1 on the last track means a multi-disc album
        multi_disc = res_data['tracks']['items'][-1]['disc_number'] > 1
        for track in res_data['tracks']['items']:
            track_urls.append(track['external_urls']['spotify'])
            duration += track['duration_ms']
            if multi_disc:
                track_list.append(str(
                    track['disc_number']) + '-' + str(track['track_number']) + '. ' + track['name'])
            else:
                track_list.append(str(track['track_number']) + '. ' + track['name'])
track_list = '\n'.join(track_list)
release_date = parse_date(res_data['release_date'])
other_info = {}
if res_data['external_ids'].get('upc'):
# bar code
other_info['UPC'] = res_data['external_ids']['upc']
raw_img, ext = self.download_image(res_data['images'][0]['url'])
data = {
'title': title,
'artist': artist,
'genre': genre,
'track_list': track_list,
'release_date': release_date,
'duration': duration,
'company': company,
'brief': None,
'other_info': other_info,
'source_site': self.site_name,
'source_url': effective_url,
}
        # remember the track urls; add_tracks uses them after saving
self.track_urls = track_urls
self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
return data, raw_img
@classmethod
def get_effective_url(cls, raw_url):
code = cls.regex.findall(raw_url)
if code:
return f"https://open.spotify.com/album/{code[0]}"
else:
return None
@classmethod
def save(cls, request_user):
form = super().save(request_user)
task = Thread(
target=cls.add_tracks,
args=(form.instance, request_user),
daemon=True
)
task.start()
return form
@classmethod
def get_api_url(cls, url):
return "https://api.spotify.com/v1/albums/" + cls.regex.findall(url)[0]
@classmethod
def add_tracks(cls, album: Album, request_user):
to_be_updated_tracks = []
for track_url in cls.track_urls:
track = cls.get_track_or_none(track_url)
            # it seems that firing too many requests concurrently triggers
            # Spotify's rate limiting, hence the join() below
if track is None:
task = Thread(
target=cls.scrape_and_save_track,
args=(track_url, album, request_user),
daemon=True
)
task.start()
task.join()
else:
to_be_updated_tracks.append(track)
cls.bulk_update_track_album(to_be_updated_tracks, album, request_user)
@classmethod
def get_track_or_none(cls, track_url: str):
try:
instance = Song.objects.get(source_url=track_url)
return instance
except ObjectDoesNotExist:
return None
@classmethod
def scrape_and_save_track(cls, url: str, album: Album, request_user):
data, img = SpotifyTrackScraper.scrape(url)
SpotifyTrackScraper.raw_data['album'] = album
SpotifyTrackScraper.save(request_user)
@classmethod
def bulk_update_track_album(cls, tracks, album, request_user):
for track in tracks:
track.last_editor = request_user
track.edited_time = timezone.now()
track.album = album
Song.objects.bulk_update(tracks, [
'last_editor',
'edited_time',
'album'
])
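        # note: Django's bulk_update issues a single UPDATE per batch and does
        # not call each instance's save() or fire pre/post_save signals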
def is_spotify_token_expired():
    global spotify_token_expire_time
    return spotify_token_expire_time <= time.time()
def invoke_spotify_token():
global spotify_token, spotify_token_expire_time
r = requests.post(
"https://accounts.spotify.com/api/token",
data={
"grant_type": "client_credentials"
},
headers={
"Authorization": f"Basic {SPOTIFY_CREDENTIAL}"
}
)
data = r.json()
if r.status_code == 401:
        # token expired: try once more; this may be caused by external
        # operations, e.g. debugging with an http client
r = requests.post(
"https://accounts.spotify.com/api/token",
data={
"grant_type": "client_credentials"
},
headers={
"Authorization": f"Basic {SPOTIFY_CREDENTIAL}"
}
)
data = r.json()
elif r.status_code != 200:
raise Exception(f"Request to spotify API fails. Reason: {r.reason}")
    # subtract 2 seconds as a margin for execution time
spotify_token_expire_time = int(data['expires_in']) + time.time() - 2
spotify_token = data['access_token']
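# SPOTIFY_CREDENTIAL is expected to hold the Basic-auth payload of the
# client-credentials flow, i.e. base64 of 'client_id:client_secret' as
# documented by Spotify; a sketch of producing it (hypothetical names):
#
#   import base64
#   SPOTIFY_CREDENTIAL = base64.b64encode(
#       f'{client_id}:{client_secret}'.encode()).decode()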
class ImdbMovieScraper(AbstractScraper):
site_name = SourceSiteEnum.IMDB.value
host = 'https://www.imdb.com/title/'
data_class = Movie
form_class = MovieForm
regex = re.compile(r"(?<=https://www\.imdb\.com/title/)[a-zA-Z0-9]+")
def scrape(self, url):
effective_url = self.get_effective_url(url)
if effective_url is None:
raise ValueError("not valid url")
api_url = self.get_api_url(effective_url)
r = requests.get(api_url)
res_data = r.json()
        if res_data['type'] not in ('Movie', 'TVSeries'):
            raise ValueError("not a movie/series item")
        is_series = res_data['type'] == 'TVSeries'
title = res_data['title']
orig_title = res_data['originalTitle']
imdb_code = self.regex.findall(effective_url)[0]
director = []
for direct_dict in res_data['directorList']:
director.append(direct_dict['name'])
playwright = []
for writer_dict in res_data['writerList']:
playwright.append(writer_dict['name'])
actor = []
for actor_dict in res_data['actorList']:
actor.append(actor_dict['name'])
genre = res_data['genres'].split(', ')
area = res_data['countries'].split(', ')
language = res_data['languages'].split(', ')
year = int(res_data['year'])
duration = res_data['runtimeStr']
brief = res_data['plotLocal'] if res_data['plotLocal'] else res_data['plot']
if res_data['releaseDate']:
showtime = [{res_data['releaseDate']: "发布日期"}]
else:
showtime = None
other_info = {}
if res_data['contentRating']:
other_info['分级'] = res_data['contentRating']
if res_data['imDbRating']:
other_info['IMDb评分'] = res_data['imDbRating']
if res_data['metacriticRating']:
other_info['Metacritic评分'] = res_data['metacriticRating']
if res_data['awards']:
other_info['奖项'] = res_data['awards']
raw_img, ext = self.download_image(res_data['image'])
data = {
'title': title,
'orig_title': orig_title,
'other_title': None,
'imdb_code': imdb_code,
'director': director,
'playwright': playwright,
'actor': actor,
'genre': genre,
'showtime': showtime,
'site': None,
'area': area,
'language': language,
'year': year,
'duration': duration,
'season': None,
'episodes': None,
'single_episode_length': None,
'brief': brief,
'is_series': is_series,
'other_info': other_info,
'source_site': self.site_name,
'source_url': effective_url,
}
self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
return data, raw_img
@classmethod
def get_effective_url(cls, raw_url):
code = cls.regex.findall(raw_url)
if code:
return f"https://www.imdb.com/title/{code[0]}/"
else:
return None
@classmethod
def get_api_url(cls, url):
return f"https://imdb-api.com/zh/API/Title/{IMDB_API_KEY}/{cls.regex.findall(url)[0]}/FullActor,"
class DoubanGameScraper(DoubanScrapperMixin, AbstractScraper):
site_name = SourceSiteEnum.DOUBAN.value
host = 'www.douban.com/game/'
data_class = Game
form_class = GameForm
regex = re.compile(r"https://www\.douban\.com/game/\d+/{0,1}")
def scrape(self, url):
headers = DEFAULT_REQUEST_HEADERS.copy()
headers['Host'] = 'www.douban.com'
content = self.download_page(url, headers)
try:
raw_title = content.xpath(
"//div[@id='content']/h1/text()")[0].strip()
except IndexError:
raise ValueError("given url contains no game info")
title = raw_title
other_title_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='别名:']/following-sibling::dd[1]/text()")
other_title = other_title_elem[0].strip().split(' / ') if other_title_elem else None
developer_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='开发商:']/following-sibling::dd[1]/text()")
developer = developer_elem[0].strip().split(' / ') if developer_elem else None
publisher_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='发行商:']/following-sibling::dd[1]/text()")
publisher = publisher_elem[0].strip().split(' / ') if publisher_elem else None
platform_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='平台:']/following-sibling::dd[1]/a/text()")
platform = platform_elem if platform_elem else None
genre_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='类型:']/following-sibling::dd[1]/a/text()")
genre = None
if genre_elem:
genre = [g for g in genre_elem if g != '游戏']
date_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='发行日期:']/following-sibling::dd[1]/text()")
release_date = parse_date(date_elem[0].strip()) if date_elem else None
brief_elem = content.xpath("//div[@class='mod item-desc']/p/text()")
brief = '\n'.join(brief_elem) if brief_elem else None
img_url_elem = content.xpath(
"//div[@class='item-subject-info']/div[@class='pic']//img/@src")
img_url = img_url_elem[0].strip() if img_url_elem else None
raw_img, ext = self.download_image(img_url)
data = {
'title': title,
'other_title': other_title,
'developer': developer,
'publisher': publisher,
'release_date': release_date,
'genre': genre,
'platform': platform,
'brief': brief,
'other_info': None,
'source_site': self.site_name,
'source_url': self.get_effective_url(url),
}
self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
return data, raw_img
class SteamGameScraper(AbstractScraper):
site_name = SourceSiteEnum.STEAM.value
host = 'store.steampowered.com'
data_class = Game
form_class = GameForm
regex = re.compile(r"https://store\.steampowered\.com/app/\d+/{0,1}")
def scrape(self, url):
headers = DEFAULT_REQUEST_HEADERS.copy()
headers['Host'] = self.host
headers['Cookie'] = "wants_mature_content=1; birthtime=754700401;"
content = self.download_page(url, headers)
title = content.xpath("//div[@class='apphub_AppName']/text()")[0]
developer = content.xpath("//div[@id='developers_list']/a/text()")
publisher = content.xpath("//div[@class='glance_ctn']//div[@class='dev_row'][2]//a/text()")
release_date = parse_date(
content.xpath(
"//div[@class='release_date']/div[@class='date']/text()")[0]
)
genre = content.xpath(
"//div[@class='details_block']/b[2]/following-sibling::a/text()")
platform = ['PC']
brief = content.xpath(
"//div[@class='game_description_snippet']/text()")[0].strip()
img_url = content.xpath(
"//img[@class='game_header_image_full']/@src"
)[0].replace("header.jpg", "library_600x900.jpg")
raw_img, ext = self.download_image(img_url)
        # fall back to the header image when the 600x900 cover is unavailable
if raw_img is None:
img_url = content.xpath("//img[@class='game_header_image_full']/@src")[0]
raw_img, ext = self.download_image(img_url)
data = {
'title': title,
'other_title': None,
'developer': developer,
'publisher': publisher,
'release_date': release_date,
'genre': genre,
'platform': platform,
'brief': brief,
'other_info': None,
'source_site': self.site_name,
'source_url': self.get_effective_url(url),
}
self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
return data, raw_img
def find_entity(source_url):
"""
for bangumi
"""
# to be added when new scrape method is implemented
result = Game.objects.filter(source_url=source_url)
if result:
return result[0]
else:
raise ObjectDoesNotExist
class BangumiScraper(AbstractScraper):
site_name = SourceSiteEnum.BANGUMI.value
host = 'bgm.tv'
# for interface coherence
data_class = type("FakeDataClass", (object,), {})()
data_class.objects = type("FakeObjectsClass", (object,), {})()
data_class.objects.get = find_entity
    # set by the scrape_* methods
form_class = ''
regex = re.compile(r"https{0,1}://bgm\.tv/subject/\d+")
def scrape(self, url):
"""
This is the scraping portal
"""
headers = DEFAULT_REQUEST_HEADERS.copy()
headers['Host'] = self.host
content = self.download_page(url, headers)
# download image
img_url = 'http:' + content.xpath("//div[@class='infobox']//img[1]/@src")[0]
raw_img, ext = self.download_image(img_url)
        # detect the category from the selected search option
category_code = content.xpath("//div[@id='headerSearch']//option[@selected]/@value")[0]
handler_map = {
'1': self.scrape_book,
'2': self.scrape_movie,
'3': self.scrape_album,
'4': self.scrape_game
}
data = handler_map[category_code](self, content)
data['source_url'] = self.get_effective_url(url)
self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
return data, raw_img
def scrape_game(self, content):
self.data_class = Game
self.form_class = GameForm
        title_elem = content.xpath("//a[@property='v:itemreviewed']/text()")
        if not title_elem:
            raise ValueError("no game info found on this page")
        title = title_elem[0].strip()
other_title_elem = content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'别名')]]/text()")
if not other_title_elem:
other_title_elem = content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'别名')]]/a/text()")
other_title = other_title_elem if other_title_elem else []
chinese_name_elem = content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'中文')]]/text()")
if not chinese_name_elem:
chinese_name_elem = content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'中文')]]/a/text()")
if chinese_name_elem:
chinese_name = chinese_name_elem[0]
            # use the Chinese name as the title and keep the original
            title, chinese_name = chinese_name, title
            # after the swap, `chinese_name` holds the original name
            other_title.append(chinese_name)
developer_elem = content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'开发')]]/text()")
if not developer_elem:
developer_elem = content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'开发')]]/a/text()")
developer = developer_elem if developer_elem else None
publisher_elem = content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'发行:')]]/text()")
if not publisher_elem:
publisher_elem = content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'发行:')]]/a/text()")
publisher = publisher_elem if publisher_elem else None
platform_elem = content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'平台')]]/text()")
if not platform_elem:
platform_elem = content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'平台')]]/a/text()")
platform = platform_elem if platform_elem else None
genre_elem = content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'类型')]]/text()")
if not genre_elem:
genre_elem = content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'类型')]]/a/text()")
genre = genre_elem if genre_elem else None
date_elem = content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'发行日期')]]/text()")
if not date_elem:
date_elem = content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'发行日期')]]/a/text()")
release_date = parse_date(date_elem[0]) if date_elem else None
brief = ''.join(content.xpath("//div[@property='v:summary']/text()"))
other_info = {}
other_elem = content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'人数')]]/text()")
if other_elem:
other_info['游玩人数'] = other_elem[0]
other_elem = content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'引擎')]]/text()")
if other_elem:
other_info['引擎'] = ' '.join(other_elem)
other_elem = content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'售价')]]/text()")
if other_elem:
other_info['售价'] = ' '.join(other_elem)
other_elem = content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'官方网站')]]/text()")
if other_elem:
other_info['网站'] = other_elem[0]
other_elem = content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'剧本')]]/a/text()") or content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'剧本')]]/text()")
if other_elem:
other_info['剧本'] = ' '.join(other_elem)
other_elem = content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'编剧')]]/a/text()") or content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'编剧')]]/text()")
if other_elem:
other_info['编剧'] = ' '.join(other_elem)
other_elem = content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'音乐')]]/a/text()") or content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'音乐')]]/text()")
if other_elem:
other_info['音乐'] = ' '.join(other_elem)
other_elem = content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'美术')]]/a/text()") or content.xpath(
"//ul[@id='infobox']/li[child::span[contains(text(),'美术')]]/text()")
if other_elem:
other_info['美术'] = ' '.join(other_elem)
data = {
'title': title,
            'other_title': other_title,
'developer': developer,
'publisher': publisher,
'release_date': release_date,
'genre': genre,
'platform': platform,
'brief': brief,
'other_info': other_info,
'source_site': self.site_name,
}
return data
def scrape_movie(self, content):
self.data_class = Movie
self.form_class = MovieForm
raise NotImplementedError
def scrape_book(self, content):
self.data_class = Book
self.form_class = BookForm
raise NotImplementedError
def scrape_album(self, content):
self.data_class = Album
self.form_class = AlbumForm
raise NotImplementedError
# https://developers.google.com/youtube/v3/docs/?apix=true
# https://developers.google.com/books/docs/v1/using