import requests
import re
import filetype
from lxml import html
from common.models import SourceSiteEnum
from movies.models import Movie, MovieGenreEnum
from movies.forms import MovieForm
from books.models import Book
from books.forms import BookForm
from music.models import Album
from music.forms import AlbumForm
from games.models import Game
from games.forms import GameForm
from django.core.validators import URLValidator
from django.core.exceptions import ValidationError
from django.conf import settings
from PIL import Image
from io import BytesIO
from common.scraper import *
class DoubanScrapperMixin:
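    """Shared download helpers for the Douban scrapers below.

    Pages are fetched directly or through a scraping proxy (ScrapeStack,
    ScraperAPI, ProxyCrawl or a local proxy, depending on settings), with the
    Wayback Machine CDX API as a last resort when Douban blocks or hides a page.
    """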
@classmethod
def download_page(cls, url, headers):
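        """Fetch `url` and return the parsed lxml HTML tree.

        Accumulates an error log across all attempts and raises RuntimeError
        with it if no source returns usable content.
        """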
url = cls.get_effective_url(url)
r = None
        error = 'DoubanScrapper: error occurred when downloading ' + url
content = None
last_error = None
def get(url):
nonlocal r
# print('Douban GET ' + url)
try:
r = requests.get(url, timeout=settings.SCRAPING_TIMEOUT)
except Exception as e:
r = requests.Response()
r.status_code = f"Exception when GET {url} {e}" + url
# print('Douban CODE ' + str(r.status_code))
return r
def check_content():
nonlocal r, error, content, last_error
content = None
last_error = None
if r.status_code == 200:
content = r.content.decode('utf-8')
if content.find('关于豆瓣') == -1:
if content.find('你的 IP 发出') == -1:
error = error + 'Content not authentic' # response is garbage
else:
error = error + 'IP banned'
content = None
last_error = 'network'
                elif content.find('页面不存在') != -1 or content.find('呃... 你想访问的条目豆瓣不收录。') != -1:  # re.search('不存在[^<]+', content, re.MULTILINE):
content = None
last_error = 'censorship'
error = error + 'Not found or hidden by Douban'
elif r.status_code == 204:
content = None
last_error = 'censorship'
error = error + 'Not found or hidden by Douban'
else:
content = None
last_error = 'network'
error = error + str(r.status_code)
def fix_wayback_links():
nonlocal content
# fix links
content = re.sub(r'href="http[^"]+http', r'href="http', content)
# https://img9.doubanio.com/view/subject/{l|m|s}/public/s1234.jpg
content = re.sub(r'src="[^"]+/(s\d+\.\w+)"',
r'src="https://img9.doubanio.com/view/subject/m/public/\1"', content)
# https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2681329386.jpg
# https://img9.doubanio.com/view/photo/{l|m|s}/public/p1234.webp
content = re.sub(r'src="[^"]+/(p\d+\.\w+)"',
r'src="https://img9.doubanio.com/view/photo/m/public/\1"', content)
# Wayback Machine: get latest available
def wayback():
nonlocal r, error, content
error = error + '\nWayback: '
get('http://archive.org/wayback/available?url=' + url)
if r.status_code == 200:
w = r.json()
                if w['archived_snapshots'] and w['archived_snapshots'].get('closest'):
get(w['archived_snapshots']['closest']['url'])
check_content()
if content is not None:
fix_wayback_links()
else:
error = error + 'No snapshot available'
else:
error = error + str(r.status_code)
# Wayback Machine: guess via CDX API
def wayback_cdx():
nonlocal r, error, content
error = error + '\nWayback: '
get('http://web.archive.org/cdx/search/cdx?url=' + url)
if r.status_code == 200:
dates = re.findall(r'[^\s]+\s+(\d+)\s+[^\s]+\s+[^\s]+\s+\d+\s+[^\s]+\s+\d{5,}',
r.content.decode('utf-8'))
# assume snapshots whose size >9999 contain real content, use the latest one of them
if len(dates) > 0:
get('http://web.archive.org/web/' + dates[-1] + '/' + url)
check_content()
if content is not None:
fix_wayback_links()
else:
error = error + 'No snapshot available'
else:
error = error + str(r.status_code)
def latest():
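            # try the live site first (through ScrapeStack or ScraperAPI when
            # configured), then retry via ProxyCrawl on network errors and via
            # the local proxy when the page looks censored/hidden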
nonlocal r, error, content
if settings.SCRAPESTACK_KEY is not None:
error = error + '\nScrapeStack: '
get(f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={url}')
elif settings.SCRAPERAPI_KEY is not None:
error = error + '\nScraperAPI: '
get(f'http://api.scraperapi.com?api_key={settings.SCRAPERAPI_KEY}&url={url}')
else:
error = error + '\nDirect: '
get(url)
check_content()
if last_error == 'network' and settings.PROXYCRAWL_KEY is not None:
error = error + '\nProxyCrawl: '
get(f'https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={url}')
check_content()
if last_error == 'censorship' and settings.LOCAL_PROXY is not None:
error = error + '\nLocal: '
get(f'{settings.LOCAL_PROXY}?url={url}')
check_content()
latest()
if content is None:
wayback_cdx()
if content is None:
raise RuntimeError(error)
# with open('/tmp/temp.html', 'w', encoding='utf-8') as fp:
# fp.write(content)
return html.fromstring(content)
@classmethod
def download_image(cls, url, item_url=None):
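        """Download a cover image; return (raw_bytes, extension) or (None, None).

        Uses ScrapeStack/ScraperAPI when configured and falls back to ProxyCrawl,
        verifying that the downloaded bytes decode as a valid image.
        """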
raw_img = None
ext = None
if settings.SCRAPESTACK_KEY is not None:
dl_url = f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={url}'
elif settings.SCRAPERAPI_KEY is not None:
dl_url = f'http://api.scraperapi.com?api_key={settings.SCRAPERAPI_KEY}&url={url}'
else:
dl_url = url
try:
img_response = requests.get(dl_url, timeout=settings.SCRAPING_TIMEOUT)
if img_response.status_code == 200:
raw_img = img_response.content
img = Image.open(BytesIO(raw_img))
img.load() # corrupted image will trigger exception
content_type = img_response.headers.get('Content-Type')
ext = filetype.get_type(mime=content_type.partition(';')[0].strip()).extension
else:
logger.error(f"Douban: download image failed {img_response.status_code} {dl_url} {item_url}")
# raise RuntimeError(f"Douban: download image failed {img_response.status_code} {dl_url}")
except Exception as e:
raw_img = None
ext = None
logger.error(f"Douban: download image failed {e} {dl_url} {item_url}")
if raw_img is None and settings.PROXYCRAWL_KEY is not None:
try:
dl_url = f'https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={url}'
img_response = requests.get(dl_url, timeout=settings.SCRAPING_TIMEOUT)
if img_response.status_code == 200:
raw_img = img_response.content
img = Image.open(BytesIO(raw_img))
img.load() # corrupted image will trigger exception
content_type = img_response.headers.get('Content-Type')
ext = filetype.get_type(mime=content_type.partition(';')[0].strip()).extension
else:
logger.error(f"Douban: download image failed {img_response.status_code} {dl_url} {item_url}")
except Exception as e:
raw_img = None
ext = None
logger.error(f"Douban: download image failed {e} {dl_url} {item_url}")
return raw_img, ext
class DoubanBookScraper(DoubanScrapperMixin, AbstractScraper):
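    """Scraper for book.douban.com subject pages."""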
site_name = SourceSiteEnum.DOUBAN.value
host = "book.douban.com"
data_class = Book
form_class = BookForm
regex = re.compile(r"https://book\.douban\.com/subject/\d+/{0,1}")
def scrape(self, url):
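        """Parse a book subject page into a dict of book fields.

        Returns (data, raw_cover_bytes) and also stores them on the instance
        as raw_data / raw_img / img_ext.
        """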
headers = DEFAULT_REQUEST_HEADERS.copy()
headers['Host'] = self.host
content = self.download_page(url, headers)
isbn_elem = content.xpath("//div[@id='info']//span[text()='ISBN:']/following::text()")
isbn = isbn_elem[0].strip() if isbn_elem else None
title_elem = content.xpath("/html/body//h1/span/text()")
title = title_elem[0].strip() if title_elem else None
if not title:
if isbn:
title = 'isbn: ' + isbn
else:
raise ValueError("given url contains no book title or isbn")
subtitle_elem = content.xpath(
"//div[@id='info']//span[text()='副标题:']/following::text()")
subtitle = subtitle_elem[0].strip()[:500] if subtitle_elem else None
orig_title_elem = content.xpath(
"//div[@id='info']//span[text()='原作名:']/following::text()")
orig_title = orig_title_elem[0].strip()[:500] if orig_title_elem else None
language_elem = content.xpath(
"//div[@id='info']//span[text()='语言:']/following::text()")
language = language_elem[0].strip() if language_elem else None
pub_house_elem = content.xpath(
"//div[@id='info']//span[text()='出版社:']/following::text()")
pub_house = pub_house_elem[0].strip() if pub_house_elem else None
pub_date_elem = content.xpath(
"//div[@id='info']//span[text()='出版年:']/following::text()")
pub_date = pub_date_elem[0].strip() if pub_date_elem else ''
year_month_day = RE_NUMBERS.findall(pub_date)
if len(year_month_day) in (2, 3):
pub_year = int(year_month_day[0])
pub_month = int(year_month_day[1])
elif len(year_month_day) == 1:
pub_year = int(year_month_day[0])
pub_month = None
else:
pub_year = None
pub_month = None
if pub_year and pub_month and pub_year < pub_month:
pub_year, pub_month = pub_month, pub_year
        if pub_year is not None and pub_year not in range(0, 3000):
            pub_year = None
        if pub_month is not None and pub_month not in range(1, 13):
            pub_month = None
binding_elem = content.xpath(
"//div[@id='info']//span[text()='装帧:']/following::text()")
binding = binding_elem[0].strip() if binding_elem else None
price_elem = content.xpath(
"//div[@id='info']//span[text()='定价:']/following::text()")
price = price_elem[0].strip() if price_elem else None
pages_elem = content.xpath(
"//div[@id='info']//span[text()='页数:']/following::text()")
pages = pages_elem[0].strip() if pages_elem else None
if pages is not None:
            page_numbers = RE_NUMBERS.findall(pages)
            pages = int(page_numbers[0]) if page_numbers else None
if pages and (pages > 999999 or pages < 1):
pages = None
brief_elem = content.xpath(
"//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()")
brief = '\n'.join(p.strip()
for p in brief_elem) if brief_elem else None
contents = None
try:
contents_elem = content.xpath(
"//h2/span[text()='目录']/../following-sibling::div[1]")[0]
            # if the id of the next sibling contains `dir`, that sibling holds the full table of contents
if "dir" in contents_elem.getnext().xpath("@id")[0]:
contents_elem = contents_elem.getnext()
contents = '\n'.join(p.strip() for p in contents_elem.xpath(
"text()")[:-2]) if contents_elem else None
else:
contents = '\n'.join(p.strip() for p in contents_elem.xpath(
"text()")) if contents_elem else None
except Exception:
pass
img_url_elem = content.xpath("//*[@id='mainpic']/a/img/@src")
img_url = img_url_elem[0].strip() if img_url_elem else None
raw_img, ext = self.download_image(img_url, url)
# there are two html formats for authors and translators
authors_elem = content.xpath("""//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/
preceding-sibling::a[preceding-sibling::span[text()='作者:']]/text()""")
if not authors_elem:
authors_elem = content.xpath(
"""//div[@id='info']//span[text()=' 作者']/following-sibling::a/text()""")
if authors_elem:
authors = []
for author in authors_elem:
authors.append(RE_WHITESPACES.sub(' ', author.strip())[:200])
else:
authors = None
translators_elem = content.xpath("""//div[@id='info']//span[text()='译者:']/following-sibling::br[1]/
preceding-sibling::a[preceding-sibling::span[text()='译者:']]/text()""")
if not translators_elem:
translators_elem = content.xpath(
"""//div[@id='info']//span[text()=' 译者']/following-sibling::a/text()""")
if translators_elem:
translators = []
for translator in translators_elem:
translators.append(RE_WHITESPACES.sub(' ', translator.strip()))
else:
translators = None
other = {}
cncode_elem = content.xpath(
"//div[@id='info']//span[text()='统一书号:']/following::text()")
if cncode_elem:
other['统一书号'] = cncode_elem[0].strip()
series_elem = content.xpath(
"//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()")
if series_elem:
other['丛书'] = series_elem[0].strip()
imprint_elem = content.xpath(
"//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()")
if imprint_elem:
other['出品方'] = imprint_elem[0].strip()
data = {
'title': title,
'subtitle': subtitle,
'orig_title': orig_title,
'author': authors,
'translator': translators,
'language': language,
'pub_house': pub_house,
'pub_year': pub_year,
'pub_month': pub_month,
'binding': binding,
'price': price,
'pages': pages,
'isbn': isbn,
'brief': brief,
'contents': contents,
'other_info': other,
'source_site': self.site_name,
'source_url': self.get_effective_url(url),
}
self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
return data, raw_img
class DoubanMovieScraper(DoubanScrapperMixin, AbstractScraper):
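    """Scraper for movie.douban.com subject pages (films and TV series)."""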
site_name = SourceSiteEnum.DOUBAN.value
host = 'movie.douban.com'
data_class = Movie
form_class = MovieForm
regex = re.compile(r"https://movie\.douban\.com/subject/\d+/{0,1}")
def scrape(self, url):
headers = DEFAULT_REQUEST_HEADERS.copy()
headers['Host'] = self.host
content = self.download_page(url, headers)
# parsing starts here
try:
raw_title = content.xpath(
"//span[@property='v:itemreviewed']/text()")[0].strip()
except IndexError:
raise ValueError("given url contains no movie info")
orig_title = content.xpath(
"//img[@rel='v:image']/@alt")[0].strip()
title = raw_title.split(orig_title)[0].strip()
        # if there is no Chinese title, fall back to the original title
if title == '':
title = orig_title
if title == orig_title:
orig_title = None
        # Douban uses alternative labels for some fields (e.g. 'IMDb链接:' vs 'IMDb:'); handle both below
other_title_elem = content.xpath(
"//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]")
other_title = other_title_elem[0].strip().split(
' / ') if other_title_elem else None
imdb_elem = content.xpath(
"//div[@id='info']//span[text()='IMDb链接:']/following-sibling::a[1]/text()")
if not imdb_elem:
imdb_elem = content.xpath(
"//div[@id='info']//span[text()='IMDb:']/following-sibling::text()[1]")
imdb_code = imdb_elem[0].strip() if imdb_elem else None
director_elem = content.xpath(
"//div[@id='info']//span[text()='导演']/following-sibling::span[1]/a/text()")
director = director_elem if director_elem else None
playwright_elem = content.xpath(
"//div[@id='info']//span[text()='编剧']/following-sibling::span[1]/a/text()")
playwright = list(map(lambda a: a[:200], playwright_elem)) if playwright_elem else None
actor_elem = content.xpath(
"//div[@id='info']//span[text()='主演']/following-sibling::span[1]/a/text()")
actor = list(map(lambda a: a[:200], actor_elem)) if actor_elem else None
# construct genre translator
genre_translator = {}
attrs = [attr for attr in dir(MovieGenreEnum) if '__' not in attr]
for attr in attrs:
genre_translator[getattr(MovieGenreEnum, attr).label] = getattr(
MovieGenreEnum, attr).value
genre_elem = content.xpath("//span[@property='v:genre']/text()")
if genre_elem:
genre = []
for g in genre_elem:
g = g.split(' ')[0]
if g == '紀錄片': # likely some original data on douban was corrupted
g = '纪录片'
elif g == '鬼怪':
g = '惊悚'
if g in genre_translator:
genre.append(genre_translator[g])
elif g in genre_translator.values():
genre.append(g)
else:
logger.error(f'unable to map genre {g}')
else:
genre = None
showtime_elem = content.xpath(
"//span[@property='v:initialReleaseDate']/text()")
if showtime_elem:
showtime = []
for st in showtime_elem:
                parts = st.split('(')
                time = parts[0]
                region = parts[1][0:-1] if len(parts) > 1 else ''
                showtime.append({time: region})
else:
showtime = None
site_elem = content.xpath(
"//div[@id='info']//span[text()='官方网站:']/following-sibling::a[1]/@href")
site = site_elem[0].strip()[:200] if site_elem else None
try:
validator = URLValidator()
validator(site)
except ValidationError:
site = None
area_elem = content.xpath(
"//div[@id='info']//span[text()='制片国家/地区:']/following-sibling::text()[1]")
if area_elem:
area = [a.strip()[:100] for a in area_elem[0].split('/')]
else:
area = None
language_elem = content.xpath(
"//div[@id='info']//span[text()='语言:']/following-sibling::text()[1]")
if language_elem:
language = [a.strip() for a in language_elem[0].split(' / ')]
else:
language = None
year_elem = content.xpath("//span[@class='year']/text()")
year = int(re.search(r'\d+', year_elem[0])[0]) if year_elem and re.search(r'\d+', year_elem[0]) else None
duration_elem = content.xpath("//span[@property='v:runtime']/text()")
other_duration_elem = content.xpath(
"//span[@property='v:runtime']/following-sibling::text()[1]")
if duration_elem:
duration = duration_elem[0].strip()
if other_duration_elem:
duration += other_duration_elem[0].rstrip()
duration = duration.split('/')[0].strip()
else:
duration = None
season_elem = content.xpath(
"//*[@id='season']/option[@selected='selected']/text()")
if not season_elem:
season_elem = content.xpath(
"//div[@id='info']//span[text()='季数:']/following-sibling::text()[1]")
season = int(season_elem[0].strip()) if season_elem else None
else:
season = int(season_elem[0].strip())
episodes_elem = content.xpath(
"//div[@id='info']//span[text()='集数:']/following-sibling::text()[1]")
        episodes = int(episodes_elem[0].strip()) if episodes_elem and episodes_elem[0].strip().isdigit() else None
single_episode_length_elem = content.xpath(
"//div[@id='info']//span[text()='单集片长:']/following-sibling::text()[1]")
        single_episode_length = single_episode_length_elem[0].strip()[:100] if single_episode_length_elem else None
        # a non-empty `episodes` field means this is a series
        is_series = bool(episodes)
brief_elem = content.xpath("//span[@class='all hidden']")
if not brief_elem:
brief_elem = content.xpath("//span[@property='v:summary']")
brief = '\n'.join([e.strip() for e in brief_elem[0].xpath(
'./text()')]) if brief_elem else None
img_url_elem = content.xpath("//img[@rel='v:image']/@src")
img_url = img_url_elem[0].strip() if img_url_elem else None
raw_img, ext = self.download_image(img_url, url)
data = {
'title': title,
'orig_title': orig_title,
'other_title': other_title,
'imdb_code': imdb_code,
'director': director,
'playwright': playwright,
'actor': actor,
'genre': genre,
'showtime': showtime,
'site': site,
'area': area,
'language': language,
'year': year,
'duration': duration,
'season': season,
'episodes': episodes,
'single_episode_length': single_episode_length,
'brief': brief,
'is_series': is_series,
'source_site': self.site_name,
'source_url': self.get_effective_url(url),
}
self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
return data, raw_img
class DoubanAlbumScraper(DoubanScrapperMixin, AbstractScraper):
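    """Scraper for music.douban.com album subject pages."""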
site_name = SourceSiteEnum.DOUBAN.value
host = 'music.douban.com'
data_class = Album
form_class = AlbumForm
regex = re.compile(r"https://music\.douban\.com/subject/\d+/{0,1}")
def scrape(self, url):
headers = DEFAULT_REQUEST_HEADERS.copy()
headers['Host'] = self.host
content = self.download_page(url, headers)
# parsing starts here
try:
title = content.xpath("//h1/span/text()")[0].strip()
except IndexError:
raise ValueError("given url contains no album info")
if not title:
raise ValueError("given url contains no album info")
artists_elem = content.xpath("//div[@id='info']/span/span[@class='pl']/a/text()")
artist = None if not artists_elem else list(map(lambda a: a[:200], artists_elem))
genre_elem = content.xpath(
"//div[@id='info']//span[text()='流派:']/following::text()[1]")
genre = genre_elem[0].strip() if genre_elem else None
date_elem = content.xpath(
"//div[@id='info']//span[text()='发行时间:']/following::text()[1]")
release_date = parse_date(date_elem[0].strip()) if date_elem else None
company_elem = content.xpath(
"//div[@id='info']//span[text()='出版者:']/following::text()[1]")
company = company_elem[0].strip() if company_elem else None
track_list_elem = content.xpath(
"//div[@class='track-list']/div[@class='indent']/div/text()"
)
if track_list_elem:
track_list = '\n'.join([track.strip() for track in track_list_elem])
else:
track_list = None
brief_elem = content.xpath("//span[@class='all hidden']")
if not brief_elem:
brief_elem = content.xpath("//span[@property='v:summary']")
brief = '\n'.join([e.strip() for e in brief_elem[0].xpath(
'./text()')]) if brief_elem else None
other_info = {}
other_elem = content.xpath(
"//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]")
if other_elem:
other_info['又名'] = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='专辑类型:']/following-sibling::text()[1]")
if other_elem:
other_info['专辑类型'] = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='介质:']/following-sibling::text()[1]")
if other_elem:
other_info['介质'] = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='ISRC:']/following-sibling::text()[1]")
if other_elem:
other_info['ISRC'] = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='条形码:']/following-sibling::text()[1]")
if other_elem:
other_info['条形码'] = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='碟片数:']/following-sibling::text()[1]")
if other_elem:
other_info['碟片数'] = other_elem[0].strip()
img_url_elem = content.xpath("//div[@id='mainpic']//img/@src")
img_url = img_url_elem[0].strip() if img_url_elem else None
raw_img, ext = self.download_image(img_url, url)
data = {
'title': title,
'artist': artist,
'genre': genre,
'release_date': release_date,
'duration': None,
'company': company,
'track_list': track_list,
'brief': brief,
'other_info': other_info,
'source_site': self.site_name,
'source_url': self.get_effective_url(url),
}
self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
return data, raw_img
class DoubanGameScraper(DoubanScrapperMixin, AbstractScraper):
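    """Scraper for www.douban.com game pages."""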
site_name = SourceSiteEnum.DOUBAN.value
host = 'www.douban.com/game/'
data_class = Game
form_class = GameForm
regex = re.compile(r"https://www\.douban\.com/game/\d+/{0,1}")
def scrape(self, url):
headers = DEFAULT_REQUEST_HEADERS.copy()
headers['Host'] = 'www.douban.com'
content = self.download_page(url, headers)
try:
raw_title = content.xpath(
"//div[@id='content']/h1/text()")[0].strip()
except IndexError:
raise ValueError("given url contains no game info")
title = raw_title
other_title_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='别名:']/following-sibling::dd[1]/text()")
other_title = other_title_elem[0].strip().split(' / ') if other_title_elem else None
developer_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='开发商:']/following-sibling::dd[1]/text()")
developer = developer_elem[0].strip().split(' / ') if developer_elem else None
publisher_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='发行商:']/following-sibling::dd[1]/text()")
publisher = publisher_elem[0].strip().split(' / ') if publisher_elem else None
platform_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='平台:']/following-sibling::dd[1]/a/text()")
platform = platform_elem if platform_elem else None
genre_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='类型:']/following-sibling::dd[1]/a/text()")
genre = None
if genre_elem:
genre = [g for g in genre_elem if g != '游戏']
date_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='发行日期:']/following-sibling::dd[1]/text()")
release_date = parse_date(date_elem[0].strip()) if date_elem else None
brief_elem = content.xpath("//div[@class='mod item-desc']/p/text()")
brief = '\n'.join(brief_elem) if brief_elem else None
img_url_elem = content.xpath(
"//div[@class='item-subject-info']/div[@class='pic']//img/@src")
img_url = img_url_elem[0].strip() if img_url_elem else None
raw_img, ext = self.download_image(img_url, url)
data = {
'title': title,
'other_title': other_title,
'developer': developer,
'publisher': publisher,
'release_date': release_date,
'genre': genre,
'platform': platform,
'brief': brief,
'other_info': None,
'source_site': self.site_name,
'source_url': self.get_effective_url(url),
}
self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
return data, raw_img