From a4f35ad3fb9348d7fd2364c02d7a06c76d17a473 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 11 Oct 2021 22:03:17 -0400 Subject: [PATCH] direct dl first before wayback; check corrupted image --- common/scraper.py | 71 +++++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/common/scraper.py b/common/scraper.py index b1c9289b..5bf62d8f 100644 --- a/common/scraper.py +++ b/common/scraper.py @@ -23,6 +23,8 @@ from music.forms import AlbumForm, SongForm from games.models import Game from games.forms import GameForm from django.conf import settings +from PIL import Image +from io import BytesIO RE_NUMBERS = re.compile(r"\d+\d*") @@ -169,7 +171,7 @@ class AbstractScraper: return html.fromstring(r.content.decode('utf-8')) @classmethod - def download_image(cls, url): + def download_image(cls, url, item_url=None): if url is None: return raw_img = None @@ -244,14 +246,14 @@ class DoubanScrapperMixin: content = None if r.status_code == 200: content = r.content.decode('utf-8') + if content.find('关于豆瓣') == -1: + content = None + error = error + 'Content not authentic' # response is garbage + elif re.search('不存在[^<]+', content, re.MULTILINE): + content = None + error = error + 'Not found or hidden by Douban' else: error = error + str(r.status_code) - if content.find('关于豆瓣') == -1: - content = None - error = error + 'Content not authentic' # response is garbage - elif re.search('不存在[^<]+', content, re.MULTILINE): - content = None - error = error + 'Not found or hidden by Douban' def fix_wayback_links(): nonlocal content @@ -311,9 +313,9 @@ class DoubanScrapperMixin: get(f'http://api.scraperapi.com?api_key={settings.SCRAPERAPI_KEY}&url={url}', 30) check_content() - wayback_cdx() + latest() if content is None: - latest() + wayback_cdx() if content is None: raise RuntimeError(error) @@ -322,7 +324,7 @@ class DoubanScrapperMixin: return html.fromstring(content) @classmethod - def download_image(cls, url): + def download_image(cls, url, item_url=None): raw_img = None ext = None @@ -330,16 +332,25 @@ class DoubanScrapperMixin: if settings.SCRAPERAPI_KEY is not None: dl_url = f'http://api.scraperapi.com?api_key={settings.SCRAPERAPI_KEY}&url={url}' - img_response = requests.get(dl_url, timeout=30) - if img_response.status_code == 200: - raw_img = img_response.content - content_type = img_response.headers.get('Content-Type') - ext = guess_extension(content_type.partition(';')[0].strip()) - else: - raise RuntimeError(f"Douban: download image failed {img_response.status_code} {dl_url}") + try: + img_response = requests.get(dl_url, timeout=30) + if img_response.status_code == 200: + raw_img = img_response.content + content_type = img_response.headers.get('Content-Type') + ext = guess_extension(content_type.partition(';')[0].strip()) + img = Image.open(BytesIO(raw_img)) + img.load() # corrupted image will trigger exception + else: + logger.error(f"Douban: download image failed {img_response.status_code} {dl_url} {item_url}") + # raise RuntimeError(f"Douban: download image failed {img_response.status_code} {dl_url}") + except Exception as e: + raw_img = None + ext = None + logger.error(f"Douban: download image failed {e} {dl_url} {item_url}") return raw_img, ext + class DoubanBookScraper(DoubanScrapperMixin, AbstractScraper): site_name = SourceSiteEnum.DOUBAN.value host = "book.douban.com" @@ -436,7 +447,7 @@ class DoubanBookScraper(DoubanScrapperMixin, AbstractScraper): img_url_elem = content.xpath("//*[@id='mainpic']/a/img/@src") img_url = img_url_elem[0].strip() if img_url_elem else None - raw_img, ext = self.download_image(img_url) + raw_img, ext = self.download_image(img_url, url) # there are two html formats for authors and translators authors_elem = content.xpath("""//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/ @@ -648,7 +659,7 @@ class DoubanMovieScraper(DoubanScrapperMixin, AbstractScraper): img_url_elem = content.xpath("//img[@rel='v:image']/@src") img_url = img_url_elem[0].strip() if img_url_elem else None - raw_img, ext = self.download_image(img_url) + raw_img, ext = self.download_image(img_url, url) data = { 'title': title, @@ -756,7 +767,7 @@ class DoubanAlbumScraper(DoubanScrapperMixin, AbstractScraper): img_url_elem = content.xpath("//div[@id='mainpic']//img/@src") img_url = img_url_elem[0].strip() if img_url_elem else None - raw_img, ext = self.download_image(img_url) + raw_img, ext = self.download_image(img_url, url) data = { 'title': title, @@ -822,7 +833,7 @@ class SpotifyTrackScraper(AbstractScraper): else: isrc = None - raw_img, ext = self.download_image(res_data['album']['images'][0]['url']) + raw_img, ext = self.download_image(res_data['album']['images'][0]['url'], url) data = { 'title': title, @@ -914,7 +925,7 @@ class SpotifyAlbumScraper(AbstractScraper): # bar code other_info['UPC'] = res_data['external_ids']['upc'] - raw_img, ext = self.download_image(res_data['images'][0]['url']) + raw_img, ext = self.download_image(res_data['images'][0]['url'], url) data = { 'title': title, @@ -1102,7 +1113,7 @@ class ImdbMovieScraper(AbstractScraper): if res_data['awards']: other_info['奖项'] = res_data['awards'] - raw_img, ext = self.download_image(res_data['image']) + raw_img, ext = self.download_image(res_data['image'], url) data = { 'title': title, @@ -1197,7 +1208,7 @@ class DoubanGameScraper(DoubanScrapperMixin, AbstractScraper): img_url_elem = content.xpath( "//div[@class='item-subject-info']/div[@class='pic']//img/@src") img_url = img_url_elem[0].strip() if img_url_elem else None - raw_img, ext = self.download_image(img_url) + raw_img, ext = self.download_image(img_url, url) data = { 'title': title, @@ -1250,12 +1261,12 @@ class SteamGameScraper(AbstractScraper): img_url = content.xpath( "//img[@class='game_header_image_full']/@src" )[0].replace("header.jpg", "library_600x900.jpg") - raw_img, ext = self.download_image(img_url) + raw_img, ext = self.download_image(img_url, url) # no 600x900 picture if raw_img is None: img_url = content.xpath("//img[@class='game_header_image_full']/@src")[0] - raw_img, ext = self.download_image(img_url) + raw_img, ext = self.download_image(img_url, url) data = { 'title': title, @@ -1310,7 +1321,7 @@ class BangumiScraper(AbstractScraper): # download image img_url = 'http:' + content.xpath("//div[@class='infobox']//img[1]/@src")[0] - raw_img, ext = self.download_image(img_url) + raw_img, ext = self.download_image(img_url, url) # Test category category_code = content.xpath("//div[@id='headerSearch']//option[@selected]/@value")[0] @@ -1535,13 +1546,13 @@ class GoodreadsScraper(AbstractScraper): genre = content.xpath('//div[@class="bigBoxBody"]/div/div/div/a/text()') genre = genre[0] if genre else None - book_title = re.sub('\n','',content.xpath('//h1[@id="bookTitle"]/text()')[0]).strip() + book_title = re.sub('\n', '', content.xpath('//h1[@id="bookTitle"]/text()')[0]).strip() author = content.xpath('//a[@class="authorName"]/span/text()')[0] contents = None img_url_elem = content.xpath("//img[@id='coverImage']/@src") img_url = img_url_elem[0].strip() if img_url_elem else None - raw_img, ext = self.download_image(img_url) + raw_img, ext = self.download_image(img_url, url) authors_elem = content.xpath("//a[@class='authorName'][not(../span[@class='authorName greyText smallText role'])]/span/text()") if authors_elem: @@ -1685,7 +1696,7 @@ class TmdbMovieScraper(AbstractScraper): other_info['Episodes'] = res_data['number_of_episodes'] img_url = 'https://image.tmdb.org/t/p/original/' + res_data['poster_path'] # TODO: use GET /configuration to get base url - raw_img, ext = self.download_image(img_url) + raw_img, ext = self.download_image(img_url, url) data = { 'title': title,