diff --git a/books/management/commands/fix-book-cover.py b/books/management/commands/fix-book-cover.py
new file mode 100644
index 00000000..9d7bb12f
--- /dev/null
+++ b/books/management/commands/fix-book-cover.py
@@ -0,0 +1,186 @@
+from django.core.management.base import BaseCommand
+from django.core.files.uploadedfile import SimpleUploadedFile
+from common.scraper import *
+
+
+class DoubanPatcherMixin:
+    @classmethod
+    def download_page(cls, url, headers):
+        url = cls.get_effective_url(url)
+        r = None
+        error = 'DoubanScrapper: error occurred when downloading ' + url
+        content = None
+
+        def get(url, timeout):
+            nonlocal r
+            # print('Douban GET ' + url)
+            try:
+                r = requests.get(url, timeout=timeout)
+            except Exception as e:
+                r = requests.Response()
+                r.status_code = f"Exception when GET {url} {e}"
+            # print('Douban CODE ' + str(r.status_code))
+            return r
+
+        def check_content():
+            nonlocal r, error, content
+            content = None
+            if r.status_code == 200:
+                content = r.content.decode('utf-8')
+                if content.find('关于豆瓣') == -1:
+                    content = None
+                    error = error + 'Content not authentic'  # response is garbage
+                elif re.search('不存在[^<]+', content, re.MULTILINE):
+                    content = None
+                    error = error + 'Not found or hidden by Douban'
+            else:
+                error = error + str(r.status_code)
+
+        def fix_wayback_links():
+            nonlocal content
+            # strip the Wayback prefix from rewritten absolute links
+            content = re.sub(r'href="http[^"]+http', r'href="http', content)
+            # https://img9.doubanio.com/view/subject/{l|m|s}/public/s1234.jpg
+            content = re.sub(r'src="[^"]+/(s\d+\.\w+)"',
+                             r'src="https://img9.doubanio.com/view/subject/m/public/\1"', content)
+            # https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2681329386.jpg
+            # https://img9.doubanio.com/view/photo/{l|m|s}/public/p1234.webp
+            content = re.sub(r'src="[^"]+/(p\d+\.\w+)"',
+                             r'src="https://img9.doubanio.com/view/photo/m/public/\1"', content)
+
+        # Wayback Machine: get latest available
+        def wayback():
+            nonlocal r, error, content
+            error = error + '\nWayback: '
+            get('http://archive.org/wayback/available?url=' + url, 10)
+            if r.status_code == 200:
+                w = r.json()
+                if w['archived_snapshots'] and w['archived_snapshots']['closest']:
+                    get(w['archived_snapshots']['closest']['url'], 10)
+                    check_content()
+                    if content is not None:
+                        fix_wayback_links()
+                else:
+                    error = error + 'No snapshot available'
+            else:
+                error = error + str(r.status_code)
+
+        # Wayback Machine: guess via CDX API
+        def wayback_cdx():
+            nonlocal r, error, content
+            error = error + '\nWayback: '
+            get('http://web.archive.org/cdx/search/cdx?url=' + url, 10)
+            if r.status_code == 200:
+                dates = re.findall(r'[^\s]+\s+(\d+)\s+[^\s]+\s+[^\s]+\s+\d+\s+[^\s]+\s+\d{5,}',
+                                   r.content.decode('utf-8'))
+                # assume snapshots larger than 9999 bytes contain real content; use the latest of them
+                if len(dates) > 0:
+                    get('http://web.archive.org/web/' + dates[-1] + '/' + url, 10)
+                    check_content()
+                    if content is not None:
+                        fix_wayback_links()
+                else:
+                    error = error + 'No snapshot available'
+            else:
+                error = error + str(r.status_code)
+
+        def latest():
+            nonlocal r, error, content
+            if settings.SCRAPERAPI_KEY is None:
+                error = error + '\nDirect: '
+                get(url, 60)
+            else:
+                error = error + '\nScraperAPI: '
+                get(f'http://api.scraperapi.com?api_key={settings.SCRAPERAPI_KEY}&url={url}', 60)
+            check_content()
+
+        wayback_cdx()
+        if content is None:
+            latest()
+
+        if content is None:
+            logger.error(error)
+            content = ''
+        return html.fromstring(content)
+
+    @classmethod
+    def download_image(cls, url, item_url=None):
+        raw_img = None
+        ext = None
+
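+        # Fetch via ScraperAPI when a key is configured; direct requests to
+        # Douban's CDN from a server IP are assumed likely to be blocked.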
+        dl_url = url
+        if settings.SCRAPERAPI_KEY is not None:
+            dl_url = f'http://api.scraperapi.com?api_key={settings.SCRAPERAPI_KEY}&url={url}'
+
+        try:
+            img_response = requests.get(dl_url, timeout=90)
+            if img_response.status_code == 200:
+                raw_img = img_response.content
+                img = Image.open(BytesIO(raw_img))
+                img.load()  # corrupted image will trigger exception
+                content_type = img_response.headers.get('Content-Type')
+                ext = filetype.get_type(mime=content_type.partition(';')[0].strip()).extension
+            else:
+                logger.error(f"Douban: download image failed {img_response.status_code} {dl_url} {item_url}")
+                # raise RuntimeError(f"Douban: download image failed {img_response.status_code} {dl_url}")
+        except Exception as e:
+            raw_img = None
+            ext = None
+            logger.error(f"Douban: download image failed {e} {dl_url} {item_url}")
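+        # Retry the same proxied request once: ScraperAPI failures are
+        # assumed transient, so a second identical attempt may succeed.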
+        if raw_img is None and settings.SCRAPERAPI_KEY is not None:
+            try:
+                img_response = requests.get(dl_url, timeout=90)
+                if img_response.status_code == 200:
+                    raw_img = img_response.content
+                    img = Image.open(BytesIO(raw_img))
+                    img.load()  # corrupted image will trigger exception
+                    content_type = img_response.headers.get('Content-Type')
+                    ext = filetype.get_type(mime=content_type.partition(';')[0].strip()).extension
+                else:
+                    logger.error(f"Douban: download image failed {img_response.status_code} {dl_url} {item_url}")
+            except Exception as e:
+                raw_img = None
+                ext = None
+                logger.error(f"Douban: download image failed {e} {dl_url} {item_url}")
+        return raw_img, ext
+
+
+class DoubanBookPatcher(DoubanPatcherMixin, AbstractScraper):
+    site_name = SourceSiteEnum.DOUBAN.value
+    host = 'book.douban.com'
+    data_class = Book
+    form_class = BookForm
+
+    regex = re.compile(r"https://book\.douban\.com/subject/\d+/{0,1}")
+
+    def scrape(self, url):
+        headers = DEFAULT_REQUEST_HEADERS.copy()
+        headers['Host'] = self.host
+        content = self.download_page(url, headers)
+        img_url_elem = content.xpath("//*[@id='mainpic']/a/img/@src")
+        img_url = img_url_elem[0].strip() if img_url_elem else None
+        raw_img, ext = self.download_image(img_url, url)
+        return raw_img, ext
+
+
+class Command(BaseCommand):
+    help = 'fix cover image'
+
+    def add_arguments(self, parser):
+        parser.add_argument('threadId', type=int, help='shard id (0-7): only records with id % 8 == threadId are processed')
+
+    def handle(self, *args, **options):
+        t = int(options['threadId'])
+        for m in Book.objects.filter(cover='book/default.svg', source_site='douban'):
+            if m.id % 8 == t:
+                self.stdout.write(f'Re-fetching {m.source_url}')
+                try:
+                    raw_img, img_ext = DoubanBookPatcher.scrape(m.source_url)
+                    if img_ext is not None:
+                        m.cover = SimpleUploadedFile('temp.' + img_ext, raw_img)
+                        m.save()
+                        self.stdout.write(self.style.SUCCESS(f'Saved {m.source_url}'))
+                    else:
+                        self.stdout.write(self.style.ERROR(f'Skipped {m.source_url}'))
+                except Exception as e:
+                    print(e)
diff --git a/movies/management/commands/fix-cover.py b/movies/management/commands/fix-movie-poster.py
similarity index 100%
rename from movies/management/commands/fix-cover.py
rename to movies/management/commands/fix-movie-poster.py
diff --git a/music/management/commands/fix-album-cover.py b/music/management/commands/fix-album-cover.py
new file mode 100644
index 00000000..9e413e51
--- /dev/null
+++ b/music/management/commands/fix-album-cover.py
@@ -0,0 +1,186 @@
+from django.core.management.base import BaseCommand
+from django.core.files.uploadedfile import SimpleUploadedFile
+from common.scraper import *
+
+
+class DoubanPatcherMixin:
+    @classmethod
+    def download_page(cls, url, headers):
+        url = cls.get_effective_url(url)
+        r = None
+        error = 'DoubanScrapper: error occurred when downloading ' + url
+        content = None
+
+        def get(url, timeout):
+            nonlocal r
+            # print('Douban GET ' + url)
+            try:
+                r = requests.get(url, timeout=timeout)
+            except Exception as e:
+                r = requests.Response()
+                r.status_code = f"Exception when GET {url} {e}"
+            # print('Douban CODE ' + str(r.status_code))
+            return r
+
+        def check_content():
+            nonlocal r, error, content
+            content = None
+            if r.status_code == 200:
+                content = r.content.decode('utf-8')
+                if content.find('关于豆瓣') == -1:
+                    content = None
+                    error = error + 'Content not authentic'  # response is garbage
+                elif re.search('不存在[^<]+', content, re.MULTILINE):
+                    content = None
+                    error = error + 'Not found or hidden by Douban'
+            else:
+                error = error + str(r.status_code)
+
+        def fix_wayback_links():
+            nonlocal content
+            # strip the Wayback prefix from rewritten absolute links
+            content = re.sub(r'href="http[^"]+http', r'href="http', content)
+            # https://img9.doubanio.com/view/subject/{l|m|s}/public/s1234.jpg
+            content = re.sub(r'src="[^"]+/(s\d+\.\w+)"',
+                             r'src="https://img9.doubanio.com/view/subject/m/public/\1"', content)
+            # https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2681329386.jpg
+            # https://img9.doubanio.com/view/photo/{l|m|s}/public/p1234.webp
+            content = re.sub(r'src="[^"]+/(p\d+\.\w+)"',
+                             r'src="https://img9.doubanio.com/view/photo/m/public/\1"', content)
+
+        # Wayback Machine: get latest available
+        def wayback():
+            nonlocal r, error, content
+            error = error + '\nWayback: '
+            get('http://archive.org/wayback/available?url=' + url, 10)
+            if r.status_code == 200:
+                w = r.json()
+                if w['archived_snapshots'] and w['archived_snapshots']['closest']:
+                    get(w['archived_snapshots']['closest']['url'], 10)
+                    check_content()
+                    if content is not None:
+                        fix_wayback_links()
+                else:
+                    error = error + 'No snapshot available'
+            else:
+                error = error + str(r.status_code)
+
+        # Wayback Machine: guess via CDX API
+        def wayback_cdx():
+            nonlocal r, error, content
+            error = error + '\nWayback: '
+            get('http://web.archive.org/cdx/search/cdx?url=' + url, 10)
+            if r.status_code == 200:
+                dates = re.findall(r'[^\s]+\s+(\d+)\s+[^\s]+\s+[^\s]+\s+\d+\s+[^\s]+\s+\d{5,}',
+                                   r.content.decode('utf-8'))
+                # assume snapshots larger than 9999 bytes contain real content; use the latest of them
+                if len(dates) > 0:
+                    get('http://web.archive.org/web/' + dates[-1] + '/' + url, 10)
+                    check_content()
+                    if content is not None:
+                        fix_wayback_links()
+                else:
+                    error = error + 'No snapshot available'
+            else:
+                error = error + str(r.status_code)
+
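+        # Last resort: fetch the live page, via ScraperAPI when configured,
+        # which is assumed to help get past Douban's anti-scraping measures.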
+        def latest():
+            nonlocal r, error, content
+            if settings.SCRAPERAPI_KEY is None:
+                error = error + '\nDirect: '
+                get(url, 60)
+            else:
+                error = error + '\nScraperAPI: '
+                get(f'http://api.scraperapi.com?api_key={settings.SCRAPERAPI_KEY}&url={url}', 60)
+            check_content()
+
+        wayback_cdx()
+        if content is None:
+            latest()
+
+        if content is None:
+            logger.error(error)
+            content = ''
+        return html.fromstring(content)
+
+    @classmethod
+    def download_image(cls, url, item_url=None):
+        raw_img = None
+        ext = None
+
+        dl_url = url
+        if settings.SCRAPERAPI_KEY is not None:
+            dl_url = f'http://api.scraperapi.com?api_key={settings.SCRAPERAPI_KEY}&url={url}'
+
+        try:
+            img_response = requests.get(dl_url, timeout=90)
+            if img_response.status_code == 200:
+                raw_img = img_response.content
+                img = Image.open(BytesIO(raw_img))
+                img.load()  # corrupted image will trigger exception
+                content_type = img_response.headers.get('Content-Type')
+                ext = filetype.get_type(mime=content_type.partition(';')[0].strip()).extension
+            else:
+                logger.error(f"Douban: download image failed {img_response.status_code} {dl_url} {item_url}")
+                # raise RuntimeError(f"Douban: download image failed {img_response.status_code} {dl_url}")
+        except Exception as e:
+            raw_img = None
+            ext = None
+            logger.error(f"Douban: download image failed {e} {dl_url} {item_url}")
+        if raw_img is None and settings.SCRAPERAPI_KEY is not None:
+            try:
+                img_response = requests.get(dl_url, timeout=90)
+                if img_response.status_code == 200:
+                    raw_img = img_response.content
+                    img = Image.open(BytesIO(raw_img))
+                    img.load()  # corrupted image will trigger exception
+                    content_type = img_response.headers.get('Content-Type')
+                    ext = filetype.get_type(mime=content_type.partition(';')[0].strip()).extension
+                else:
+                    logger.error(f"Douban: download image failed {img_response.status_code} {dl_url} {item_url}")
+            except Exception as e:
+                raw_img = None
+                ext = None
+                logger.error(f"Douban: download image failed {e} {dl_url} {item_url}")
+        return raw_img, ext
+
+
+class DoubanAlbumPatcher(DoubanPatcherMixin, AbstractScraper):
+    site_name = SourceSiteEnum.DOUBAN.value
+    host = 'music.douban.com'
+    data_class = Album
+    form_class = AlbumForm
+
+    regex = re.compile(r"https://music\.douban\.com/subject/\d+/{0,1}")
+
+    def scrape(self, url):
+        headers = DEFAULT_REQUEST_HEADERS.copy()
+        headers['Host'] = self.host
+        content = self.download_page(url, headers)
+        img_url_elem = content.xpath("//div[@id='mainpic']//img/@src")
+        img_url = img_url_elem[0].strip() if img_url_elem else None
+        raw_img, ext = self.download_image(img_url, url)
+        return raw_img, ext
+
+
+class Command(BaseCommand):
+    help = 'fix cover image'
+
+    def add_arguments(self, parser):
+        parser.add_argument('threadId', type=int, help='shard id (0-7): only records with id % 8 == threadId are processed')
+
+    def handle(self, *args, **options):
+        t = int(options['threadId'])
+        for m in Album.objects.filter(cover='album/default.svg', source_site='douban'):
+            if m.id % 8 == t:
+                self.stdout.write(f'Re-fetching {m.source_url}')
+                try:
+                    raw_img, img_ext = DoubanAlbumPatcher.scrape(m.source_url)
+                    if img_ext is not None:
+                        m.cover = SimpleUploadedFile('temp.' + img_ext, raw_img)
+                        m.save()
+                        self.stdout.write(self.style.SUCCESS(f'Saved {m.source_url}'))
+                    else:
+                        self.stdout.write(self.style.ERROR(f'Skipped {m.source_url}'))
+                except Exception as e:
+                    print(e)