From e389fc302d6e3646b97c586b6ed64a7f1e3c5e3b Mon Sep 17 00:00:00 2001
From: Your Name
Date: Thu, 8 Dec 2022 16:59:03 +0000
Subject: [PATCH] rename a few methods

Rename logger to _logger and the camelCase mock-mode helpers to
set_mock_mode / get_mock_mode, move MockResponse from utils into
downloaders, fold ScraperMixin into douban_book, and add an .html()
helper so downloaders can return parsed lxml trees directly.
---
 catalog/book/models.py         |  2 +-
 catalog/common/__init__.py     |  2 +-
 catalog/common/downloaders.py  | 71 +++++++++++++++++++++++++++-------
 catalog/common/scrapers.py     | 19 ---------
 catalog/common/sites.py        |  6 +--
 catalog/common/utils.py        | 39 +------------------
 catalog/sites/apple_podcast.py |  6 +--
 catalog/sites/douban_book.py   | 28 +++++++++++---
 catalog/sites/douban_drama.py  |  9 ++---
 catalog/sites/douban_movie.py  | 11 +++---
 catalog/sites/goodreads.py     | 16 ++++----
 catalog/sites/imdb.py          |  2 +-
 catalog/sites/spotify.py       |  2 +-
 catalog/sites/tmdb.py          | 12 +++---
 14 files changed, 115 insertions(+), 110 deletions(-)

diff --git a/catalog/book/models.py b/catalog/book/models.py
index 6e62c47e..131c89de 100644
--- a/catalog/book/models.py
+++ b/catalog/book/models.py
@@ -54,7 +54,7 @@ class Edition(Item):
         if work and work not in self.works.all():
             self.works.add(work)
         # if not work:
-        #     logger.info(f'Unable to find link for {w["url"]}')
+        #     _logger.info(f'Unable to find link for {w["url"]}')
 
 
 class Work(Item):
diff --git a/catalog/common/__init__.py b/catalog/common/__init__.py
index 1ea3b6ae..1abc3ce7 100644
--- a/catalog/common/__init__.py
+++ b/catalog/common/__init__.py
@@ -5,4 +5,4 @@ from .scrapers import *
 from . import jsondata
 
 
-__all__ = ('IdType', 'Item', 'ExternalResource', 'ResourceContent', 'ParseError', 'ScraperMixin', 'AbstractSite', 'SiteList', 'jsondata', 'PrimaryLookupIdDescriptor', 'LookupIdDescriptor', 'setMockMode', 'getMockMode', 'use_local_response', 'RetryDownloader', 'BasicDownloader', 'ProxiedDownloader', 'BasicImageDownloader', 'RESPONSE_OK', 'RESPONSE_NETWORK_ERROR', 'RESPONSE_INVALID_CONTENT', 'RESPONSE_CENSORSHIP')
+__all__ = ('IdType', 'Item', 'ExternalResource', 'ResourceContent', 'ParseError', 'AbstractSite', 'SiteList', 'jsondata', 'PrimaryLookupIdDescriptor', 'LookupIdDescriptor', 'get_mock_mode', 'use_local_response', 'RetryDownloader', 'BasicDownloader', 'ProxiedDownloader', 'BasicImageDownloader', 'RESPONSE_OK', 'RESPONSE_NETWORK_ERROR', 'RESPONSE_INVALID_CONTENT', 'RESPONSE_CENSORSHIP')
diff --git a/catalog/common/downloaders.py b/catalog/common/downloaders.py
index 15186760..34bfc50f 100644
--- a/catalog/common/downloaders.py
+++ b/catalog/common/downloaders.py
@@ -4,13 +4,16 @@ from PIL import Image
 from io import BytesIO
 from requests.exceptions import RequestException
 from django.conf import settings
-from .utils import MockResponse
+from pathlib import Path
+import json
+from io import StringIO
 import re
 import time
 import logging
+from lxml import html
 
 
-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)
 
 
 RESPONSE_OK = 0  # response is ready for parsing
@@ -18,29 +21,29 @@
 RESPONSE_INVALID_CONTENT = -1  # content not valid but no need to retry
 RESPONSE_NETWORK_ERROR = -2  # network error, retry next proxied url
 RESPONSE_CENSORSHIP = -3  # censored, try sth special if possible
 
-MockMode = False
+_mock_mode = False
 
 
 def use_local_response(func):
     def _func(args):
-        setMockMode(True)
+        set_mock_mode(True)
         func(args)
-        setMockMode(False)
+        set_mock_mode(False)
     return _func
 
 
-def setMockMode(enabled):
-    global MockMode
-    MockMode = enabled
+def set_mock_mode(enabled):
+    global _mock_mode
+    _mock_mode = enabled
 
 
-def getMockMode():
-    global MockMode
-    return MockMode
+def get_mock_mode():
+    global _mock_mode
+    return _mock_mode
 
 
 class DownloadError(Exception):
-    def __init__(self, downloader):
+    def __init__(self, downloader, msg=None):
         self.url = downloader.url
         self.logs = downloader.logs
         if downloader.response_type == RESPONSE_INVALID_CONTENT:
@@ -51,7 +54,7 @@ class DownloadError(Exception):
             error = "Censored Content"
         else:
             error = "Unknown Error"
-        self.message = f"Download Failed: {error}, url: {self.url}"
+        self.message = f"Download Failed: {error}{', ' + msg if msg else ''}, url: {self.url}"
         super().__init__(self.message)
 
 
@@ -88,7 +91,7 @@ class BasicDownloader:
 
     def _download(self, url):
         try:
-            if not MockMode:
+            if not _mock_mode:
                 # TODO cache = get/set from redis
                 resp = requests.get(url, headers=self.headers, timeout=self.get_timeout())
                 if settings.DOWNLOADER_SAVEDIR:
@@ -159,7 +162,9 @@ class RetryDownloader(BasicDownloader):
             elif self.response_type != RESPONSE_NETWORK_ERROR and retries == 0:
                 raise DownloadError(self)
             elif retries > 0:
+                _logger.debug('Retry ' + self.url)
                 time.sleep((settings.DOWNLOADER_RETRIES - retries) * 0.5)
+        raise DownloadError(self, 'max out of retries')
 
 
 class ImageDownloaderMixin:
@@ -191,3 +196,41 @@ class BasicImageDownloader(ImageDownloaderMixin, BasicDownloader):
 
 class ProxiedImageDownloader(ImageDownloaderMixin, ProxiedDownloader):
     pass
+
+
+_local_response_path = str(Path(__file__).parent.parent.parent.absolute()) + '/test_data/'
+
+
+class MockResponse:
+    def get_mock_file(self, url):
+        fn = _local_response_path + re.sub(r'[^\w]', '_', url)
+        return re.sub(r'_key_[A-Za-z0-9]+', '_key_19890604', fn)
+
+    def __init__(self, url):
+        self.url = url
+        fn = self.get_mock_file(url)
+        try:
+            self.content = Path(fn).read_bytes()
+            self.status_code = 200
+            _logger.debug(f"use local response for {url} from {fn}")
+        except Exception:
+            self.content = b'Error: response file not found'
+            self.status_code = 404
+            _logger.debug(f"local response not found for {url} at {fn}")
+
+    @property
+    def text(self):
+        return self.content.decode('utf-8')
+
+    def json(self):
+        return json.load(StringIO(self.text))
+
+    def html(self):
+        return html.fromstring(self.text)  # may throw exception unexpectedly due to OS bug
+
+    @property
+    def headers(self):
+        return {'Content-Type': 'image/jpeg' if self.url.endswith('jpg') else 'text/html'}
+
+
+requests.Response.html = MockResponse.html
diff --git a/catalog/common/scrapers.py b/catalog/common/scrapers.py
index 695f7b49..b97f6d0d 100644
--- a/catalog/common/scrapers.py
+++ b/catalog/common/scrapers.py
@@ -2,22 +2,3 @@ class ParseError(Exception):
     def __init__(self, scraper, field):
         msg = f'{type(scraper).__name__}: Error parsing field "{field}" for url {scraper.url}'
         super().__init__(msg)
-
-
-class ScraperMixin:
-    def set_field(self, field, value=None):
-        self.data[field] = value
-
-    def parse_str(self, query):
-        elem = self.html.xpath(query)
-        return elem[0].strip() if elem else None
-
-    def parse_field(self, field, query, error_when_missing=False):
-        elem = self.html.xpath(query)
-        if elem:
-            self.data[field] = elem[0].strip()
-        elif error_when_missing:
-            raise ParseError(self, field)
-        else:
-            self.data[field] = None
-        return elem
diff --git a/catalog/common/sites.py b/catalog/common/sites.py
index e89893e9..7acbb5c1 100644
--- a/catalog/common/sites.py
+++ b/catalog/common/sites.py
@@ -5,7 +5,7 @@ from dataclasses import dataclass, field
 import logging
 
 
-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)
 
 
 @dataclass
@@ -100,7 +100,7 @@ class AbstractSite:
             resource_content = self.scrape()
             p.update_content(resource_content)
         if not p.ready:
-            logger.error(f'unable to get resource {self.url} ready')
+            _logger.error(f'unable to get resource {self.url} ready')
             return None
         if auto_create and p.item is None:
             self.get_item()
@@ -115,7 +115,7 @@ class AbstractSite:
                 if linked_site:
                     linked_site.get_resource_ready(auto_link=False)
                 else:
-                    logger.error(f'unable to get site for {linked_resources["url"]}')
+                    _logger.error(f'unable to get site for {linked_resources["url"]}')
             p.item.update_linked_items_from_external_resource(p)
             p.item.save()
         return p
diff --git a/catalog/common/utils.py b/catalog/common/utils.py
index 289ea855..39b115a9 100644
--- a/catalog/common/utils.py
+++ b/catalog/common/utils.py
@@ -1,14 +1,9 @@
-from pathlib import Path
-# import hashlib
-import json
-from io import StringIO
 import logging
-import re
 from django.utils import timezone
 import uuid
 
 
-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)
 
 
 DEFAULT_ITEM_COVER = 'item/default.svg'
@@ -17,35 +12,3 @@
 def item_cover_path(resource, filename):
     fn = timezone.now().strftime('%Y/%m/%d/') + str(uuid.uuid4()) + '.' + filename.split('.')[-1]
     return 'items/' + resource.id_type + '/' + fn
-
-
-TestDataDir = str(Path(__file__).parent.parent.parent.absolute()) + '/test_data/'
-
-
-class MockResponse:
-    def get_mock_file(self, url):
-        fn = TestDataDir + re.sub(r'[^\w]', '_', url)
-        return re.sub(r'_key_[A-Za-z0-9]+', '_key_19890604', fn)
-
-    def __init__(self, url):
-        self.url = url
-        fn = self.get_mock_file(url)
-        try:
-            self.content = Path(fn).read_bytes()
-            self.status_code = 200
-            logger.debug(f"use local response for {url} from {fn}")
-        except Exception:
-            self.content = b'Error: response file not found'
-            self.status_code = 404
-            logger.debug(f"local response not found for {url} at {fn}")
-
-    @property
-    def text(self):
-        return self.content.decode('utf-8')
-
-    def json(self):
-        return json.load(StringIO(self.text))
-
-    @property
-    def headers(self):
-        return {'Content-Type': 'image/jpeg' if self.url.endswith('jpg') else 'text/html'}
diff --git a/catalog/sites/apple_podcast.py b/catalog/sites/apple_podcast.py
index 8f06cd5f..ae2b3b54 100644
--- a/catalog/sites/apple_podcast.py
+++ b/catalog/sites/apple_podcast.py
@@ -1,9 +1,9 @@
 from catalog.common import *
-from catalog.podcast.models import *
+from catalog.models import *
 import logging
 
 
-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)
 
 
 @SiteList.register
@@ -36,5 +36,5 @@ class ApplePodcast(AbstractSite):
             pd.cover_image = imgdl.download().content
             pd.cover_image_extention = imgdl.extention
         except Exception:
-            logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
+            _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
         return pd
diff --git a/catalog/sites/douban_book.py b/catalog/sites/douban_book.py
index 57132f0f..021857c5 100644
--- a/catalog/sites/douban_book.py
+++ b/catalog/sites/douban_book.py
@@ -1,4 +1,3 @@
-from lxml import html
 from catalog.common import *
 from .douban import *
 from catalog.book.models import *
@@ -6,7 +5,26 @@ from catalog.book.utils import *
 import logging
 
 
-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)
+
+
+class ScraperMixin:
+    def set_field(self, field, value=None):
+        self.data[field] = value
+
+    def parse_str(self, query):
+        elem = self.html.xpath(query)
+        return elem[0].strip() if elem else None
+
+    def parse_field(self, field, query, error_when_missing=False):
+        elem = self.html.xpath(query)
+        if elem:
+            self.data[field] = elem[0].strip()
+        elif error_when_missing:
+            raise ParseError(self, field)
+        else:
+            self.data[field] = None
+        return elem
 
 
 @SiteList.register
@@ -22,7 +40,7 @@ class DoubanBook(AbstractSite, ScraperMixin):
 
     def scrape(self):
         self.data = {}
-        self.html = html.fromstring(DoubanDownloader(self.url).download().text.strip())
+        self.html = DoubanDownloader(self.url).download().html()
         self.parse_field('title', "/html/body//h1/span/text()")
         self.parse_field('isbn', "//div[@id='info']//span[text()='ISBN:']/following::text()")
         # TODO does douban store ASIN as ISBN, need more cleanup if so
@@ -127,7 +145,7 @@ class DoubanBook(AbstractSite, ScraperMixin):
             pd.cover_image = imgdl.download().content
             pd.cover_image_extention = imgdl.extention
         except Exception:
-            logger.debug(f'failed to download cover for {self.url} from {self.data["cover_image_url"]}')
+            _logger.debug(f'failed to download cover for {self.url} from {self.data["cover_image_url"]}')
         return pd
 
 
@@ -151,7 +169,7 @@ class DoubanBook_Work(AbstractSite):
         return pd
 
     def scrape(self):
-        content = html.fromstring(DoubanDownloader(self.url).download().text.strip())
+        content = DoubanDownloader(self.url).download().html()
         title_elem = content.xpath("//h1/text()")
         title = title_elem[0].split('全部版本(')[0].strip() if title_elem else None
         if not title:
diff --git a/catalog/sites/douban_drama.py b/catalog/sites/douban_drama.py
index 86157d05..4a0c27b7 100644
--- a/catalog/sites/douban_drama.py
+++ b/catalog/sites/douban_drama.py
@@ -1,11 +1,10 @@
-from lxml import html
 from catalog.common import *
-from ..performance.models import Performance
+from catalog.models import *
 from .douban import DoubanDownloader
 import logging
 
 
-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)
 
 
 @SiteList.register
@@ -20,7 +19,7 @@ class DoubanDrama(AbstractSite):
         return "https://www.douban.com/location/drama/" + id_value + "/"
 
     def scrape(self):
-        h = html.fromstring(DoubanDownloader(self.url).download().text)
+        h = DoubanDownloader(self.url).download().html()
         data = {}
 
         title_elem = h.xpath("/html/body//h1/span/text()")
@@ -55,5 +54,5 @@ class DoubanDrama(AbstractSite):
             pd.cover_image = imgdl.download().content
             pd.cover_image_extention = imgdl.extention
         except Exception:
-            logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
+            _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
         return pd
diff --git a/catalog/sites/douban_movie.py b/catalog/sites/douban_movie.py
index a6ca1869..d2a971c5 100644
--- a/catalog/sites/douban_movie.py
+++ b/catalog/sites/douban_movie.py
@@ -1,4 +1,3 @@
-from lxml import html
 from catalog.common import *
 from .douban import *
 from catalog.movie.models import *
@@ -9,7 +8,7 @@ from django.utils.translation import gettext_lazy as _
 from .tmdb import TMDB_TV, search_tmdb_by_imdb_id
 
 
-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)
 
 
 class MovieGenreEnum(models.TextChoices):
@@ -67,7 +66,7 @@ class DoubanMovie(AbstractSite):
         return "https://movie.douban.com/subject/" + id_value + "/"
 
     def scrape(self):
-        content = html.fromstring(DoubanDownloader(self.url).download().text.strip())
+        content = DoubanDownloader(self.url).download().html()
 
         try:
             raw_title = content.xpath(
@@ -131,7 +130,7 @@ class DoubanMovie(AbstractSite):
                 elif g in genre_translator.values():
                     genre.append(g)
                 else:
-                    logger.error(f'unable to map genre {g}')
+                    _logger.error(f'unable to map genre {g}')
         else:
             genre = None
 
@@ -253,7 +252,7 @@ class DoubanMovie(AbstractSite):
                 pd.metadata['preferred_model'] = 'TVSeason'
                 tmdb_show_id = res_data['tv_episode_results'][0]['show_id']
                 if res_data['tv_episode_results'][0]['episode_number'] != 1:
-                    logger.error(f'Douban Movie {self.url} mapping to unexpected imdb episode {imdb_code}')
+                    _logger.error(f'Douban Movie {self.url} mapping to unexpected imdb episode {imdb_code}')
                     # TODO correct the IMDB id
                 pd.lookup_ids[IdType.IMDB] = imdb_code
                 if tmdb_show_id:
@@ -272,5 +271,5 @@ class DoubanMovie(AbstractSite):
             pd.cover_image = imgdl.download().content
             pd.cover_image_extention = imgdl.extention
         except Exception:
-            logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
+            _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
         return pd
diff --git a/catalog/sites/goodreads.py b/catalog/sites/goodreads.py
index f6618d94..6584be92 100644
--- a/catalog/sites/goodreads.py
+++ b/catalog/sites/goodreads.py
@@ -16,7 +16,9 @@ class GoodreadsDownloader(RetryDownloader):
         elif response.status_code == 200:
             if response.text.find('__NEXT_DATA__') != -1:
                 return RESPONSE_OK
-            else:  # retry if return legacy version
+            else:
+                # Goodreads may serve the legacy page for a/b testing;
+                # retry if so
                 return RESPONSE_NETWORK_ERROR
         else:
             return RESPONSE_INVALID_CONTENT
@@ -36,11 +38,10 @@ class Goodreads(AbstractSite):
     def scrape(self, response=None):
         data = {}
         if response is not None:
-            content = response.text
+            h = html.fromstring(response.text.strip())
         else:
             dl = GoodreadsDownloader(self.url)
-            content = dl.download().text
-        h = html.fromstring(content.strip())
+            h = dl.download().html()
         # Next.JS version of GoodReads
         # JSON.parse(document.getElementById('__NEXT_DATA__').innerHTML)['props']['pageProps']['apolloState']
         elem = h.xpath('//script[@id="__NEXT_DATA__"]/text()')
@@ -55,7 +56,8 @@ class Goodreads(AbstractSite):
                     o[t].append(v)
         b = next(filter(lambda x: x.get('title'), o['Book']), None)
         if not b:
-            raise ParseError(self, 'Book json')
+            # Goodreads may return an empty page template when an internal service times out
+            raise ParseError(self, 'Book in __NEXT_DATA__ json')
         data['title'] = b['title']
         data['brief'] = b['description']
         data['isbn'] = b['details'].get('isbn13')
@@ -68,7 +70,7 @@ class Goodreads(AbstractSite):
         if w:
             data['required_resources'] = [{
                 'model': 'Work',
-                'id_type': IdType.Goodreads_Work, 
+                'id_type': IdType.Goodreads_Work,
                 'id_value': str(w['legacyId']),
                 'title': w['details']['originalTitle'],
                 'url': w['editions']['webUrl'],
@@ -98,7 +100,7 @@ class Goodreads_Work(AbstractSite):
         return "https://www.goodreads.com/work/editions/" + id_value
 
     def scrape(self, response=None):
-        content = html.fromstring(BasicDownloader(self.url).download().text.strip())
+        content = BasicDownloader(self.url).download().html()
         title_elem = content.xpath("//h1/a/text()")
         title = title_elem[0].strip() if title_elem else None
         if not title:
diff --git a/catalog/sites/imdb.py b/catalog/sites/imdb.py
index eb7b2b18..a6064afd 100644
--- a/catalog/sites/imdb.py
+++ b/catalog/sites/imdb.py
@@ -5,7 +5,7 @@ from catalog.tv.models import *
 import logging
 
 
-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)
 
 
 @SiteList.register
diff --git a/catalog/sites/spotify.py b/catalog/sites/spotify.py
index 18066854..75914281 100644
--- a/catalog/sites/spotify.py
+++ b/catalog/sites/spotify.py
@@ -99,7 +99,7 @@ class Spotify(AbstractSite):
 
 def get_spotify_token():
     global spotify_token, spotify_token_expire_time
-    if getMockMode():
+    if get_mock_mode():
         return 'mocked'
     if spotify_token is None or is_spotify_token_expired():
         invoke_spotify_token()
diff --git a/catalog/sites/tmdb.py b/catalog/sites/tmdb.py
index 63b839d1..f0c65c90 100644
--- a/catalog/sites/tmdb.py
+++ b/catalog/sites/tmdb.py
@@ -11,7 +11,7 @@ from catalog.tv.models import *
 import logging
 
 
-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)
 
 
 def search_tmdb_by_imdb_id(imdb_id):
@@ -155,7 +155,7 @@ class TMDB_Movie(AbstractSite):
             pd.cover_image = imgdl.download().content
             pd.cover_image_extention = imgdl.extention
         except Exception:
-            logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
+            _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
         return pd
 
 
@@ -264,7 +264,7 @@ class TMDB_TV(AbstractSite):
             pd.cover_image = imgdl.download().content
             pd.cover_image_extention = imgdl.extention
         except Exception:
-            logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
+            _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
         return pd
 
 
@@ -311,13 +311,13 @@ class TMDB_TVSeason(AbstractSite):
             pd.cover_image = imgdl.download().content
             pd.cover_image_extention = imgdl.extention
         except Exception:
-            logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
+            _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
         # get external id from 1st episode
         if pd.lookup_ids[IdType.IMDB]:
-            logger.warning("Unexpected IMDB id for TMDB tv season")
+            _logger.warning("Unexpected IMDB id for TMDB tv season")
         elif len(pd.metadata['episode_number_list']) == 0:
-            logger.warning("Unable to lookup IMDB id for TMDB tv season with zero episodes")
+            _logger.warning("Unable to lookup IMDB id for TMDB tv season with zero episodes")
         else:
             ep = pd.metadata['episode_number_list'][0]
             api_url2 = f"https://api.themoviedb.org/3/tv/{v[0]}/season/{v[1]}/episode/{ep}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
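-- 

Usage sketch (illustrative, not part of the patch): after this change, tests
toggle mock mode through the renamed snake_case helpers, and downloaders can
return a parsed lxml tree directly. A minimal sketch, assuming a hypothetical
test method that takes only self (use_local_response passes a single argument
through) and a canned response saved under test_data/ for the URL used:

    from catalog.common import use_local_response, get_mock_mode, BasicDownloader

    @use_local_response
    def test_scrape(self):
        # while the decorated test runs, get_mock_mode() is True, so
        # BasicDownloader serves MockResponse objects read from files
        # under test_data/ instead of fetching over the network
        assert get_mock_mode()
        # .html() exists on both MockResponse and requests.Response,
        # which is what the site scrapers above now call
        tree = BasicDownloader('https://example.org/book/1').download().html()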