rename a few methods

Your Name 2022-12-08 16:59:03 +00:00
parent 7d04d29613
commit e389fc302d
14 changed files with 115 additions and 110 deletions

View file

@@ -54,7 +54,7 @@ class Edition(Item):
         if work and work not in self.works.all():
             self.works.add(work)
         # if not work:
-        #     logger.info(f'Unable to find link for {w["url"]}')
+        #     _logger.info(f'Unable to find link for {w["url"]}')

 class Work(Item):

View file

@@ -5,4 +5,4 @@ from .scrapers import *
 from . import jsondata

-__all__ = ('IdType', 'Item', 'ExternalResource', 'ResourceContent', 'ParseError', 'ScraperMixin', 'AbstractSite', 'SiteList', 'jsondata', 'PrimaryLookupIdDescriptor', 'LookupIdDescriptor', 'setMockMode', 'getMockMode', 'use_local_response', 'RetryDownloader', 'BasicDownloader', 'ProxiedDownloader', 'BasicImageDownloader', 'RESPONSE_OK', 'RESPONSE_NETWORK_ERROR', 'RESPONSE_INVALID_CONTENT', 'RESPONSE_CENSORSHIP')
+__all__ = ('IdType', 'Item', 'ExternalResource', 'ResourceContent', 'ParseError', 'AbstractSite', 'SiteList', 'jsondata', 'PrimaryLookupIdDescriptor', 'LookupIdDescriptor', 'get_mock_mode', 'use_local_response', 'RetryDownloader', 'BasicDownloader', 'ProxiedDownloader', 'BasicImageDownloader', 'RESPONSE_OK', 'RESPONSE_NETWORK_ERROR', 'RESPONSE_INVALID_CONTENT', 'RESPONSE_CENSORSHIP')
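With ScraperMixin dropped from the package surface and the mock-mode helpers renamed, only the getter and the decorator remain exported; callers toggle mock mode through use_local_response instead of the old setMockMode/getMockMode pair. A minimal sketch of a caller (the test case itself is hypothetical; the imported names are the ones the new __all__ exports):

# Hypothetical test module illustrating the renamed public API.
from django.test import TestCase
from catalog.common import use_local_response

class DoubanBookTestCase(TestCase):
    @use_local_response
    def test_scrape(self):
        # runs with mock mode enabled; downloads are served from local fixtures
        ...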

View file

@@ -4,13 +4,16 @@ from PIL import Image
 from io import BytesIO
 from requests.exceptions import RequestException
 from django.conf import settings
-from .utils import MockResponse
+from pathlib import Path
+import json
+from io import StringIO
 import re
 import time
 import logging
+from lxml import html

-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)

 RESPONSE_OK = 0  # response is ready for pasring
@@ -18,29 +21,29 @@ RESPONSE_INVALID_CONTENT = -1  # content not valid but no need to retry
 RESPONSE_NETWORK_ERROR = -2  # network error, retry next proxied url
 RESPONSE_CENSORSHIP = -3  # censored, try sth special if possible

-MockMode = False
+_mock_mode = False


 def use_local_response(func):
     def _func(args):
-        setMockMode(True)
+        set_mock_mode(True)
         func(args)
-        setMockMode(False)
+        set_mock_mode(False)
     return _func


-def setMockMode(enabled):
-    global MockMode
-    MockMode = enabled
+def set_mock_mode(enabled):
+    global _mock_mode
+    _mock_mode = enabled


-def getMockMode():
-    global MockMode
-    return MockMode
+def get_mock_mode():
+    global _mock_mode
+    return _mock_mode
 class DownloadError(Exception):
-    def __init__(self, downloader):
+    def __init__(self, downloader, msg=None):
         self.url = downloader.url
         self.logs = downloader.logs
         if downloader.response_type == RESPONSE_INVALID_CONTENT:
@@ -51,7 +54,7 @@ class DownloadError(Exception):
             error = "Censored Content"
         else:
             error = "Unknown Error"
-        self.message = f"Download Failed: {error}, url: {self.url}"
+        self.message = f"Download Failed: {error}{', ' + msg if msg else ''}, url: {self.url}"
         super().__init__(self.message)
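The optional msg is spliced into the message only when present. A stub downloader (hypothetical, exercising only the attributes DownloadError reads) shows both shapes:

class _StubDownloader:  # hypothetical; DownloadError only reads these three attributes
    url = 'https://example.org/item/1'
    logs = []
    response_type = RESPONSE_CENSORSHIP

print(DownloadError(_StubDownloader()).message)
# Download Failed: Censored Content, url: https://example.org/item/1
print(DownloadError(_StubDownloader(), 'max out of retries').message)
# Download Failed: Censored Content, max out of retries, url: https://example.org/item/1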
@@ -88,7 +91,7 @@ class BasicDownloader:
     def _download(self, url):
         try:
-            if not MockMode:
+            if not _mock_mode:
                 # TODO cache = get/set from redis
                 resp = requests.get(url, headers=self.headers, timeout=self.get_timeout())
                 if settings.DOWNLOADER_SAVEDIR:
@@ -159,7 +162,9 @@ class RetryDownloader(BasicDownloader):
             elif self.response_type != RESPONSE_NETWORK_ERROR and retries == 0:
                 raise DownloadError(self)
             elif retries > 0:
+                _logger.debug('Retry ' + self.url)
                 time.sleep((settings.DOWNLOADER_RETRIES - retries) * 0.5)
+        raise DownloadError(self, 'max out of retries')

 class ImageDownloaderMixin:
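The sleep gives RetryDownloader a linear backoff, and the trailing raise now reports exhausted retries through the new msg argument instead of failing silently. Assuming settings.DOWNLOADER_RETRIES = 3 and a retries counter that counts down (both assumptions; the loop header is outside this hunk), the waits work out to:

# Backoff schedule sketch; DOWNLOADER_RETRIES = 3 and the countdown are assumptions.
DOWNLOADER_RETRIES = 3
for retries in range(DOWNLOADER_RETRIES - 1, 0, -1):
    print(f'wait {(DOWNLOADER_RETRIES - retries) * 0.5}s before next attempt')
# wait 0.5s before next attempt
# wait 1.0s before next attempt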
@@ -191,3 +196,41 @@ class BasicImageDownloader(ImageDownloaderMixin, BasicDownloader):
 class ProxiedImageDownloader(ImageDownloaderMixin, ProxiedDownloader):
     pass

+_local_response_path = str(Path(__file__).parent.parent.parent.absolute()) + '/test_data/'
+
+
+class MockResponse:
+    def get_mock_file(self, url):
+        fn = _local_response_path + re.sub(r'[^\w]', '_', url)
+        return re.sub(r'_key_[A-Za-z0-9]+', '_key_19890604', fn)
+
+    def __init__(self, url):
+        self.url = url
+        fn = self.get_mock_file(url)
+        try:
+            self.content = Path(fn).read_bytes()
+            self.status_code = 200
+            _logger.debug(f"use local response for {url} from {fn}")
+        except Exception:
+            self.content = b'Error: response file not found'
+            self.status_code = 404
+            _logger.debug(f"local response not found for {url} at {fn}")
+
+    @property
+    def text(self):
+        return self.content.decode('utf-8')
+
+    def json(self):
+        return json.load(StringIO(self.text))
+
+    def html(self):
+        return html.fromstring(self.text)  # may throw exception unexpectedly due to OS bug
+
+    @property
+    def headers(self):
+        return {'Content-Type': 'image/jpeg' if self.url.endswith('jpg') else 'text/html'}
+
+
+requests.Response.html = MockResponse.html
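MockResponse maps a URL to a fixture under test_data/ by flattening every non-word character and normalizing any API key, so recorded responses never embed real credentials; the final line grafts the same html() helper onto requests.Response, which is why live code can call download().html() uniformly. A worked example of the mapping (URL hypothetical; both substitutions are the ones in get_mock_file):

import re
url = 'https://api.example.com/v1/show?key=AbC123'       # hypothetical URL
fn = re.sub(r'[^\w]', '_', url)                          # flatten to a filename
fn = re.sub(r'_key_[A-Za-z0-9]+', '_key_19890604', fn)   # normalize the API key
print(fn)  # https___api_example_com_v1_show_key_19890604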

View file

@@ -2,22 +2,3 @@ class ParseError(Exception):
     def __init__(self, scraper, field):
         msg = f'{type(scraper).__name__}: Error parsing field "{field}" for url {scraper.url}'
         super().__init__(msg)
-
-
-class ScraperMixin:
-    def set_field(self, field, value=None):
-        self.data[field] = value
-
-    def parse_str(self, query):
-        elem = self.html.xpath(query)
-        return elem[0].strip() if elem else None
-
-    def parse_field(self, field, query, error_when_missing=False):
-        elem = self.html.xpath(query)
-        if elem:
-            self.data[field] = elem[0].strip()
-        elif error_when_missing:
-            raise ParseError(self, field)
-        else:
-            self.data[field] = None
-        return elem

View file

@@ -5,7 +5,7 @@ from dataclasses import dataclass, field
 import logging

-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)


 @dataclass
@@ -100,7 +100,7 @@ class AbstractSite:
         resource_content = self.scrape()
         p.update_content(resource_content)
         if not p.ready:
-            logger.error(f'unable to get resource {self.url} ready')
+            _logger.error(f'unable to get resource {self.url} ready')
             return None
         if auto_create and p.item is None:
             self.get_item()
@@ -115,7 +115,7 @@ class AbstractSite:
             if linked_site:
                 linked_site.get_resource_ready(auto_link=False)
             else:
-                logger.error(f'unable to get site for {linked_resources["url"]}')
+                _logger.error(f'unable to get site for {linked_resources["url"]}')
         p.item.update_linked_items_from_external_resource(p)
         p.item.save()
         return p

View file

@@ -1,14 +1,9 @@
-from pathlib import Path
-# import hashlib
-import json
-from io import StringIO
 import logging
-import re
 from django.utils import timezone
 import uuid

-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)

 DEFAULT_ITEM_COVER = 'item/default.svg'
@@ -17,35 +12,3 @@ DEFAULT_ITEM_COVER = 'item/default.svg'
 def item_cover_path(resource, filename):
     fn = timezone.now().strftime('%Y/%m/%d/') + str(uuid.uuid4()) + '.' + filename.split('.')[-1]
     return 'items/' + resource.id_type + '/' + fn
-
-
-TestDataDir = str(Path(__file__).parent.parent.parent.absolute()) + '/test_data/'
-
-
-class MockResponse:
-    def get_mock_file(self, url):
-        fn = TestDataDir + re.sub(r'[^\w]', '_', url)
-        return re.sub(r'_key_[A-Za-z0-9]+', '_key_19890604', fn)
-
-    def __init__(self, url):
-        self.url = url
-        fn = self.get_mock_file(url)
-        try:
-            self.content = Path(fn).read_bytes()
-            self.status_code = 200
-            logger.debug(f"use local response for {url} from {fn}")
-        except Exception:
-            self.content = b'Error: response file not found'
-            self.status_code = 404
-            logger.debug(f"local response not found for {url} at {fn}")
-
-    @property
-    def text(self):
-        return self.content.decode('utf-8')
-
-    def json(self):
-        return json.load(StringIO(self.text))
-
-    @property
-    def headers(self):
-        return {'Content-Type': 'image/jpeg' if self.url.endswith('jpg') else 'text/html'}

View file

@@ -1,9 +1,9 @@
 from catalog.common import *
-from catalog.podcast.models import *
+from catalog.models import *
 import logging

-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)


 @SiteList.register
@@ -36,5 +36,5 @@ class ApplePodcast(AbstractSite):
             pd.cover_image = imgdl.download().content
             pd.cover_image_extention = imgdl.extention
         except Exception:
-            logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
+            _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
         return pd

View file

@@ -1,4 +1,3 @@
-from lxml import html
 from catalog.common import *
 from .douban import *
 from catalog.book.models import *
@@ -6,7 +5,26 @@ from catalog.book.utils import *
 import logging

-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)


+class ScraperMixin:
+    def set_field(self, field, value=None):
+        self.data[field] = value
+
+    def parse_str(self, query):
+        elem = self.html.xpath(query)
+        return elem[0].strip() if elem else None
+
+    def parse_field(self, field, query, error_when_missing=False):
+        elem = self.html.xpath(query)
+        if elem:
+            self.data[field] = elem[0].strip()
+        elif error_when_missing:
+            raise ParseError(self, field)
+        else:
+            self.data[field] = None
+        return elem
+
+
 @SiteList.register
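ScraperMixin now lives beside its only remaining consumers: parse_field stores the first match of an XPath query into self.data, raising ParseError only when asked to. A standalone sketch (host class and markup hypothetical):

from lxml import html

class DemoScraper(ScraperMixin):  # hypothetical host; ParseError wants a url attribute
    url = 'https://example.org/book'
    def __init__(self, markup):
        self.data = {}
        self.html = html.fromstring(markup)

s = DemoScraper('<html><body><h1><span> Demo Title </span></h1></body></html>')
s.parse_field('title', '/html/body//h1/span/text()')
print(s.data['title'])  # Demo Title
s.parse_field('isbn', "//span[text()='ISBN:']/following::text()")
print(s.data['isbn'])   # None (missing, and error_when_missing defaults to False)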
@@ -22,7 +40,7 @@ class DoubanBook(AbstractSite, ScraperMixin):
     def scrape(self):
         self.data = {}
-        self.html = html.fromstring(DoubanDownloader(self.url).download().text.strip())
+        self.html = DoubanDownloader(self.url).download().html()
         self.parse_field('title', "/html/body//h1/span/text()")
         self.parse_field('isbn', "//div[@id='info']//span[text()='ISBN:']/following::text()")
         # TODO does douban store ASIN as ISBN, need more cleanup if so
@@ -127,7 +145,7 @@ class DoubanBook(AbstractSite, ScraperMixin):
             pd.cover_image = imgdl.download().content
             pd.cover_image_extention = imgdl.extention
         except Exception:
-            logger.debug(f'failed to download cover for {self.url} from {self.data["cover_image_url"]}')
+            _logger.debug(f'failed to download cover for {self.url} from {self.data["cover_image_url"]}')
         return pd
@@ -151,7 +169,7 @@ class DoubanBook_Work(AbstractSite):
         return pd

     def scrape(self):
-        content = html.fromstring(DoubanDownloader(self.url).download().text.strip())
+        content = DoubanDownloader(self.url).download().html()
         title_elem = content.xpath("//h1/text()")
         title = title_elem[0].split('全部版本(')[0].strip() if title_elem else None
         if not title:

View file

@@ -1,11 +1,10 @@
-from lxml import html
 from catalog.common import *
-from ..performance.models import Performance
+from catalog.models import *
 from .douban import DoubanDownloader
 import logging

-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)


 @SiteList.register
@@ -20,7 +19,7 @@ class DoubanDrama(AbstractSite):
         return "https://www.douban.com/location/drama/" + id_value + "/"

     def scrape(self):
-        h = html.fromstring(DoubanDownloader(self.url).download().text)
+        h = DoubanDownloader(self.url).download().html()
         data = {}

         title_elem = h.xpath("/html/body//h1/span/text()")
@@ -55,5 +54,5 @@ class DoubanDrama(AbstractSite):
             pd.cover_image = imgdl.download().content
             pd.cover_image_extention = imgdl.extention
         except Exception:
-            logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
+            _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
         return pd

View file

@@ -1,4 +1,3 @@
-from lxml import html
 from catalog.common import *
 from .douban import *
 from catalog.movie.models import *
@@ -9,7 +8,7 @@ from django.utils.translation import gettext_lazy as _
 from .tmdb import TMDB_TV, search_tmdb_by_imdb_id

-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)


 class MovieGenreEnum(models.TextChoices):
@@ -67,7 +66,7 @@ class DoubanMovie(AbstractSite):
         return "https://movie.douban.com/subject/" + id_value + "/"

     def scrape(self):
-        content = html.fromstring(DoubanDownloader(self.url).download().text.strip())
+        content = DoubanDownloader(self.url).download().html()

         try:
             raw_title = content.xpath(
@@ -131,7 +130,7 @@ class DoubanMovie(AbstractSite):
                 elif g in genre_translator.values():
                     genre.append(g)
                 else:
-                    logger.error(f'unable to map genre {g}')
+                    _logger.error(f'unable to map genre {g}')
         else:
             genre = None
@@ -253,7 +252,7 @@ class DoubanMovie(AbstractSite):
                 pd.metadata['preferred_model'] = 'TVSeason'
                 tmdb_show_id = res_data['tv_episode_results'][0]['show_id']
                 if res_data['tv_episode_results'][0]['episode_number'] != 1:
-                    logger.error(f'Douban Movie {self.url} mapping to unexpected imdb episode {imdb_code}')
+                    _logger.error(f'Douban Movie {self.url} mapping to unexpected imdb episode {imdb_code}')
                     # TODO correct the IMDB id
                 pd.lookup_ids[IdType.IMDB] = imdb_code
                 if tmdb_show_id:
@@ -272,5 +271,5 @@ class DoubanMovie(AbstractSite):
             pd.cover_image = imgdl.download().content
             pd.cover_image_extention = imgdl.extention
         except Exception:
-            logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
+            _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
         return pd

View file

@@ -16,7 +16,9 @@ class GoodreadsDownloader(RetryDownloader):
         elif response.status_code == 200:
             if response.text.find('__NEXT_DATA__') != -1:
                 return RESPONSE_OK
-            else:  # retry if return legacy version
+            else:
+                # Goodreads may return legacy version for a/b testing
+                # retry if so
                 return RESPONSE_NETWORK_ERROR
         else:
             return RESPONSE_INVALID_CONTENT
@@ -36,11 +38,10 @@ class Goodreads(AbstractSite):
     def scrape(self, response=None):
         data = {}
         if response is not None:
-            content = response.text
+            h = html.fromstring(response.text.strip())
         else:
             dl = GoodreadsDownloader(self.url)
-            content = dl.download().text
-        h = html.fromstring(content.strip())
+            h = dl.download().html()
         # Next.JS version of GoodReads
         # JSON.parse(document.getElementById('__NEXT_DATA__').innerHTML)['props']['pageProps']['apolloState']
         elem = h.xpath('//script[@id="__NEXT_DATA__"]/text()')
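The scraper reads Goodreads' Next.js payload rather than the rendered DOM. The same extraction written standalone (page_text is hypothetical):

import json
from lxml import html

doc = html.fromstring(page_text)  # page_text: raw Goodreads HTML, hypothetical
elem = doc.xpath('//script[@id="__NEXT_DATA__"]/text()')
src = elem[0].strip() if elem else None
apollo = json.loads(src)['props']['pageProps']['apolloState'] if src else None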
@@ -55,7 +56,8 @@ class Goodreads(AbstractSite):
                 o[t].append(v)
         b = next(filter(lambda x: x.get('title'), o['Book']), None)
         if not b:
-            raise ParseError(self, 'Book json')
+            # Goodreads may return empty page template when internal service timeouts
+            raise ParseError(self, 'Book in __NEXT_DATA__ json')
         data['title'] = b['title']
         data['brief'] = b['description']
         data['isbn'] = b['details'].get('isbn13')
@@ -68,7 +70,7 @@ class Goodreads(AbstractSite):
         if w:
             data['required_resources'] = [{
                 'model': 'Work',
                 'id_type': IdType.Goodreads_Work,
                 'id_value': str(w['legacyId']),
                 'title': w['details']['originalTitle'],
                 'url': w['editions']['webUrl'],
@@ -98,7 +100,7 @@ class Goodreads_Work(AbstractSite):
         return "https://www.goodreads.com/work/editions/" + id_value

     def scrape(self, response=None):
-        content = html.fromstring(BasicDownloader(self.url).download().text.strip())
+        content = BasicDownloader(self.url).download().html()
         title_elem = content.xpath("//h1/a/text()")
         title = title_elem[0].strip() if title_elem else None
         if not title:

View file

@@ -5,7 +5,7 @@ from catalog.tv.models import *
 import logging

-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)


 @SiteList.register

View file

@@ -99,7 +99,7 @@ class Spotify(AbstractSite):
 def get_spotify_token():
     global spotify_token, spotify_token_expire_time
-    if getMockMode():
+    if get_mock_mode():
         return 'mocked'
     if spotify_token is None or is_spotify_token_expired():
         invoke_spotify_token()
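get_spotify_token short-circuits in mock mode so recorded fixtures never require a live credential. The surrounding cache-and-refresh pattern, reduced to a sketch (every name except get_mock_mode is hypothetical):

import time

_token, _expire_at = None, 0.0

def get_token():
    global _token, _expire_at
    if get_mock_mode():
        return 'mocked'  # tests never hit the real auth endpoint
    if _token is None or time.time() >= _expire_at:
        _token, _expire_at = _fetch_token()  # hypothetical refresh helper
    return _token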

View file

@@ -11,7 +11,7 @@ from catalog.tv.models import *
 import logging

-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)


 def search_tmdb_by_imdb_id(imdb_id):
@@ -155,7 +155,7 @@ class TMDB_Movie(AbstractSite):
             pd.cover_image = imgdl.download().content
             pd.cover_image_extention = imgdl.extention
         except Exception:
-            logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
+            _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
         return pd
@@ -264,7 +264,7 @@ class TMDB_TV(AbstractSite):
             pd.cover_image = imgdl.download().content
             pd.cover_image_extention = imgdl.extention
         except Exception:
-            logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
+            _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
         return pd
@@ -311,13 +311,13 @@ class TMDB_TVSeason(AbstractSite):
             pd.cover_image = imgdl.download().content
             pd.cover_image_extention = imgdl.extention
         except Exception:
-            logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
+            _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
         # get external id from 1st episode
         if pd.lookup_ids[IdType.IMDB]:
-            logger.warning("Unexpected IMDB id for TMDB tv season")
+            _logger.warning("Unexpected IMDB id for TMDB tv season")
         elif len(pd.metadata['episode_number_list']) == 0:
-            logger.warning("Unable to lookup IMDB id for TMDB tv season with zero episodes")
+            _logger.warning("Unable to lookup IMDB id for TMDB tv season with zero episodes")
         else:
             ep = pd.metadata['episode_number_list'][0]
             api_url2 = f"https://api.themoviedb.org/3/tv/{v[0]}/season/{v[1]}/episode/{ep}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"