rename a few methods

Your Name 2022-12-08 16:59:03 +00:00
parent 7d04d29613
commit e389fc302d
14 changed files with 115 additions and 110 deletions


@@ -54,7 +54,7 @@ class Edition(Item):
if work and work not in self.works.all():
self.works.add(work)
# if not work:
# logger.info(f'Unable to find link for {w["url"]}')
# _logger.info(f'Unable to find link for {w["url"]}')
class Work(Item):


@@ -5,4 +5,4 @@ from .scrapers import *
from . import jsondata
__all__ = ('IdType', 'Item', 'ExternalResource', 'ResourceContent', 'ParseError', 'ScraperMixin', 'AbstractSite', 'SiteList', 'jsondata', 'PrimaryLookupIdDescriptor', 'LookupIdDescriptor', 'setMockMode', 'getMockMode', 'use_local_response', 'RetryDownloader', 'BasicDownloader', 'ProxiedDownloader', 'BasicImageDownloader', 'RESPONSE_OK', 'RESPONSE_NETWORK_ERROR', 'RESPONSE_INVALID_CONTENT', 'RESPONSE_CENSORSHIP')
__all__ = ('IdType', 'Item', 'ExternalResource', 'ResourceContent', 'ParseError', 'AbstractSite', 'SiteList', 'jsondata', 'PrimaryLookupIdDescriptor', 'LookupIdDescriptor', 'get_mock_mode', 'use_local_response', 'RetryDownloader', 'BasicDownloader', 'ProxiedDownloader', 'BasicImageDownloader', 'RESPONSE_OK', 'RESPONSE_NETWORK_ERROR', 'RESPONSE_INVALID_CONTENT', 'RESPONSE_CENSORSHIP')


@@ -4,13 +4,16 @@ from PIL import Image
from io import BytesIO
from requests.exceptions import RequestException
from django.conf import settings
from .utils import MockResponse
from pathlib import Path
import json
from io import StringIO
import re
import time
import logging
from lxml import html
logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)
RESPONSE_OK = 0 # response is ready for parsing
@@ -18,29 +21,29 @@ RESPONSE_INVALID_CONTENT = -1 # content not valid but no need to retry
RESPONSE_NETWORK_ERROR = -2 # network error, retry next proxied url
RESPONSE_CENSORSHIP = -3 # censored, try something special if possible
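These status codes are what a downloader's validate_response hook reports back; as a rough sketch (the subclass and checks are hypothetical, modeled on the GoodreadsDownloader change further down):

class ExampleDownloader(RetryDownloader):
    def validate_response(self, response):
        # hypothetical mapping from an HTTP response to the codes above
        if response is None:
            return RESPONSE_NETWORK_ERROR
        if response.status_code == 200:
            return RESPONSE_OK
        return RESPONSE_INVALID_CONTENT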
MockMode = False
_mock_mode = False
def use_local_response(func):
def _func(args):
setMockMode(True)
set_mock_mode(True)
func(args)
setMockMode(False)
set_mock_mode(False)
return _func
def setMockMode(enabled):
global MockMode
MockMode = enabled
def set_mock_mode(enabled):
global _mock_mode
_mock_mode = enabled
def getMockMode():
global MockMode
return MockMode
def get_mock_mode():
global _mock_mode
return _mock_mode
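A minimal sketch of how the renamed helpers are meant to be used in tests; the test case, URL, and mapped file name below are hypothetical:

from django.test import TestCase
from catalog.common import *

class ExampleTestCase(TestCase):
    @use_local_response
    def test_downloader(self):
        # use_local_response turns _mock_mode on for the duration of the test,
        # so the downloader serves a saved file from test_data/ instead of
        # hitting the network (per MockResponse.get_mock_file, this URL maps
        # to a file named https___example_org_page)
        content = BasicDownloader('https://example.org/page').download().text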
class DownloadError(Exception):
def __init__(self, downloader):
def __init__(self, downloader, msg=None):
self.url = downloader.url
self.logs = downloader.logs
if downloader.response_type == RESPONSE_INVALID_CONTENT:
@@ -51,7 +54,7 @@ class DownloadError(Exception):
error = "Censored Content"
else:
error = "Unknown Error"
self.message = f"Download Failed: {error}, url: {self.url}"
self.message = f"Download Failed: {error}{', ' + msg if msg else ''}, url: {self.url}"
super().__init__(self.message)
@@ -88,7 +91,7 @@ class BasicDownloader:
def _download(self, url):
try:
if not MockMode:
if not _mock_mode:
# TODO cache = get/set from redis
resp = requests.get(url, headers=self.headers, timeout=self.get_timeout())
if settings.DOWNLOADER_SAVEDIR:
@@ -159,7 +162,9 @@ class RetryDownloader(BasicDownloader):
elif self.response_type != RESPONSE_NETWORK_ERROR and retries == 0:
raise DownloadError(self)
elif retries > 0:
_logger.debug('Retry ' + self.url)
time.sleep((settings.DOWNLOADER_RETRIES - retries) * 0.5)
raise DownloadError(self, 'max retries exceeded')
class ImageDownloaderMixin:
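With the new optional msg argument, a caller can tell exhausted retries apart from other failures; a hedged sketch (URL and import path are assumptions):

import logging
from catalog.common.downloaders import RetryDownloader, DownloadError  # module path assumed

logger = logging.getLogger(__name__)
try:
    resp = RetryDownloader('https://example.org/page').download()
except DownloadError as e:
    # e.message now reads like "Download Failed: <error>, max retries exceeded, url: ..."
    logger.error(e.message)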
@@ -191,3 +196,41 @@ class BasicImageDownloader(ImageDownloaderMixin, BasicDownloader):
class ProxiedImageDownloader(ImageDownloaderMixin, ProxiedDownloader):
pass
_local_response_path = str(Path(__file__).parent.parent.parent.absolute()) + '/test_data/'
class MockResponse:
def get_mock_file(self, url):
fn = _local_response_path + re.sub(r'[^\w]', '_', url)
return re.sub(r'_key_[A-Za-z0-9]+', '_key_19890604', fn)
def __init__(self, url):
self.url = url
fn = self.get_mock_file(url)
try:
self.content = Path(fn).read_bytes()
self.status_code = 200
_logger.debug(f"use local response for {url} from {fn}")
except Exception:
self.content = b'Error: response file not found'
self.status_code = 404
_logger.debug(f"local response not found for {url} at {fn}")
@property
def text(self):
return self.content.decode('utf-8')
def json(self):
return json.load(StringIO(self.text))
def html(self):
return html.fromstring(self.text) # may throw exception unexpectedly due to OS bug
@property
def headers(self):
return {'Content-Type': 'image/jpeg' if self.url.endswith('jpg') else 'text/html'}
requests.Response.html = MockResponse.html
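The last line grafts MockResponse.html onto requests.Response, so live and mocked responses expose the same .html() accessor; a sketch of the call pattern this enables (URL illustrative):

# identical code path whether _mock_mode is on or off:
h = BasicDownloader('https://example.org/page').download().html()
title_elem = h.xpath('/html/body//h1/span/text()')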


@@ -2,22 +2,3 @@ class ParseError(Exception):
def __init__(self, scraper, field):
msg = f'{type(scraper).__name__}: Error parsing field "{field}" for url {scraper.url}'
super().__init__(msg)
class ScraperMixin:
def set_field(self, field, value=None):
self.data[field] = value
def parse_str(self, query):
elem = self.html.xpath(query)
return elem[0].strip() if elem else None
def parse_field(self, field, query, error_when_missing=False):
elem = self.html.xpath(query)
if elem:
self.data[field] = elem[0].strip()
elif error_when_missing:
raise ParseError(self, field)
else:
self.data[field] = None
return elem


@@ -5,7 +5,7 @@ from dataclasses import dataclass, field
import logging
logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)
@dataclass
@@ -100,7 +100,7 @@ class AbstractSite:
resource_content = self.scrape()
p.update_content(resource_content)
if not p.ready:
logger.error(f'unable to get resource {self.url} ready')
_logger.error(f'unable to get resource {self.url} ready')
return None
if auto_create and p.item is None:
self.get_item()
@@ -115,7 +115,7 @@ class AbstractSite:
if linked_site:
linked_site.get_resource_ready(auto_link=False)
else:
logger.error(f'unable to get site for {linked_resources["url"]}')
_logger.error(f'unable to get site for {linked_resources["url"]}')
p.item.update_linked_items_from_external_resource(p)
p.item.save()
return p
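For reference, a sketch of the entry point this method provides; the site class, URL, and constructor signature are illustrative:

site = ApplePodcast('https://podcasts.apple.com/us/podcast/id1000000000')
resource = site.get_resource_ready()  # scrape, save, auto-create the item, follow linked resources
if resource:
    print(resource.item)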


@@ -1,14 +1,9 @@
from pathlib import Path
# import hashlib
import json
from io import StringIO
import logging
import re
from django.utils import timezone
import uuid
logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)
DEFAULT_ITEM_COVER = 'item/default.svg'
@@ -17,35 +12,3 @@ DEFAULT_ITEM_COVER = 'item/default.svg'
def item_cover_path(resource, filename):
fn = timezone.now().strftime('%Y/%m/%d/') + str(uuid.uuid4()) + '.' + filename.split('.')[-1]
return 'items/' + resource.id_type + '/' + fn
TestDataDir = str(Path(__file__).parent.parent.parent.absolute()) + '/test_data/'
class MockResponse:
def get_mock_file(self, url):
fn = TestDataDir + re.sub(r'[^\w]', '_', url)
return re.sub(r'_key_[A-Za-z0-9]+', '_key_19890604', fn)
def __init__(self, url):
self.url = url
fn = self.get_mock_file(url)
try:
self.content = Path(fn).read_bytes()
self.status_code = 200
logger.debug(f"use local response for {url} from {fn}")
except Exception:
self.content = b'Error: response file not found'
self.status_code = 404
logger.debug(f"local response not found for {url} at {fn}")
@property
def text(self):
return self.content.decode('utf-8')
def json(self):
return json.load(StringIO(self.text))
@property
def headers(self):
return {'Content-Type': 'image/jpeg' if self.url.endswith('jpg') else 'text/html'}


@@ -1,9 +1,9 @@
from catalog.common import *
from catalog.podcast.models import *
from catalog.models import *
import logging
logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)
@SiteList.register
@@ -36,5 +36,5 @@ class ApplePodcast(AbstractSite):
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
return pd


@@ -1,4 +1,3 @@
from lxml import html
from catalog.common import *
from .douban import *
from catalog.book.models import *
@@ -6,7 +5,26 @@ from catalog.book.utils import *
import logging
logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)
class ScraperMixin:
def set_field(self, field, value=None):
self.data[field] = value
def parse_str(self, query):
elem = self.html.xpath(query)
return elem[0].strip() if elem else None
def parse_field(self, field, query, error_when_missing=False):
elem = self.html.xpath(query)
if elem:
self.data[field] = elem[0].strip()
elif error_when_missing:
raise ParseError(self, field)
else:
self.data[field] = None
return elem
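The mixin now lives beside its only remaining consumers; a short illustration of the two parse_field modes (the scraper class, field names, and XPath queries are illustrative):

class ExampleBookScraper(AbstractSite, ScraperMixin):
    def scrape(self):
        self.data = {}
        self.html = DoubanDownloader(self.url).download().html()
        # a required field raises ParseError when the XPath matches nothing
        self.parse_field('title', "/html/body//h1/span/text()", error_when_missing=True)
        # an optional field is simply stored as None when missing
        self.parse_field('subtitle', "//span[@class='subtitle']/text()")
        return ResourceContent(metadata=self.data)  # assuming ResourceContent accepts metadata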
@SiteList.register
@@ -22,7 +40,7 @@ class DoubanBook(AbstractSite, ScraperMixin):
def scrape(self):
self.data = {}
self.html = html.fromstring(DoubanDownloader(self.url).download().text.strip())
self.html = DoubanDownloader(self.url).download().html()
self.parse_field('title', "/html/body//h1/span/text()")
self.parse_field('isbn', "//div[@id='info']//span[text()='ISBN:']/following::text()")
# TODO: does Douban store ASIN as ISBN? needs more cleanup if so
@@ -127,7 +145,7 @@ class DoubanBook(AbstractSite, ScraperMixin):
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
logger.debug(f'failed to download cover for {self.url} from {self.data["cover_image_url"]}')
_logger.debug(f'failed to download cover for {self.url} from {self.data["cover_image_url"]}')
return pd
@@ -151,7 +169,7 @@ class DoubanBook_Work(AbstractSite):
return pd
def scrape(self):
content = html.fromstring(DoubanDownloader(self.url).download().text.strip())
content = DoubanDownloader(self.url).download().html()
title_elem = content.xpath("//h1/text()")
title = title_elem[0].split('全部版本(')[0].strip() if title_elem else None
if not title:


@@ -1,11 +1,10 @@
from lxml import html
from catalog.common import *
from ..performance.models import Performance
from catalog.models import *
from .douban import DoubanDownloader
import logging
logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)
@SiteList.register
@@ -20,7 +19,7 @@ class DoubanDrama(AbstractSite):
return "https://www.douban.com/location/drama/" + id_value + "/"
def scrape(self):
h = html.fromstring(DoubanDownloader(self.url).download().text)
h = DoubanDownloader(self.url).download().html()
data = {}
title_elem = h.xpath("/html/body//h1/span/text()")
@@ -55,5 +54,5 @@ class DoubanDrama(AbstractSite):
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
return pd


@@ -1,4 +1,3 @@
from lxml import html
from catalog.common import *
from .douban import *
from catalog.movie.models import *
@@ -9,7 +8,7 @@ from django.utils.translation import gettext_lazy as _
from .tmdb import TMDB_TV, search_tmdb_by_imdb_id
logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)
class MovieGenreEnum(models.TextChoices):
@@ -67,7 +66,7 @@ class DoubanMovie(AbstractSite):
return "https://movie.douban.com/subject/" + id_value + "/"
def scrape(self):
content = html.fromstring(DoubanDownloader(self.url).download().text.strip())
content = DoubanDownloader(self.url).download().html()
try:
raw_title = content.xpath(
@@ -131,7 +130,7 @@ class DoubanMovie(AbstractSite):
elif g in genre_translator.values():
genre.append(g)
else:
logger.error(f'unable to map genre {g}')
_logger.error(f'unable to map genre {g}')
else:
genre = None
@@ -253,7 +252,7 @@ class DoubanMovie(AbstractSite):
pd.metadata['preferred_model'] = 'TVSeason'
tmdb_show_id = res_data['tv_episode_results'][0]['show_id']
if res_data['tv_episode_results'][0]['episode_number'] != 1:
logger.error(f'Douban Movie {self.url} mapping to unexpected imdb episode {imdb_code}')
_logger.error(f'Douban Movie {self.url} mapping to unexpected imdb episode {imdb_code}')
# TODO correct the IMDB id
pd.lookup_ids[IdType.IMDB] = imdb_code
if tmdb_show_id:
@@ -272,5 +271,5 @@ class DoubanMovie(AbstractSite):
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
return pd


@@ -16,7 +16,9 @@ class GoodreadsDownloader(RetryDownloader):
elif response.status_code == 200:
if response.text.find('__NEXT_DATA__') != -1:
return RESPONSE_OK
else: # retry if return legacy version
else:
# Goodreads may return the legacy page for A/B testing;
# retry if so
return RESPONSE_NETWORK_ERROR
else:
return RESPONSE_INVALID_CONTENT
@@ -36,11 +38,10 @@ class Goodreads(AbstractSite):
def scrape(self, response=None):
data = {}
if response is not None:
content = response.text
h = html.fromstring(response.text.strip())
else:
dl = GoodreadsDownloader(self.url)
content = dl.download().text
h = html.fromstring(content.strip())
h = dl.download().html()
# Next.js version of Goodreads
# JSON.parse(document.getElementById('__NEXT_DATA__').innerHTML)['props']['pageProps']['apolloState']
elem = h.xpath('//script[@id="__NEXT_DATA__"]/text()')
@@ -55,7 +56,8 @@ class Goodreads(AbstractSite):
o[t].append(v)
b = next(filter(lambda x: x.get('title'), o['Book']), None)
if not b:
raise ParseError(self, 'Book json')
# Goodreads may return an empty page template when an internal service times out
raise ParseError(self, 'Book in __NEXT_DATA__ json')
data['title'] = b['title']
data['brief'] = b['description']
data['isbn'] = b['details'].get('isbn13')
@@ -68,7 +70,7 @@ class Goodreads(AbstractSite):
if w:
data['required_resources'] = [{
'model': 'Work',
'id_type': IdType.Goodreads_Work,
'id_type': IdType.Goodreads_Work,
'id_value': str(w['legacyId']),
'title': w['details']['originalTitle'],
'url': w['editions']['webUrl'],
@@ -98,7 +100,7 @@ class Goodreads_Work(AbstractSite):
return "https://www.goodreads.com/work/editions/" + id_value
def scrape(self, response=None):
content = html.fromstring(BasicDownloader(self.url).download().text.strip())
content = BasicDownloader(self.url).download().html()
title_elem = content.xpath("//h1/a/text()")
title = title_elem[0].strip() if title_elem else None
if not title:


@@ -5,7 +5,7 @@ from catalog.tv.models import *
import logging
logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)
@SiteList.register


@@ -99,7 +99,7 @@ class Spotify(AbstractSite):
def get_spotify_token():
global spotify_token, spotify_token_expire_time
if getMockMode():
if get_mock_mode():
return 'mocked'
if spotify_token is None or is_spotify_token_expired():
invoke_spotify_token()


@@ -11,7 +11,7 @@ from catalog.tv.models import *
import logging
logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)
def search_tmdb_by_imdb_id(imdb_id):
@@ -155,7 +155,7 @@ class TMDB_Movie(AbstractSite):
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
return pd
@@ -264,7 +264,7 @@ class TMDB_TV(AbstractSite):
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
return pd
@@ -311,13 +311,13 @@ class TMDB_TVSeason(AbstractSite):
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
# get external id from 1st episode
if pd.lookup_ids[IdType.IMDB]:
logger.warning("Unexpected IMDB id for TMDB tv season")
_logger.warning("Unexpected IMDB id for TMDB tv season")
elif len(pd.metadata['episode_number_list']) == 0:
logger.warning("Unable to lookup IMDB id for TMDB tv season with zero episodes")
_logger.warning("Unable to lookup IMDB id for TMDB tv season with zero episodes")
else:
ep = pd.metadata['episode_number_list'][0]
api_url2 = f"https://api.themoviedb.org/3/tv/{v[0]}/season/{v[1]}/episode/{ep}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"