lib.itmens/catalog/common/downloaders.py
2022-12-16 12:00:05 -05:00

247 lines
8.5 KiB
Python

import requests
import filetype
from PIL import Image
from io import BytesIO
from requests.exceptions import RequestException
from django.conf import settings
from pathlib import Path
import json
from io import StringIO
import re
import time
import logging
from lxml import html
_logger = logging.getLogger(__name__)
RESPONSE_OK = 0 # response is ready for pasring
RESPONSE_INVALID_CONTENT = -1 # content not valid but no need to retry
RESPONSE_NETWORK_ERROR = -2 # network error, retry next proxied url
RESPONSE_CENSORSHIP = -3 # censored, try sth special if possible
_mock_mode = False
def use_local_response(func):
def _func(args):
set_mock_mode(True)
func(args)
set_mock_mode(False)
return _func
def set_mock_mode(enabled):
global _mock_mode
_mock_mode = enabled
def get_mock_mode():
global _mock_mode
return _mock_mode
def get_mock_file(url):
fn = url.replace('***REMOVED***', '1234') # Thank you, Github Action -_-!
fn = re.sub(r'[^\w]', '_', fn)
fn = re.sub(r'_key_[*A-Za-z0-9]+', '_key_8964', fn)
return fn
class DownloadError(Exception):
def __init__(self, downloader, msg=None):
self.url = downloader.url
self.logs = downloader.logs
if downloader.response_type == RESPONSE_INVALID_CONTENT:
error = "Invalid Response"
elif downloader.response_type == RESPONSE_NETWORK_ERROR:
error = "Network Error"
elif downloader.response_type == RESPONSE_NETWORK_ERROR:
error = "Censored Content"
else:
error = "Unknown Error"
self.message = f"Download Failed: {error}{', ' + msg if msg else ''}, url: {self.url}"
super().__init__(self.message)
class BasicDownloader:
headers = {
# 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0',
'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
'DNT': '1',
'Upgrade-Insecure-Requests': '1',
'Cache-Control': 'no-cache',
}
def __init__(self, url, headers=None):
self.url = url
self.response_type = RESPONSE_OK
self.logs = []
if headers:
self.headers = headers
def get_timeout(self):
return settings.SCRAPING_TIMEOUT
def validate_response(self, response):
if response is None:
return RESPONSE_NETWORK_ERROR
elif response.status_code == 200:
return RESPONSE_OK
else:
return RESPONSE_INVALID_CONTENT
def _download(self, url):
try:
if not _mock_mode:
# TODO cache = get/set from redis
resp = requests.get(url, headers=self.headers, timeout=self.get_timeout())
if settings.DOWNLOADER_SAVEDIR:
with open(settings.DOWNLOADER_SAVEDIR + '/' + get_mock_file(url), 'w', encoding='utf-8') as fp:
fp.write(resp.text)
else:
resp = MockResponse(self.url)
response_type = self.validate_response(resp)
self.logs.append({'response_type': response_type, 'url': url, 'exception': None})
return resp, response_type
except RequestException as e:
self.logs.append({'response_type': RESPONSE_NETWORK_ERROR, 'url': url, 'exception': e})
return None, RESPONSE_NETWORK_ERROR
def download(self):
resp, self.response_type = self._download(self.url)
if self.response_type == RESPONSE_OK:
return resp
else:
raise DownloadError(self)
class ProxiedDownloader(BasicDownloader):
def get_proxied_urls(self):
urls = []
if settings.PROXYCRAWL_KEY is not None:
urls.append(f'https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={self.url}')
if settings.SCRAPESTACK_KEY is not None:
# urls.append(f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={self.url}')
urls.append(f'http://api.scrapestack.com/scrape?keep_headers=1&access_key={settings.SCRAPESTACK_KEY}&url={self.url}')
if settings.SCRAPERAPI_KEY is not None:
urls.append(f'http://api.scraperapi.com/?api_key={settings.SCRAPERAPI_KEY}&url={self.url}')
return urls
def get_special_proxied_url(self):
return f'{settings.LOCAL_PROXY}?url={self.url}' if settings.LOCAL_PROXY is not None else None
def download(self):
urls = self.get_proxied_urls()
last_try = False
url = urls.pop(0) if len(urls) else None
resp = None
while url:
resp, resp_type = self._download(url)
if resp_type == RESPONSE_OK or resp_type == RESPONSE_INVALID_CONTENT or last_try:
url = None
elif resp_type == RESPONSE_CENSORSHIP:
url = self.get_special_proxied_url()
last_try = True
else: # resp_type == RESPONSE_NETWORK_ERROR:
url = urls.pop(0) if len(urls) else None
self.response_type = resp_type
if self.response_type == RESPONSE_OK:
return resp
else:
raise DownloadError(self)
class RetryDownloader(BasicDownloader):
def download(self):
retries = settings.DOWNLOADER_RETRIES
while retries:
retries -= 1
resp, self.response_type = self._download(self.url)
if self.response_type == RESPONSE_OK:
return resp
elif self.response_type != RESPONSE_NETWORK_ERROR and retries == 0:
raise DownloadError(self)
elif retries > 0:
_logger.debug('Retry ' + self.url)
time.sleep((settings.DOWNLOADER_RETRIES - retries) * 0.5)
raise DownloadError(self, 'max out of retries')
class ImageDownloaderMixin:
def __init__(self, url, referer=None):
if referer is not None:
self.headers['Referer'] = referer
super().__init__(url)
def validate_response(self, response):
if response and response.status_code == 200:
try:
raw_img = response.content
img = Image.open(BytesIO(raw_img))
img.load() # corrupted image will trigger exception
content_type = response.headers.get('Content-Type')
self.extention = filetype.get_type(mime=content_type.partition(';')[0].strip()).extension
return RESPONSE_OK
except Exception:
return RESPONSE_NETWORK_ERROR
if response and response.status_code >= 400 and response.status_code < 500:
return RESPONSE_INVALID_CONTENT
else:
return RESPONSE_NETWORK_ERROR
class BasicImageDownloader(ImageDownloaderMixin, BasicDownloader):
@classmethod
def download_image(cls, image_url, page_url):
imgdl = cls(image_url, page_url)
try:
image = imgdl.download().content
image_extention = imgdl.extention
return image, image_extention
except Exception:
return None, None
class ProxiedImageDownloader(ImageDownloaderMixin, ProxiedDownloader):
pass
_local_response_path = str(Path(__file__).parent.parent.parent.absolute()) + '/test_data/'
class MockResponse:
def __init__(self, url):
self.url = url
fn = _local_response_path + get_mock_file(url)
try:
self.content = Path(fn).read_bytes()
self.status_code = 200
_logger.debug(f"use local response for {url} from {fn}")
except Exception:
self.content = b'Error: response file not found'
self.status_code = 404
_logger.debug(f"local response not found for {url} at {fn}")
@property
def text(self):
return self.content.decode('utf-8')
def json(self):
return json.load(StringIO(self.text))
def html(self):
return html.fromstring(self.text) # may throw exception unexpectedly due to OS bug, see https://github.com/neodb-social/neodb/issues/5
@property
def headers(self):
return {'Content-Type': 'image/jpeg' if self.url.endswith('jpg') else 'text/html'}
requests.Response.html = MockResponse.html