lib.itmens/catalog/common/downloaders.py

246 lines
8.3 KiB
Python
Raw Normal View History

import requests
import filetype
from PIL import Image
from io import BytesIO
from requests.exceptions import RequestException
from django.conf import settings
2022-12-08 16:59:03 +00:00
from pathlib import Path
import json
from io import StringIO
import re
import time
import logging
2022-12-08 16:59:03 +00:00
from lxml import html
2022-12-08 16:59:03 +00:00
_logger = logging.getLogger(__name__)
RESPONSE_OK = 0 # response is ready for pasring
RESPONSE_INVALID_CONTENT = -1 # content not valid but no need to retry
RESPONSE_NETWORK_ERROR = -2 # network error, retry next proxied url
RESPONSE_CENSORSHIP = -3 # censored, try sth special if possible
2022-12-08 16:59:03 +00:00
_mock_mode = False
def use_local_response(func):
def _func(args):
2022-12-08 16:59:03 +00:00
set_mock_mode(True)
func(args)
2022-12-08 16:59:03 +00:00
set_mock_mode(False)
return _func
2022-12-08 16:59:03 +00:00
def set_mock_mode(enabled):
global _mock_mode
_mock_mode = enabled
2022-12-08 16:59:03 +00:00
def get_mock_mode():
global _mock_mode
return _mock_mode
2022-12-08 15:45:37 +00:00
2022-12-08 23:58:44 +00:00
def get_mock_file(url):
fn = re.sub(r'[^\w]', '_', url)
return re.sub(r'_key_[A-Za-z0-9]+', '_key_19890604', fn)
class DownloadError(Exception):
2022-12-08 16:59:03 +00:00
def __init__(self, downloader, msg=None):
self.url = downloader.url
self.logs = downloader.logs
if downloader.response_type == RESPONSE_INVALID_CONTENT:
error = "Invalid Response"
elif downloader.response_type == RESPONSE_NETWORK_ERROR:
error = "Network Error"
elif downloader.response_type == RESPONSE_NETWORK_ERROR:
error = "Censored Content"
else:
error = "Unknown Error"
2022-12-08 16:59:03 +00:00
self.message = f"Download Failed: {error}{', ' + msg if msg else ''}, url: {self.url}"
super().__init__(self.message)
class BasicDownloader:
headers = {
# 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0',
'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
'DNT': '1',
'Upgrade-Insecure-Requests': '1',
'Cache-Control': 'no-cache',
}
2022-12-08 15:45:37 +00:00
def __init__(self, url, headers=None):
self.url = url
self.response_type = RESPONSE_OK
self.logs = []
2022-12-08 15:45:37 +00:00
if headers:
self.headers = headers
def get_timeout(self):
return settings.SCRAPING_TIMEOUT
def validate_response(self, response):
if response is None:
return RESPONSE_NETWORK_ERROR
elif response.status_code == 200:
return RESPONSE_OK
else:
return RESPONSE_INVALID_CONTENT
def _download(self, url):
try:
2022-12-08 16:59:03 +00:00
if not _mock_mode:
# TODO cache = get/set from redis
resp = requests.get(url, headers=self.headers, timeout=self.get_timeout())
if settings.DOWNLOADER_SAVEDIR:
2022-12-08 23:58:44 +00:00
with open(settings.DOWNLOADER_SAVEDIR + '/' + get_mock_file(url), 'w', encoding='utf-8') as fp:
fp.write(resp.text)
else:
resp = MockResponse(self.url)
response_type = self.validate_response(resp)
self.logs.append({'response_type': response_type, 'url': url, 'exception': None})
return resp, response_type
except RequestException as e:
self.logs.append({'response_type': RESPONSE_NETWORK_ERROR, 'url': url, 'exception': e})
return None, RESPONSE_NETWORK_ERROR
def download(self):
resp, self.response_type = self._download(self.url)
if self.response_type == RESPONSE_OK:
return resp
else:
raise DownloadError(self)
class ProxiedDownloader(BasicDownloader):
def get_proxied_urls(self):
urls = []
if settings.PROXYCRAWL_KEY is not None:
urls.append(f'https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={self.url}')
if settings.SCRAPESTACK_KEY is not None:
# urls.append(f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={self.url}')
urls.append(f'http://api.scrapestack.com/scrape?keep_headers=1&access_key={settings.SCRAPESTACK_KEY}&url={self.url}')
if settings.SCRAPERAPI_KEY is not None:
urls.append(f'http://api.scraperapi.com/?api_key={settings.SCRAPERAPI_KEY}&url={self.url}')
return urls
def get_special_proxied_url(self):
return f'{settings.LOCAL_PROXY}?url={self.url}' if settings.LOCAL_PROXY is not None else None
def download(self):
urls = self.get_proxied_urls()
last_try = False
url = urls.pop(0) if len(urls) else None
resp = None
while url:
resp, resp_type = self._download(url)
if resp_type == RESPONSE_OK or resp_type == RESPONSE_INVALID_CONTENT or last_try:
url = None
elif resp_type == RESPONSE_CENSORSHIP:
url = self.get_special_proxied_url()
last_try = True
else: # resp_type == RESPONSE_NETWORK_ERROR:
url = urls.pop(0) if len(urls) else None
self.response_type = resp_type
if self.response_type == RESPONSE_OK:
return resp
else:
raise DownloadError(self)
class RetryDownloader(BasicDownloader):
def download(self):
retries = settings.DOWNLOADER_RETRIES
while retries:
retries -= 1
resp, self.response_type = self._download(self.url)
if self.response_type == RESPONSE_OK:
return resp
elif self.response_type != RESPONSE_NETWORK_ERROR and retries == 0:
raise DownloadError(self)
elif retries > 0:
2022-12-08 16:59:03 +00:00
_logger.debug('Retry ' + self.url)
time.sleep((settings.DOWNLOADER_RETRIES - retries) * 0.5)
2022-12-08 16:59:03 +00:00
raise DownloadError(self, 'max out of retries')
class ImageDownloaderMixin:
def __init__(self, url, referer=None):
if referer is not None:
self.headers['Referer'] = referer
super().__init__(url)
def validate_response(self, response):
if response and response.status_code == 200:
try:
raw_img = response.content
img = Image.open(BytesIO(raw_img))
img.load() # corrupted image will trigger exception
content_type = response.headers.get('Content-Type')
self.extention = filetype.get_type(mime=content_type.partition(';')[0].strip()).extension
return RESPONSE_OK
except Exception:
return RESPONSE_NETWORK_ERROR
if response and response.status_code >= 400 and response.status_code < 500:
return RESPONSE_INVALID_CONTENT
else:
return RESPONSE_NETWORK_ERROR
class BasicImageDownloader(ImageDownloaderMixin, BasicDownloader):
2022-12-08 23:58:44 +00:00
@classmethod
def download_image(cls, image_url, page_url):
imgdl = cls(image_url, page_url)
try:
image = imgdl.download().content
image_extention = imgdl.extention
return image, image_extention
except Exception:
return None, None
class ProxiedImageDownloader(ImageDownloaderMixin, ProxiedDownloader):
pass
2022-12-08 16:59:03 +00:00
_local_response_path = str(Path(__file__).parent.parent.parent.absolute()) + '/test_data/'
class MockResponse:
def __init__(self, url):
self.url = url
2022-12-08 23:58:44 +00:00
fn = _local_response_path + get_mock_file(url)
2022-12-08 16:59:03 +00:00
try:
self.content = Path(fn).read_bytes()
self.status_code = 200
_logger.debug(f"use local response for {url} from {fn}")
except Exception:
self.content = b'Error: response file not found'
self.status_code = 404
_logger.debug(f"local response not found for {url} at {fn}")
@property
def text(self):
return self.content.decode('utf-8')
def json(self):
return json.load(StringIO(self.text))
def html(self):
return html.fromstring(self.text) # may throw exception unexpectedly due to OS bug
@property
def headers(self):
return {'Content-Type': 'image/jpeg' if self.url.endswith('jpg') else 'text/html'}
requests.Response.html = MockResponse.html