import requests
import filetype
from PIL import Image
from io import BytesIO
from requests.exceptions import RequestException
from django.conf import settings
from pathlib import Path
import json
from io import StringIO
import re
import time
import logging
from lxml import html


_logger = logging.getLogger(__name__)


RESPONSE_OK = 0  # response is ready for parsing
RESPONSE_INVALID_CONTENT = -1  # content not valid, but no need to retry
RESPONSE_NETWORK_ERROR = -2  # network error, retry next proxied url
RESPONSE_CENSORSHIP = -3  # censored, try something special if possible

_mock_mode = False


def use_local_response(func):
    # decorator: run the wrapped callable with mock mode enabled, so downloads
    # are served from local files in test_data/ instead of the network
    def _func(*args, **kwargs):
        set_mock_mode(True)
        try:
            return func(*args, **kwargs)
        finally:
            set_mock_mode(False)
    return _func


def set_mock_mode(enabled):
    global _mock_mode
    _mock_mode = enabled


def get_mock_mode():
    return _mock_mode
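

# Usage sketch (an assumption about intended use, not part of the original code;
# the URL is made up): the decorator wraps test methods so they read canned
# responses from test_data/ instead of hitting the network.
#
#     @use_local_response
#     def test_parse(self):
#         resp = BasicDownloader("https://example.org/item/1").download()
#         ...  # resp.text now comes from a local MockResponse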


# map a url to a deterministic local filename used for saved/mocked responses
def get_mock_file(url):
    fn = url.replace('***REMOVED***', '1234')  # Thank you, Github Action -_-!
    fn = re.sub(r'[^\w]', '_', fn)
    fn = re.sub(r'_key_[*A-Za-z0-9]+', '_key_8964', fn)
    return fn


class DownloadError(Exception):
    def __init__(self, downloader, msg=None):
        self.url = downloader.url
        self.logs = downloader.logs
        if downloader.response_type == RESPONSE_INVALID_CONTENT:
            error = "Invalid Response"
        elif downloader.response_type == RESPONSE_NETWORK_ERROR:
            error = "Network Error"
        elif downloader.response_type == RESPONSE_CENSORSHIP:
            error = "Censored Content"
        else:
            error = "Unknown Error"
        self.message = f"Download Failed: {error}{', ' + msg if msg else ''}, url: {self.url}"
        super().__init__(self.message)
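

# Handling sketch (illustrative, not from the original code): DownloadError keeps
# the per-attempt log entries, so a caller can see which url failed and why.
#
#     try:
#         resp = RetryDownloader(url).download()
#     except DownloadError as e:
#         for entry in e.logs:
#             _logger.warning(f"{entry['url']}: {entry['response_type']} {entry['exception']}")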


class BasicDownloader:
    headers = {
        # 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0',
        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'no-cache',
    }

    def __init__(self, url, headers=None):
        self.url = url
        self.response_type = RESPONSE_OK
        self.logs = []
        if headers:
            self.headers = headers

    def get_timeout(self):
        return settings.SCRAPING_TIMEOUT

    def validate_response(self, response):
        if response is None:
            return RESPONSE_NETWORK_ERROR
        elif response.status_code == 200:
            return RESPONSE_OK
        else:
            return RESPONSE_INVALID_CONTENT

    def _download(self, url):
        try:
            if not _mock_mode:
                # TODO cache = get/set from redis
                resp = requests.get(url, headers=self.headers, timeout=self.get_timeout())
                if settings.DOWNLOADER_SAVEDIR:
                    with open(settings.DOWNLOADER_SAVEDIR + '/' + get_mock_file(url), 'w', encoding='utf-8') as fp:
                        fp.write(resp.text)
            else:
                resp = MockResponse(self.url)
            response_type = self.validate_response(resp)
            self.logs.append({'response_type': response_type, 'url': url, 'exception': None})
            return resp, response_type
        except RequestException as e:
            self.logs.append({'response_type': RESPONSE_NETWORK_ERROR, 'url': url, 'exception': e})
            return None, RESPONSE_NETWORK_ERROR

    def download(self):
        resp, self.response_type = self._download(self.url)
        if self.response_type == RESPONSE_OK:
            return resp
        else:
            raise DownloadError(self)
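

# Usage sketch (illustrative; the URL and XPath are made up): a plain
# single-attempt fetch; .html() is the helper attached to requests.Response at
# the bottom of this module.
#
#     content = BasicDownloader("https://example.org/item/1").download().html()
#     title = content.xpath("//title/text()")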


class ProxiedDownloader(BasicDownloader):
    def get_proxied_urls(self):
        urls = []
        if settings.PROXYCRAWL_KEY is not None:
            urls.append(f'https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={self.url}')
        if settings.SCRAPESTACK_KEY is not None:
            # urls.append(f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={self.url}')
            urls.append(f'http://api.scrapestack.com/scrape?keep_headers=1&access_key={settings.SCRAPESTACK_KEY}&url={self.url}')
        if settings.SCRAPERAPI_KEY is not None:
            urls.append(f'http://api.scraperapi.com/?api_key={settings.SCRAPERAPI_KEY}&url={self.url}')
        return urls

    def get_special_proxied_url(self):
        return f'{settings.LOCAL_PROXY}?url={self.url}' if settings.LOCAL_PROXY is not None else None

    def download(self):
        urls = self.get_proxied_urls()
        last_try = False
        url = urls.pop(0) if len(urls) else None
        resp = None
        resp_type = RESPONSE_NETWORK_ERROR  # default in case no proxied url is configured
        while url:
            resp, resp_type = self._download(url)
            if resp_type == RESPONSE_OK or resp_type == RESPONSE_INVALID_CONTENT or last_try:
                url = None
            elif resp_type == RESPONSE_CENSORSHIP:
                url = self.get_special_proxied_url()
                last_try = True
            else:  # resp_type == RESPONSE_NETWORK_ERROR
                url = urls.pop(0) if len(urls) else None
        self.response_type = resp_type
        if self.response_type == RESPONSE_OK:
            return resp
        else:
            raise DownloadError(self)
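

# Behaviour note (summarising the loop above; the URL is made up): proxied
# endpoints are tried in the order returned by get_proxied_urls(); a network
# error moves on to the next one, while a censored response falls back, once,
# to LOCAL_PROXY.
#
#     resp = ProxiedDownloader("https://example.org/item/1").download()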


class RetryDownloader(BasicDownloader):
    def download(self):
        retries = settings.DOWNLOADER_RETRIES
        while retries:
            retries -= 1
            resp, self.response_type = self._download(self.url)
            if self.response_type == RESPONSE_OK:
                return resp
            elif self.response_type != RESPONSE_NETWORK_ERROR and retries == 0:
                raise DownloadError(self)
            elif retries > 0:
                _logger.debug('Retry ' + self.url)
                # linear back-off: 0.5s after the first failure, 1s after the second, ...
                time.sleep((settings.DOWNLOADER_RETRIES - retries) * 0.5)
        raise DownloadError(self, 'max retries exceeded')
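

# Settings sketch (the names are the ones referenced in this module; the values
# are only placeholders, not project defaults):
#
#     SCRAPING_TIMEOUT = 10        # per-request timeout in seconds
#     DOWNLOADER_RETRIES = 3       # attempts made by RetryDownloader
#     DOWNLOADER_SAVEDIR = None    # set to a directory to save fetched pages
#     PROXYCRAWL_KEY = None
#     SCRAPESTACK_KEY = None
#     SCRAPERAPI_KEY = None
#     LOCAL_PROXY = None           # fallback endpoint for censored content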


class ImageDownloaderMixin:
    def __init__(self, url, referer=None):
        if referer is not None:
            # copy the class-level headers before adding a Referer, so one
            # instance does not leak its referer into every other downloader
            self.headers = dict(self.headers)
            self.headers['Referer'] = referer
        super().__init__(url)

    def validate_response(self, response):
        if response and response.status_code == 200:
            try:
                raw_img = response.content
                img = Image.open(BytesIO(raw_img))
                img.load()  # a corrupted image will trigger an exception here
                content_type = response.headers.get('Content-Type')
                self.extention = filetype.get_type(mime=content_type.partition(';')[0].strip()).extension
                return RESPONSE_OK
            except Exception:
                return RESPONSE_NETWORK_ERROR
        elif response and 400 <= response.status_code < 500:
            return RESPONSE_INVALID_CONTENT
        else:
            return RESPONSE_NETWORK_ERROR


class BasicImageDownloader(ImageDownloaderMixin, BasicDownloader):
    @classmethod
    def download_image(cls, image_url, page_url):
        imgdl = cls(image_url, page_url)
        try:
            image = imgdl.download().content
            image_extention = imgdl.extention
            return image, image_extention
        except Exception:
            return None, None


class ProxiedImageDownloader(ImageDownloaderMixin, ProxiedDownloader):
    pass
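

# Usage sketch (illustrative; the output file name is made up): download_image()
# returns the raw bytes and a file extension, or (None, None) on any failure.
#
#     raw, ext = BasicImageDownloader.download_image(image_url, page_url)
#     if raw is not None:
#         Path(f"cover.{ext}").write_bytes(raw)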


_local_response_path = str(Path(__file__).parent.parent.parent.absolute()) + '/test_data/'


class MockResponse:
    def __init__(self, url):
        self.url = url
        fn = _local_response_path + get_mock_file(url)
        try:
            self.content = Path(fn).read_bytes()
            self.status_code = 200
            _logger.debug(f"use local response for {url} from {fn}")
        except Exception:
            self.content = b'Error: response file not found'
            self.status_code = 404
            _logger.debug(f"local response not found for {url} at {fn}")

    @property
    def text(self):
        return self.content.decode('utf-8')

    def json(self):
        return json.load(StringIO(self.text))

    def html(self):
        return html.fromstring(self.text)  # may throw exception unexpectedly due to OS bug, see https://github.com/neodb-social/neodb/issues/5

    @property
    def headers(self):
        return {'Content-Type': 'image/jpeg' if self.url.endswith('jpg') else 'text/html'}


# also attach the html() helper to real responses, so callers can parse either
# a live requests.Response or a MockResponse the same way
requests.Response.html = MockResponse.html
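
# Fixture note (describing the mock path above; the URL is made up): with mock
# mode on, a download of https://example.org/item/1 is served from
# test_data/https___example_org_item_1; if that file is missing, MockResponse
# reports a 404.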