lib.itmens/catalog/common/downloaders.py

317 lines
10 KiB
Python
Raw Normal View History

2022-12-08 16:59:03 +00:00
import json
import logging
import re
import time
from io import BytesIO, StringIO
from pathlib import Path
2023-08-11 01:43:19 -04:00
from typing import Tuple
2023-02-12 21:03:53 -05:00
from urllib.parse import quote
import filetype
import requests
from django.conf import settings
2023-07-20 21:59:49 -04:00
from django.core.cache import cache
2024-01-07 17:30:37 -05:00
from lxml import etree, html
from PIL import Image
2023-08-11 01:43:19 -04:00
from requests import Response
from requests.exceptions import RequestException
2022-12-08 16:59:03 +00:00
_logger = logging.getLogger(__name__)
RESPONSE_OK = 0 # response is ready for pasring
RESPONSE_INVALID_CONTENT = -1 # content not valid but no need to retry
RESPONSE_NETWORK_ERROR = -2 # network error, retry next proxied url
RESPONSE_CENSORSHIP = -3 # censored, try sth special if possible
2022-12-08 16:59:03 +00:00
_mock_mode = False
def use_local_response(func):
def _func(args):
2022-12-08 16:59:03 +00:00
set_mock_mode(True)
func(args)
2022-12-08 16:59:03 +00:00
set_mock_mode(False)
2022-12-29 23:57:02 -05:00
return _func
2022-12-08 16:59:03 +00:00
def set_mock_mode(enabled):
global _mock_mode
_mock_mode = enabled
2022-12-08 16:59:03 +00:00
def get_mock_mode():
global _mock_mode
return _mock_mode
2022-12-08 15:45:37 +00:00
2022-12-08 23:58:44 +00:00
def get_mock_file(url):
2022-12-29 23:57:02 -05:00
fn = url.replace("***REMOVED***", "1234") # Thank you, Github Action -_-!
fn = re.sub(r"[^\w]", "_", fn)
fn = re.sub(r"_key_[*A-Za-z0-9]+", "_key_8964", fn)
if len(fn) > 255:
fn = fn[:255]
2022-12-16 09:53:06 -05:00
return fn
2022-12-08 23:58:44 +00:00
2023-08-11 01:43:19 -04:00
_local_response_path = (
str(Path(__file__).parent.parent.parent.absolute()) + "/test_data/"
)
class MockResponse:
def __init__(self, url):
self.url = url
fn = _local_response_path + get_mock_file(url)
try:
self.content = Path(fn).read_bytes()
self.status_code = 200
_logger.debug(f"use local response for {url} from {fn}")
except Exception:
self.content = b"Error: response file not found"
self.status_code = 404
_logger.debug(f"local response not found for {url} at {fn}")
@property
def text(self):
return self.content.decode("utf-8")
def json(self):
return json.load(StringIO(self.text))
def html(self):
return html.fromstring( # may throw exception unexpectedly due to OS bug, see https://github.com/neodb-social/neodb/issues/5
self.content.decode("utf-8")
)
2024-01-07 17:30:37 -05:00
def xml(self):
return etree.fromstring(self.content, base_url=self.url)
2023-08-11 01:43:19 -04:00
@property
def headers(self):
return {
"Content-Type": "image/jpeg" if self.url.endswith("jpg") else "text/html"
}
requests.Response.html = MockResponse.html # type:ignore
2024-01-07 17:30:37 -05:00
requests.Response.xml = MockResponse.xml # type:ignore
2023-08-11 01:43:19 -04:00
class DownloaderResponse(Response):
def html(self):
return html.fromstring( # may throw exception unexpectedly due to OS bug, see https://github.com/neodb-social/neodb/issues/5
self.content.decode("utf-8")
)
2024-01-07 17:30:37 -05:00
def xml(self):
return etree.fromstring(self.content, base_url=self.url)
2023-08-11 01:43:19 -04:00
class DownloadError(Exception):
2022-12-08 16:59:03 +00:00
def __init__(self, downloader, msg=None):
self.url = downloader.url
self.logs = downloader.logs
if downloader.response_type == RESPONSE_INVALID_CONTENT:
error = "Invalid Response"
elif downloader.response_type == RESPONSE_NETWORK_ERROR:
error = "Network Error"
2024-01-06 00:12:43 -05:00
elif downloader.response_type == RESPONSE_CENSORSHIP:
error = "Censored Content"
else:
error = "Unknown Error"
2022-12-29 23:57:02 -05:00
self.message = (
f"Download Failed: {error}{', ' + msg if msg else ''}, url: {self.url}"
)
super().__init__(self.message)
class BasicDownloader:
headers = {
2023-01-29 20:05:30 -05:00
# "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0",
2022-12-29 23:57:02 -05:00
"User-Agent": "Mozilla/5.0 (iPad; CPU OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"DNT": "1",
"Upgrade-Insecure-Requests": "1",
"Cache-Control": "no-cache",
}
2022-12-08 15:45:37 +00:00
def __init__(self, url, headers=None):
self.url = url
self.response_type = RESPONSE_OK
self.logs = []
2022-12-08 15:45:37 +00:00
if headers:
self.headers = headers
def get_timeout(self):
2023-08-24 05:48:14 +00:00
return settings.DOWNLOADER_REQUEST_TIMEOUT
2023-12-31 08:32:19 -05:00
def validate_response(self, response) -> int:
if response is None:
return RESPONSE_NETWORK_ERROR
elif response.status_code == 200:
return RESPONSE_OK
else:
return RESPONSE_INVALID_CONTENT
2023-12-05 23:14:29 -05:00
def _download(self, url) -> Tuple[DownloaderResponse | MockResponse | None, int]:
try:
2022-12-08 16:59:03 +00:00
if not _mock_mode:
2022-12-29 23:57:02 -05:00
resp = requests.get(
url, headers=self.headers, timeout=self.get_timeout()
)
if settings.DOWNLOADER_SAVEDIR:
try:
with open(
settings.DOWNLOADER_SAVEDIR + "/" + get_mock_file(url),
"w",
encoding="utf-8",
) as fp:
fp.write(resp.text)
except:
_logger.warn("Save downloaded data failed.")
else:
resp = MockResponse(self.url)
response_type = self.validate_response(resp)
2022-12-29 23:57:02 -05:00
self.logs.append(
{"response_type": response_type, "url": url, "exception": None}
)
2023-08-11 01:43:19 -04:00
return resp, response_type # type: ignore
except RequestException as e:
2022-12-29 23:57:02 -05:00
self.logs.append(
{"response_type": RESPONSE_NETWORK_ERROR, "url": url, "exception": e}
)
2023-12-05 23:14:29 -05:00
return None, RESPONSE_NETWORK_ERROR
def download(self):
resp, self.response_type = self._download(self.url)
2023-01-29 20:05:30 -05:00
if self.response_type == RESPONSE_OK and resp:
return resp
else:
raise DownloadError(self)
class ProxiedDownloader(BasicDownloader):
def get_proxied_urls(self):
2023-08-24 05:48:14 +00:00
if not settings.DOWNLOADER_PROXY_LIST:
return [self.url]
urls = []
2023-08-24 05:48:14 +00:00
for p in settings.DOWNLOADER_PROXY_LIST:
urls.append(p.replace("__URL__", quote(self.url)))
return urls
def get_special_proxied_url(self):
2022-12-29 23:57:02 -05:00
return (
2023-08-24 05:48:14 +00:00
settings.DOWNLOADER_BACKUP_PROXY.replace("__URL__", quote(self.url))
if settings.DOWNLOADER_BACKUP_PROXY
2022-12-29 23:57:02 -05:00
else None
)
def download(self):
urls = self.get_proxied_urls()
last_try = False
url = urls.pop(0) if len(urls) else None
resp = None
2023-05-22 14:52:02 -04:00
resp_type = None
while url:
resp, resp_type = self._download(url)
2022-12-29 23:57:02 -05:00
if (
resp_type == RESPONSE_OK
or resp_type == RESPONSE_INVALID_CONTENT
or last_try
):
url = None
elif resp_type == RESPONSE_CENSORSHIP:
url = self.get_special_proxied_url()
last_try = True
else: # resp_type == RESPONSE_NETWORK_ERROR:
url = urls.pop(0) if len(urls) else None
self.response_type = resp_type
2023-06-05 02:04:52 -04:00
if self.response_type == RESPONSE_OK and resp:
return resp
else:
raise DownloadError(self)
class RetryDownloader(BasicDownloader):
def download(self):
retries = settings.DOWNLOADER_RETRIES
while retries:
retries -= 1
resp, self.response_type = self._download(self.url)
2023-12-05 23:14:29 -05:00
if self.response_type == RESPONSE_OK and resp:
return resp
elif self.response_type != RESPONSE_NETWORK_ERROR and retries == 0:
raise DownloadError(self)
elif retries > 0:
2022-12-29 23:57:02 -05:00
_logger.debug("Retry " + self.url)
time.sleep((settings.DOWNLOADER_RETRIES - retries) * 0.5)
2022-12-29 23:57:02 -05:00
raise DownloadError(self, "max out of retries")
2023-07-20 21:59:49 -04:00
class CachedDownloader(BasicDownloader):
def download(self):
cache_key = "dl:" + self.url
resp = cache.get(cache_key)
if resp:
self.response_type = RESPONSE_OK
else:
resp = super().download()
if self.response_type == RESPONSE_OK:
cache.set(cache_key, resp, timeout=settings.DOWNLOADER_CACHE_TIMEOUT)
return resp
class ImageDownloaderMixin:
def __init__(self, url, referer=None):
2023-08-11 01:43:19 -04:00
self.extention = None
if referer is not None:
2023-08-11 01:43:19 -04:00
self.headers["Referer"] = referer # type: ignore
super().__init__(url) # type: ignore
def validate_response(self, response):
if response and response.status_code == 200:
try:
raw_img = response.content
img = Image.open(BytesIO(raw_img))
img.load() # corrupted image will trigger exception
2022-12-29 23:57:02 -05:00
content_type = response.headers.get("Content-Type")
2023-05-22 14:52:02 -04:00
file_type = filetype.get_type(
2022-12-29 23:57:02 -05:00
mime=content_type.partition(";")[0].strip()
2023-05-22 14:52:02 -04:00
)
if file_type is None:
return RESPONSE_NETWORK_ERROR
self.extention = file_type.extension
return RESPONSE_OK
except Exception:
return RESPONSE_NETWORK_ERROR
if response and response.status_code >= 400 and response.status_code < 500:
return RESPONSE_INVALID_CONTENT
else:
return RESPONSE_NETWORK_ERROR
2022-12-08 23:58:44 +00:00
@classmethod
2023-02-12 21:54:21 -05:00
def download_image(cls, image_url, page_url, headers=None):
2023-08-11 01:43:19 -04:00
imgdl: BasicDownloader = cls(image_url, page_url) # type:ignore
2023-02-12 21:54:21 -05:00
if headers is not None:
imgdl.headers = headers
2022-12-08 23:58:44 +00:00
try:
image = imgdl.download().content
2023-08-11 01:43:19 -04:00
image_extention = imgdl.extention # type:ignore
2022-12-08 23:58:44 +00:00
return image, image_extention
except Exception:
return None, None
class BasicImageDownloader(ImageDownloaderMixin, BasicDownloader):
pass
class ProxiedImageDownloader(ImageDownloaderMixin, ProxiedDownloader):
pass