2022-12-08 16:59:03 +00:00
|
|
|
import functools
import json
import logging
import re
import time
from io import BytesIO, StringIO
from pathlib import Path
from typing import Tuple
from urllib.parse import quote

import filetype
import requests
from django.conf import settings
from django.core.cache import cache
from lxml import html
from PIL import Image
from requests import Response
from requests.exceptions import RequestException
|
|
|
|
|
2022-12-08 16:59:03 +00:00
|
|
|
# Module-level logger, named after this module per logging convention.
_logger = logging.getLogger(__name__)
|
2022-12-07 19:09:05 -05:00
|
|
|
|
|
|
|
|
|
|
|
# Outcome codes returned by validate_response() and recorded in downloader logs.
RESPONSE_OK = 0  # response is ready for parsing
RESPONSE_INVALID_CONTENT = -1  # content not valid but no need to retry
RESPONSE_NETWORK_ERROR = -2  # network error, retry next proxied url
RESPONSE_CENSORSHIP = -3  # censored, try sth special if possible

# When True, downloaders serve recorded MockResponse files instead of real HTTP.
_mock_mode = False
|
2022-12-07 19:09:05 -05:00
|
|
|
|
|
|
|
|
|
|
|
def use_local_response(func):
    """Decorator (for tests): run *func* with mock mode enabled.

    While mock mode is on, downloaders read recorded local files instead of
    hitting the network. Mock mode is always restored afterwards, even when
    *func* raises (the original implementation leaked the enabled state on
    exception and accepted exactly one positional argument).
    """

    @functools.wraps(func)  # keep the wrapped function's name for test discovery
    def _func(*args, **kwargs):
        set_mock_mode(True)
        try:
            func(*args, **kwargs)
        finally:
            set_mock_mode(False)

    return _func
|
|
|
|
|
|
|
|
|
2022-12-08 16:59:03 +00:00
|
|
|
def set_mock_mode(enabled):
    """Turn the module-wide mock flag on or off.

    When enabled, downloaders serve recorded local files instead of
    performing real HTTP requests.
    """
    global _mock_mode
    _mock_mode = enabled
|
2022-12-07 19:09:05 -05:00
|
|
|
|
|
|
|
|
2022-12-08 16:59:03 +00:00
|
|
|
def get_mock_mode():
    """Return True when downloaders should serve recorded local responses."""
    # `global` is only required for assignment; a plain read already
    # resolves to the module-level variable, so the declaration was noise.
    return _mock_mode
|
2022-12-08 15:45:37 +00:00
|
|
|
|
|
|
|
|
2022-12-08 23:58:44 +00:00
|
|
|
def get_mock_file(url):
    """Map *url* to a filesystem-safe filename for a recorded response."""
    # Redact a token that CI masks in logs before it reaches the filename.
    name = url.replace("***REMOVED***", "1234")  # Thank you, Github Action -_-!
    # Collapse every non-word character into an underscore.
    name = re.sub(r"[^\w]", "_", name)
    # Normalize embedded API keys so recordings are key-independent.
    name = re.sub(r"_key_[*A-Za-z0-9]+", "_key_8964", name)
    # Stay within common filesystem filename-length limits.
    return name if len(name) <= 255 else name[:255]
|
2022-12-08 23:58:44 +00:00
|
|
|
|
|
|
|
|
2023-08-11 01:43:19 -04:00
|
|
|
# Directory holding recorded test responses: <repo root>/test_data/
# (three .parent hops up from this file — TODO confirm against repo layout).
_local_response_path = (
    str(Path(__file__).parent.parent.parent.absolute()) + "/test_data/"
)
|
|
|
|
|
|
|
|
|
|
|
|
class MockResponse:
    """Stand-in for requests.Response, backed by a recorded file on disk.

    Looks up the file derived from the url under _local_response_path;
    a missing recording yields a 404 with a placeholder body.
    """

    def __init__(self, url):
        self.url = url
        fn = _local_response_path + get_mock_file(url)
        try:
            self.content = Path(fn).read_bytes()
            self.status_code = 200
            _logger.debug(f"use local response for {url} from {fn}")
        except Exception:
            self.content = b"Error: response file not found"
            self.status_code = 404
            _logger.debug(f"local response not found for {url} at {fn}")

    @property
    def text(self):
        """Body decoded as UTF-8."""
        return self.content.decode("utf-8")

    def json(self):
        """Body parsed as JSON."""
        return json.loads(self.text)

    def html(self):
        """Body parsed into an lxml element tree."""
        # may throw exception unexpectedly due to OS bug, see https://github.com/neodb-social/neodb/issues/5
        decoded = self.content.decode("utf-8")
        return html.fromstring(decoded)

    @property
    def headers(self):
        """Minimal header dict inferred from the url's extension."""
        content_type = "image/jpeg" if self.url.endswith("jpg") else "text/html"
        return {"Content-Type": content_type}
|
|
|
|
|
|
|
|
|
|
|
|
# Monkey-patch: give every requests.Response an html() helper so real and
# mock responses can be parsed through the same call.
requests.Response.html = MockResponse.html  # type:ignore
|
|
|
|
|
|
|
|
|
|
|
|
class DownloaderResponse(Response):
    """requests.Response subclass exposing an lxml-parsing html() helper."""

    def html(self):
        """Parse the UTF-8 body into an lxml element tree."""
        # may throw exception unexpectedly due to OS bug, see https://github.com/neodb-social/neodb/issues/5
        decoded = self.content.decode("utf-8")
        return html.fromstring(decoded)
|
|
|
|
|
|
|
|
|
2022-12-07 19:09:05 -05:00
|
|
|
class DownloadError(Exception):
    """Raised when a downloader gives up; carries the url and attempt logs."""

    def __init__(self, downloader, msg=None):
        self.url = downloader.url
        self.logs = downloader.logs
        if downloader.response_type == RESPONSE_INVALID_CONTENT:
            error = "Invalid Response"
        elif downloader.response_type == RESPONSE_NETWORK_ERROR:
            error = "Network Error"
        elif downloader.response_type == RESPONSE_CENSORSHIP:
            # BUGFIX: this branch previously re-tested RESPONSE_NETWORK_ERROR,
            # so "Censored Content" was unreachable.
            error = "Censored Content"
        else:
            error = "Unknown Error"
        self.message = (
            f"Download Failed: {error}{', ' + msg if msg else ''}, url: {self.url}"
        )
        super().__init__(self.message)
|
|
|
|
|
|
|
|
|
|
|
|
class BasicDownloader:
    """Single-attempt HTTP GET downloader.

    Subclasses layer extra behavior on top: ProxiedDownloader rotates
    proxied urls, RetryDownloader retries, CachedDownloader caches, and
    ImageDownloaderMixin adds image validation.
    """

    # Default request headers; pretends to be mobile Safari.
    headers = {
        # "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0",
        "User-Agent": "Mozilla/5.0 (iPad; CPU OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "DNT": "1",
        "Upgrade-Insecure-Requests": "1",
        "Cache-Control": "no-cache",
    }

    def __init__(self, url, headers=None):
        self.url = url
        self.response_type = RESPONSE_OK
        self.logs = []  # one dict per attempt: response_type / url / exception
        if headers:
            self.headers = headers

    def get_timeout(self):
        """Seconds before a request attempt is aborted."""
        return settings.DOWNLOADER_REQUEST_TIMEOUT

    def validate_response(self, response) -> int:
        """Classify a response; subclasses override for content-aware checks."""
        if response is None:
            return RESPONSE_NETWORK_ERROR
        elif response.status_code == 200:
            return RESPONSE_OK
        else:
            return RESPONSE_INVALID_CONTENT

    def _save_response_file(self, url, resp):
        """Best-effort: record the response body so mock mode can replay it."""
        try:
            with open(
                settings.DOWNLOADER_SAVEDIR + "/" + get_mock_file(url),
                "w",
                encoding="utf-8",
            ) as fp:
                fp.write(resp.text)
        except Exception:
            # BUGFIX: was a bare `except:` (also swallowed KeyboardInterrupt)
            # and used the deprecated Logger.warn alias.
            _logger.warning("Save downloaded data failed.")

    def _download(self, url) -> Tuple[DownloaderResponse | MockResponse | None, int]:
        """Fetch *url* once; return (response-or-None, response_type)."""
        try:
            if not _mock_mode:
                resp = requests.get(
                    url, headers=self.headers, timeout=self.get_timeout()
                )
                if settings.DOWNLOADER_SAVEDIR:
                    self._save_response_file(url, resp)
            else:
                resp = MockResponse(self.url)
            response_type = self.validate_response(resp)
            self.logs.append(
                {"response_type": response_type, "url": url, "exception": None}
            )
            return resp, response_type  # type: ignore
        except RequestException as e:
            self.logs.append(
                {"response_type": RESPONSE_NETWORK_ERROR, "url": url, "exception": e}
            )
            return None, RESPONSE_NETWORK_ERROR

    def download(self):
        """Fetch self.url; return the response or raise DownloadError."""
        resp, self.response_type = self._download(self.url)
        if self.response_type == RESPONSE_OK and resp:
            return resp
        else:
            raise DownloadError(self)
|
|
|
|
|
|
|
|
|
|
|
|
class ProxiedDownloader(BasicDownloader):
    """Downloader that walks a list of proxied urls on network errors.

    On censorship it switches once to a special backup proxy; invalid
    content stops the walk immediately.
    """

    def get_proxied_urls(self):
        """Expand self.url through every configured proxy template."""
        proxies = settings.DOWNLOADER_PROXY_LIST
        if not proxies:
            return [self.url]
        return [p.replace("__URL__", quote(self.url)) for p in proxies]

    def get_special_proxied_url(self):
        """Backup proxy url for censored content, or None if unconfigured."""
        backup = settings.DOWNLOADER_BACKUP_PROXY
        if not backup:
            return None
        return backup.replace("__URL__", quote(self.url))

    def download(self):
        """Try proxied urls in order; return the first usable response."""
        pending = self.get_proxied_urls()
        final_attempt = False
        resp = None
        resp_type = None
        current = pending.pop(0) if pending else None
        while current:
            resp, resp_type = self._download(current)
            if (
                resp_type == RESPONSE_OK
                or resp_type == RESPONSE_INVALID_CONTENT
                or final_attempt
            ):
                current = None  # done: success, hard failure, or last chance used
            elif resp_type == RESPONSE_CENSORSHIP:
                # one shot at the backup proxy, then give up
                current = self.get_special_proxied_url()
                final_attempt = True
            else:  # resp_type == RESPONSE_NETWORK_ERROR:
                current = pending.pop(0) if pending else None
        self.response_type = resp_type
        if self.response_type == RESPONSE_OK and resp:
            return resp
        else:
            raise DownloadError(self)
|
|
|
|
|
|
|
|
|
|
|
|
class RetryDownloader(BasicDownloader):
    """Downloader that retries with a linearly growing back-off delay."""

    def download(self):
        """Attempt self.url up to DOWNLOADER_RETRIES times.

        Returns the first OK response; raises DownloadError when a
        non-network failure exhausts the attempts or all retries run out.
        """
        total = settings.DOWNLOADER_RETRIES
        for attempt in range(total):
            remaining = total - attempt - 1
            resp, self.response_type = self._download(self.url)
            if self.response_type == RESPONSE_OK and resp:
                return resp
            if self.response_type != RESPONSE_NETWORK_ERROR and remaining == 0:
                raise DownloadError(self)
            if remaining > 0:
                _logger.debug("Retry " + self.url)
                # back-off: 0.5s after the first failure, 1.0s after the second, ...
                time.sleep((attempt + 1) * 0.5)
        raise DownloadError(self, "max out of retries")
|
2022-12-07 19:09:05 -05:00
|
|
|
|
|
|
|
|
2023-07-20 21:59:49 -04:00
|
|
|
class CachedDownloader(BasicDownloader):
    """Downloader backed by the Django cache, keyed on the url."""

    def download(self):
        """Return a cached response if present, otherwise fetch and cache it."""
        cache_key = "dl:" + self.url
        hit = cache.get(cache_key)
        if hit:
            self.response_type = RESPONSE_OK
            return hit
        fresh = super().download()
        if self.response_type == RESPONSE_OK:
            cache.set(cache_key, fresh, timeout=settings.DOWNLOADER_CACHE_TIMEOUT)
        return fresh
|
|
|
|
|
|
|
|
|
2022-12-07 19:09:05 -05:00
|
|
|
class ImageDownloaderMixin:
    """Mixin adding image validation on top of a downloader class.

    NOTE: the attribute name `extention` is a historical misspelling
    kept intact because callers read it.
    """

    def __init__(self, url, referer=None):
        self.extention = None
        if referer is not None:
            self.headers["Referer"] = referer  # type: ignore
        super().__init__(url)  # type: ignore

    def validate_response(self, response):
        """RESPONSE_OK only when the body decodes as an uncorrupted image."""
        if response is None or response.status_code != 200:
            # 4xx is a definitive failure; anything else is worth a retry
            if response and 400 <= response.status_code < 500:
                return RESPONSE_INVALID_CONTENT
            return RESPONSE_NETWORK_ERROR
        try:
            img = Image.open(BytesIO(response.content))
            img.load()  # a corrupted image raises here
            mime = response.headers.get("Content-Type").partition(";")[0].strip()
            kind = filetype.get_type(mime=mime)
            if kind is None:
                return RESPONSE_NETWORK_ERROR
            self.extention = kind.extension
            return RESPONSE_OK
        except Exception:
            # decode/validation problems are treated as retryable
            return RESPONSE_NETWORK_ERROR

    @classmethod
    def download_image(cls, image_url, page_url, headers=None):
        """Fetch image_url; return (bytes, extension) or (None, None) on failure."""
        dl: BasicDownloader = cls(image_url, page_url)  # type:ignore
        if headers is not None:
            dl.headers = headers
        try:
            data = dl.download().content
            ext = dl.extention  # type:ignore
            return data, ext
        except Exception:
            return None, None
|
2022-12-07 19:09:05 -05:00
|
|
|
|
|
|
|
|
2023-02-12 21:28:22 -05:00
|
|
|
class BasicImageDownloader(ImageDownloaderMixin, BasicDownloader):
    """Image downloader with validation; single attempt, no proxy."""

    pass
|
|
|
|
|
|
|
|
|
2022-12-07 19:09:05 -05:00
|
|
|
class ProxiedImageDownloader(ImageDownloaderMixin, ProxiedDownloader):
    """Image downloader with validation that rotates through proxied urls."""

    pass
|