From 0aec486a0547f07eadd38f6ebcfa54dd61f46883 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 26 Jul 2024 17:32:08 -0400 Subject: [PATCH] use httpx for bandcamp --- catalog/common/__init__.py | 1 + catalog/common/downloaders.py | 61 ++++++++++++++++++++++++++++++++++- catalog/sites/bandcamp.py | 2 +- 3 files changed, 62 insertions(+), 2 deletions(-) diff --git a/catalog/common/__init__.py b/catalog/common/__init__.py index b39a38ed..8f5255e5 100644 --- a/catalog/common/__init__.py +++ b/catalog/common/__init__.py @@ -24,6 +24,7 @@ __all__ = ( # noqa "use_local_response", "RetryDownloader", "BasicDownloader", + "BasicDownloader2", "CachedDownloader", "ProxiedDownloader", "BasicImageDownloader", diff --git a/catalog/common/downloaders.py b/catalog/common/downloaders.py index e1c55f64..341ca00e 100644 --- a/catalog/common/downloaders.py +++ b/catalog/common/downloaders.py @@ -8,6 +8,7 @@ from typing import Tuple, cast from urllib.parse import quote import filetype +import httpx import requests from django.conf import settings from django.core.cache import cache @@ -103,6 +104,14 @@ class DownloaderResponse(Response): return etree.fromstring(self.content, base_url=self.url) +class DownloaderResponse2(httpx.Response): + def html(self): + return html.fromstring(self.content.decode("utf-8")) + + def xml(self): + return etree.fromstring(self.content, base_url=str(self.url)) + + class DownloadError(Exception): def __init__(self, downloader, msg=None): self.url = downloader.url @@ -163,7 +172,9 @@ class BasicDownloader: else: return RESPONSE_INVALID_CONTENT - def _download(self, url) -> Tuple[DownloaderResponse | MockResponse | None, int]: + def _download( + self, url + ) -> Tuple[DownloaderResponse | DownloaderResponse2 | MockResponse | None, int]: try: if not _mock_mode: resp = cast( @@ -187,6 +198,54 @@ class BasicDownloader: self.logs.append( {"response_type": response_type, "url": url, "exception": None} ) + return resp, response_type + except RequestException as e: + self.logs.append( + {"response_type": RESPONSE_NETWORK_ERROR, "url": url, "exception": e} + ) + return None, RESPONSE_NETWORK_ERROR + + def download(self): + resp, self.response_type = self._download(self.url) + if self.response_type == RESPONSE_OK and resp: + return resp + else: + raise DownloadError(self) + + +class BasicDownloader2(BasicDownloader): + def validate_response(self, response) -> int: + if response is None: + return RESPONSE_NETWORK_ERROR + elif response.status_code == 200: + return RESPONSE_OK + else: + return RESPONSE_INVALID_CONTENT + + def _download(self, url): + try: + if not _mock_mode: + resp = cast( + DownloaderResponse2, + httpx.get(url, headers=self.headers, timeout=self.get_timeout()), + ) + resp.__class__ = DownloaderResponse2 + if settings.DOWNLOADER_SAVEDIR: + try: + with open( + settings.DOWNLOADER_SAVEDIR + "/" + get_mock_file(url), + "w", + encoding="utf-8", + ) as fp: + fp.write(resp.text) + except Exception: + logger.warning("Save downloaded data failed.") + else: + resp = MockResponse(self.url) + response_type = self.validate_response(resp) + self.logs.append( + {"response_type": response_type, "url": url, "exception": None} + ) return resp, response_type except RequestException as e: diff --git a/catalog/sites/bandcamp.py b/catalog/sites/bandcamp.py index f33a0c9d..98c6b15b 100644 --- a/catalog/sites/bandcamp.py +++ b/catalog/sites/bandcamp.py @@ -49,7 +49,7 @@ class Bandcamp(AbstractSite): return False def scrape(self): - content = BasicDownloader(self.url).download().html() + content = BasicDownloader2(self.url).download().html() try: title = self.query_str(content, "//h2[@class='trackTitle']/text()") artist = [