handle item remote url change
This commit is contained in:
parent
3ed93668e1
commit
5b4c9d3048
3 changed files with 44 additions and 22 deletions
|
@ -66,7 +66,7 @@ class MockResponse:
|
||||||
try:
|
try:
|
||||||
self.content = Path(fn).read_bytes()
|
self.content = Path(fn).read_bytes()
|
||||||
self.status_code = 200
|
self.status_code = 200
|
||||||
logger.debug(f"use local response for {url} from {fn}")
|
# logger.debug(f"use local response for {url} from {fn}")
|
||||||
except Exception:
|
except Exception:
|
||||||
self.content = b"Error: response file not found"
|
self.content = b"Error: response file not found"
|
||||||
self.status_code = 404
|
self.status_code = 404
|
||||||
|
|
|
@ -10,11 +10,13 @@ ResourceContent persists as an ExternalResource which may link to an Item
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
from hashlib import md5
|
||||||
from typing import TYPE_CHECKING, Type, TypeVar
|
from typing import TYPE_CHECKING, Type, TypeVar
|
||||||
|
|
||||||
import django_rq
|
import django_rq
|
||||||
import requests
|
import requests
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
from django.core.cache import cache
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from validators import url as url_validate
|
from validators import url as url_validate
|
||||||
|
|
||||||
|
@ -302,6 +304,22 @@ class SiteManager:
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Site for {typ} not found")
|
raise ValueError(f"Site for {typ} not found")
|
||||||
|
|
||||||
|
@staticmethod
def get_redirected_url(url: str) -> str:
    """Return the URL that ``url`` finally redirects to, or ``url`` itself.

    The answer is looked up in the Django cache first. On a cache miss a
    HEAD request that follows redirects is sent (1 second timeout) and the
    outcome is cached for 3600 seconds.

    Args:
        url: The URL to resolve.

    Returns:
        The final URL reported by the HEAD response, or ``url`` unchanged
        when it does not redirect or the request fails.
    """
    # The cache key is derived from a digest of the URL rather than the raw
    # URL text.
    cache_key = "_redir_" + md5(url.encode()).hexdigest()
    cached = cache.get(cache_key, default=None)
    if cached == "":
        # "" is the stored marker for "already checked, no redirect"; it
        # keeps that case distinguishable from a cache miss (None).
        return url
    if cached:
        return cached
    try:
        final_url = requests.head(url, allow_redirects=True, timeout=1).url
    except requests.RequestException as e:
        # RequestException is the base class for connection errors, redirect
        # loops and invalid URLs as well as timeouts, so report the actual
        # error instead of labelling every failure a timeout.
        logger.warning(f"HEAD request failed for {url}: {e}")
        final_url = url
    # NOTE(review): a failed request is stored as "no redirect" as well, so
    # the URL is not probed again until the entry expires; this looks
    # intentional (avoids repeated slow HEAD calls) -- confirm.
    cache.set(cache_key, final_url if final_url != url else "", 3600)
    return final_url
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_site_by_url(url: str) -> AbstractSite | None:
|
def get_site_by_url(url: str) -> AbstractSite | None:
|
||||||
if not url or not url_validate(
|
if not url or not url_validate(
|
||||||
|
@ -312,25 +330,29 @@ class SiteManager:
|
||||||
strict_query=False,
|
strict_query=False,
|
||||||
):
|
):
|
||||||
return None
|
return None
|
||||||
|
u = SiteManager.get_redirected_url(url)
|
||||||
cls = next(
|
cls = next(
|
||||||
filter(lambda p: p.validate_url(url), SiteManager.registry.values()), None
|
filter(lambda p: p.validate_url(u), SiteManager.registry.values()), None
|
||||||
)
|
)
|
||||||
if cls is None and re.match(r"^https?://(spotify.link|t.co)/.+", url):
|
if cls is None and u != url:
|
||||||
try:
|
cls = next(
|
||||||
url2 = requests.head(url, allow_redirects=True, timeout=1).url
|
filter(
|
||||||
if url2 != url:
|
lambda p: p.validate_url(url),
|
||||||
cls = next(
|
SiteManager.registry.values(),
|
||||||
filter(
|
),
|
||||||
lambda p: p.validate_url(url2),
|
None,
|
||||||
SiteManager.registry.values(),
|
)
|
||||||
),
|
if cls:
|
||||||
None,
|
u = url
|
||||||
)
|
|
||||||
if cls:
|
|
||||||
url = url2
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
if cls is None:
|
if cls is None:
|
||||||
|
cls = next(
|
||||||
|
filter(
|
||||||
|
lambda p: p.validate_url_fallback(u),
|
||||||
|
SiteManager.registry.values(),
|
||||||
|
),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
if cls is None and u != url:
|
||||||
cls = next(
|
cls = next(
|
||||||
filter(
|
filter(
|
||||||
lambda p: p.validate_url_fallback(url),
|
lambda p: p.validate_url_fallback(url),
|
||||||
|
@ -338,7 +360,9 @@ class SiteManager:
|
||||||
),
|
),
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
return cls(url) if cls else None
|
if cls:
|
||||||
|
u = url
|
||||||
|
return cls(u) if cls else None
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_site_by_id(id_type: IdType | str, id_value: str) -> AbstractSite | None:
|
def get_site_by_id(id_type: IdType | str, id_value: str) -> AbstractSite | None:
|
||||||
|
|
|
@ -9,9 +9,9 @@ Scraping the website directly.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import logging
|
|
||||||
|
|
||||||
import dateparser
|
import dateparser
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
from catalog.common import *
|
from catalog.common import *
|
||||||
from catalog.models import *
|
from catalog.models import *
|
||||||
|
@ -24,8 +24,6 @@ from common.models.misc import uniq
|
||||||
|
|
||||||
from .douban import *
|
from .douban import *
|
||||||
|
|
||||||
_logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
@SiteManager.register
|
@SiteManager.register
|
||||||
class AppleMusic(AbstractSite):
|
class AppleMusic(AbstractSite):
|
||||||
|
@ -82,7 +80,7 @@ class AppleMusic(AbstractSite):
|
||||||
content = (
|
content = (
|
||||||
BasicDownloader(url, headers=self.headers).download().html()
|
BasicDownloader(url, headers=self.headers).download().html()
|
||||||
)
|
)
|
||||||
_logger.info(f"got localized content from {url}")
|
logger.info(f"got localized content from {url}")
|
||||||
elem = content.xpath(
|
elem = content.xpath(
|
||||||
"//script[@id='serialized-server-data']/text()"
|
"//script[@id='serialized-server-data']/text()"
|
||||||
)
|
)
|
||||||
|
|
Loading…
Add table
Reference in a new issue