handle item remote url change

This commit is contained in:
Your Name 2025-02-22 17:11:35 -05:00 committed by Henri Dickson
parent 3ed93668e1
commit 5b4c9d3048
3 changed files with 44 additions and 22 deletions

View file

@ -66,7 +66,7 @@ class MockResponse:
try:
self.content = Path(fn).read_bytes()
self.status_code = 200
logger.debug(f"use local response for {url} from {fn}")
# logger.debug(f"use local response for {url} from {fn}")
except Exception:
self.content = b"Error: response file not found"
self.status_code = 404

View file

@ -10,11 +10,13 @@ ResourceContent persists as an ExternalResource which may link to an Item
import json
import re
from dataclasses import dataclass, field
from hashlib import md5
from typing import TYPE_CHECKING, Type, TypeVar
import django_rq
import requests
from django.conf import settings
from django.core.cache import cache
from loguru import logger
from validators import url as url_validate
@ -302,6 +304,22 @@ class SiteManager:
else:
raise ValueError(f"Site for {typ} not found")
@staticmethod
def get_redirected_url(url: str) -> str:
    """Return the final URL reached after following redirects from ``url``.

    The outcome of the ``HEAD`` probe is cached for an hour under a key built
    from the MD5 digest of ``url``. An empty string is stored when the probe
    ends on the same URL, which lets that outcome be told apart from a cache
    miss.

    Args:
        url: Absolute URL to resolve.

    Returns:
        The redirect target, or ``url`` itself when it does not redirect or
        the probe fails.
    """
    key = "_redir_" + md5(url.encode()).hexdigest()
    cached = cache.get(key, default=None)
    if cached == "":
        # Sentinel written by an earlier call: nothing to rewrite.
        return url
    if cached:
        return cached
    try:
        resolved = requests.head(url, allow_redirects=True, timeout=1).url
    except requests.RequestException as e:
        # Connection errors, redirect loops and invalid URLs end up here too,
        # not only timeouts, so report the actual failure.
        logger.warning(f"HEAD request failed for {url}: {e}")
        resolved = url
    # A failed probe is stored exactly like "no redirect", so the same URL is
    # not probed again until the cache entry expires.
    cache.set(key, "" if resolved == url else resolved, 3600)
    return resolved
@staticmethod
def get_site_by_url(url: str) -> AbstractSite | None:
if not url or not url_validate(
@ -312,25 +330,29 @@ class SiteManager:
strict_query=False,
):
return None
u = SiteManager.get_redirected_url(url)
cls = next(
filter(lambda p: p.validate_url(url), SiteManager.registry.values()), None
filter(lambda p: p.validate_url(u), SiteManager.registry.values()), None
)
if cls is None and re.match(r"^https?://(spotify.link|t.co)/.+", url):
try:
url2 = requests.head(url, allow_redirects=True, timeout=1).url
if url2 != url:
cls = next(
filter(
lambda p: p.validate_url(url2),
SiteManager.registry.values(),
),
None,
)
if cls:
url = url2
except Exception:
pass
if cls is None and u != url:
cls = next(
filter(
lambda p: p.validate_url(url),
SiteManager.registry.values(),
),
None,
)
if cls:
u = url
if cls is None:
cls = next(
filter(
lambda p: p.validate_url_fallback(u),
SiteManager.registry.values(),
),
None,
)
if cls is None and u != url:
cls = next(
filter(
lambda p: p.validate_url_fallback(url),
@ -338,7 +360,9 @@ class SiteManager:
),
None,
)
return cls(url) if cls else None
if cls:
u = url
return cls(u) if cls else None
@staticmethod
def get_site_by_id(id_type: IdType | str, id_value: str) -> AbstractSite | None:

View file

@ -9,9 +9,9 @@ Scraping the website directly.
"""
import json
import logging
import dateparser
from loguru import logger
from catalog.common import *
from catalog.models import *
@ -24,8 +24,6 @@ from common.models.misc import uniq
from .douban import *
_logger = logging.getLogger(__name__)
@SiteManager.register
class AppleMusic(AbstractSite):
@ -82,7 +80,7 @@ class AppleMusic(AbstractSite):
content = (
BasicDownloader(url, headers=self.headers).download().html()
)
_logger.info(f"got localized content from {url}")
logger.info(f"got localized content from {url}")
elem = content.xpath(
"//script[@id='serialized-server-data']/text()"
)