handle item remote url change

This commit is contained in:
Your Name 2025-02-22 17:11:35 -05:00 committed by Henri Dickson
parent 3ed93668e1
commit 5b4c9d3048
3 changed files with 44 additions and 22 deletions

View file

@ -66,7 +66,7 @@ class MockResponse:
try: try:
self.content = Path(fn).read_bytes() self.content = Path(fn).read_bytes()
self.status_code = 200 self.status_code = 200
logger.debug(f"use local response for {url} from {fn}") # logger.debug(f"use local response for {url} from {fn}")
except Exception: except Exception:
self.content = b"Error: response file not found" self.content = b"Error: response file not found"
self.status_code = 404 self.status_code = 404

View file

@ -10,11 +10,13 @@ ResourceContent persists as an ExternalResource which may link to an Item
import json import json
import re import re
from dataclasses import dataclass, field from dataclasses import dataclass, field
from hashlib import md5
from typing import TYPE_CHECKING, Type, TypeVar from typing import TYPE_CHECKING, Type, TypeVar
import django_rq import django_rq
import requests import requests
from django.conf import settings from django.conf import settings
from django.core.cache import cache
from loguru import logger from loguru import logger
from validators import url as url_validate from validators import url as url_validate
@ -302,6 +304,22 @@ class SiteManager:
else: else:
raise ValueError(f"Site for {typ} not found") raise ValueError(f"Site for {typ} not found")
@staticmethod
def get_redirected_url(url: str) -> str:
    """Return the URL that ``url`` resolves to after following redirects.

    The result of each lookup is cached for one hour under a key built from
    the MD5 hex digest of ``url``. An empty string is stored when the
    resolved URL equals ``url`` (including when the HEAD request failed),
    so a cached "no redirect" answer can be told apart from a cache miss.

    Args:
        url: The URL to probe.

    Returns:
        The cached or freshly resolved final URL; ``url`` itself when no
        redirect was observed or the HEAD request raised
        ``requests.RequestException``.
    """
    cache_key = "_redir_" + md5(url.encode()).hexdigest()
    cached = cache.get(cache_key, default=None)
    if cached == "":
        # Sentinel: a previous lookup found no redirect for this URL.
        return url
    if cached:
        return cached
    try:
        # Short timeout keeps the probe from stalling the caller for long.
        final_url = requests.head(url, allow_redirects=True, timeout=1).url
    except requests.RequestException as e:
        # RequestException covers far more than timeouts (connection and
        # TLS errors, redirect loops, ...), so report the actual cause.
        logger.warning(f"HEAD request failed: {url} {e}")
        final_url = url
    cache.set(cache_key, "" if final_url == url else final_url, 3600)
    return final_url
@staticmethod @staticmethod
def get_site_by_url(url: str) -> AbstractSite | None: def get_site_by_url(url: str) -> AbstractSite | None:
if not url or not url_validate( if not url or not url_validate(
@ -312,25 +330,29 @@ class SiteManager:
strict_query=False, strict_query=False,
): ):
return None return None
u = SiteManager.get_redirected_url(url)
cls = next( cls = next(
filter(lambda p: p.validate_url(url), SiteManager.registry.values()), None filter(lambda p: p.validate_url(u), SiteManager.registry.values()), None
) )
if cls is None and re.match(r"^https?://(spotify.link|t.co)/.+", url): if cls is None and u != url:
try: cls = next(
url2 = requests.head(url, allow_redirects=True, timeout=1).url filter(
if url2 != url: lambda p: p.validate_url(url),
cls = next( SiteManager.registry.values(),
filter( ),
lambda p: p.validate_url(url2), None,
SiteManager.registry.values(), )
), if cls:
None, u = url
)
if cls:
url = url2
except Exception:
pass
if cls is None: if cls is None:
cls = next(
filter(
lambda p: p.validate_url_fallback(u),
SiteManager.registry.values(),
),
None,
)
if cls is None and u != url:
cls = next( cls = next(
filter( filter(
lambda p: p.validate_url_fallback(url), lambda p: p.validate_url_fallback(url),
@ -338,7 +360,9 @@ class SiteManager:
), ),
None, None,
) )
return cls(url) if cls else None if cls:
u = url
return cls(u) if cls else None
@staticmethod @staticmethod
def get_site_by_id(id_type: IdType | str, id_value: str) -> AbstractSite | None: def get_site_by_id(id_type: IdType | str, id_value: str) -> AbstractSite | None:

View file

@ -9,9 +9,9 @@ Scraping the website directly.
""" """
import json import json
import logging
import dateparser import dateparser
from loguru import logger
from catalog.common import * from catalog.common import *
from catalog.models import * from catalog.models import *
@ -24,8 +24,6 @@ from common.models.misc import uniq
from .douban import * from .douban import *
_logger = logging.getLogger(__name__)
@SiteManager.register @SiteManager.register
class AppleMusic(AbstractSite): class AppleMusic(AbstractSite):
@ -82,7 +80,7 @@ class AppleMusic(AbstractSite):
content = ( content = (
BasicDownloader(url, headers=self.headers).download().html() BasicDownloader(url, headers=self.headers).download().html()
) )
_logger.info(f"got localized content from {url}") logger.info(f"got localized content from {url}")
elem = content.xpath( elem = content.xpath(
"//script[@id='serialized-server-data']/text()" "//script[@id='serialized-server-data']/text()"
) )