From 5b4c9d3048fbdd676d546d6adb01d3a47e793ac8 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 22 Feb 2025 17:11:35 -0500 Subject: [PATCH] handle item remote url change --- catalog/common/downloaders.py | 2 +- catalog/common/sites.py | 58 +++++++++++++++++++++++++---------- catalog/sites/apple_music.py | 6 ++-- 3 files changed, 44 insertions(+), 22 deletions(-) diff --git a/catalog/common/downloaders.py b/catalog/common/downloaders.py index 959d7a6f..c4da7244 100644 --- a/catalog/common/downloaders.py +++ b/catalog/common/downloaders.py @@ -66,7 +66,7 @@ class MockResponse: try: self.content = Path(fn).read_bytes() self.status_code = 200 - logger.debug(f"use local response for {url} from {fn}") + # logger.debug(f"use local response for {url} from {fn}") except Exception: self.content = b"Error: response file not found" self.status_code = 404 diff --git a/catalog/common/sites.py b/catalog/common/sites.py index 16e3dd11..5188eb22 100644 --- a/catalog/common/sites.py +++ b/catalog/common/sites.py @@ -10,11 +10,13 @@ ResourceContent persists as an ExternalResource which may link to an Item import json import re from dataclasses import dataclass, field +from hashlib import md5 from typing import TYPE_CHECKING, Type, TypeVar import django_rq import requests from django.conf import settings +from django.core.cache import cache from loguru import logger from validators import url as url_validate @@ -302,6 +304,22 @@ class SiteManager: else: raise ValueError(f"Site for {typ} not found") + @staticmethod + def get_redirected_url(url: str) -> str: + k = "_redir_" + md5(url.encode()).hexdigest() + u = cache.get(k, default=None) + if u == "": + return url + elif u: + return u + try: + u = requests.head(url, allow_redirects=True, timeout=1).url + except requests.RequestException: + logger.warning(f"HEAD timeout: {url}") + u = url + cache.set(k, u if u != url else "", 3600) + return u + @staticmethod def get_site_by_url(url: str) -> AbstractSite | None: if not url or not url_validate( @@ -312,25 +330,29 @@ class SiteManager: strict_query=False, ): return None + u = SiteManager.get_redirected_url(url) cls = next( - filter(lambda p: p.validate_url(url), SiteManager.registry.values()), None + filter(lambda p: p.validate_url(u), SiteManager.registry.values()), None ) - if cls is None and re.match(r"^https?://(spotify.link|t.co)/.+", url): - try: - url2 = requests.head(url, allow_redirects=True, timeout=1).url - if url2 != url: - cls = next( - filter( - lambda p: p.validate_url(url2), - SiteManager.registry.values(), - ), - None, - ) - if cls: - url = url2 - except Exception: - pass + if cls is None and u != url: + cls = next( + filter( + lambda p: p.validate_url(url), + SiteManager.registry.values(), + ), + None, + ) + if cls: + u = url if cls is None: + cls = next( + filter( + lambda p: p.validate_url_fallback(u), + SiteManager.registry.values(), + ), + None, + ) + if cls is None and u != url: cls = next( filter( lambda p: p.validate_url_fallback(url), @@ -338,7 +360,9 @@ class SiteManager: ), None, ) - return cls(url) if cls else None + if cls: + u = url + return cls(u) if cls else None @staticmethod def get_site_by_id(id_type: IdType | str, id_value: str) -> AbstractSite | None: diff --git a/catalog/sites/apple_music.py b/catalog/sites/apple_music.py index b6149138..19b89d98 100644 --- a/catalog/sites/apple_music.py +++ b/catalog/sites/apple_music.py @@ -9,9 +9,9 @@ Scraping the website directly. """ import json -import logging import dateparser +from loguru import logger from catalog.common import * from catalog.models import * @@ -24,8 +24,6 @@ from common.models.misc import uniq from .douban import * -_logger = logging.getLogger(__name__) - @SiteManager.register class AppleMusic(AbstractSite): @@ -82,7 +80,7 @@ class AppleMusic(AbstractSite): content = ( BasicDownloader(url, headers=self.headers).download().html() ) - _logger.info(f"got localized content from {url}") + logger.info(f"got localized content from {url}") elem = content.xpath( "//script[@id='serialized-server-data']/text()" )