move cover download out of scrape()

This commit is contained in:
Your Name 2024-07-27 03:22:27 -04:00 committed by Henri Dickson
parent 976319247c
commit b4bdc58cad
16 changed files with 26 additions and 194 deletions

View file

@ -891,12 +891,24 @@ class ExternalResource(models.Model):
def update_content(self, resource_content: "ResourceContent"):
    """Persist scraped content onto this ExternalResource.

    Copies lookup ids and metadata from *resource_content*, downloads the
    cover image here (moved out of the per-site ``scrape()`` methods) if a
    ``cover_image_url`` is present and no image bytes were supplied, sets
    ``self.cover``, stamps ``scraped_time`` and saves the model.

    NOTE: the diff rendering had interleaved the removed ``else:`` line with
    its ``elif`` replacement; this is the reconstructed post-commit body.
    """
    self.other_lookup_ids = resource_content.lookup_ids
    self.metadata = resource_content.metadata
    if (
        resource_content.metadata.get("cover_image_url")
        and not resource_content.cover_image
    ):
        # Local import to avoid a circular dependency at module load time.
        from .downloaders import BasicImageDownloader

        # "extention" [sic] matches the existing ResourceContent field name;
        # renaming it would break other call sites.
        (
            resource_content.cover_image,
            resource_content.cover_image_extention,
        ) = BasicImageDownloader.download_image(
            resource_content.metadata.get("cover_image_url"), self.url
        )
    if resource_content.cover_image and resource_content.cover_image_extention:
        # Wrap the downloaded bytes so Django's ImageField can store them.
        self.cover = SimpleUploadedFile(
            "temp." + resource_content.cover_image_extention,
            resource_content.cover_image,
        )
    elif resource_content.metadata.get("cover_image_path"):
        # Site provided a ready-made file path instead of raw image bytes.
        self.cover = resource_content.metadata.get("cover_image_path")
    self.scraped_time = timezone.now()
    self.save()

View file

@ -56,8 +56,8 @@ class AppleMusic(AbstractSite):
def get_locales(self):
locales = {}
for l in PREFERRED_LANGUAGES:
match l:
for lang in PREFERRED_LANGUAGES:
match lang:
case "zh":
locales.update({"zh": ["cn", "tw", "hk", "sg"]})
case "en":
@ -94,10 +94,10 @@ class AppleMusic(AbstractSite):
brief = album_data.get("modalPresentationDescriptor", {}).get(
"paragraphText", ""
)
l = detect_language(title + " " + brief)
localized_title.append({"lang": l, "text": title})
tl = detect_language(title + " " + brief)
localized_title.append({"lang": tl, "text": title})
if brief:
localized_desc.append({"lang": l, "text": brief})
localized_desc.append({"lang": tl, "text": brief})
if lang == DEFAULT_CATALOG_LANGUAGE or not matched_content:
matched_content = content
break
@ -155,13 +155,4 @@ class AppleMusic(AbstractSite):
"cover_image_url": image_url,
}
)
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd

View file

@ -37,13 +37,4 @@ class ApplePodcast(AbstractSite):
}
)
pd.lookup_ids[IdType.RSS] = RSS.url_to_id(feed_url)
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd

View file

@ -102,13 +102,4 @@ class Bandcamp(AbstractSite):
"cover_image_url": cover_url,
}
pd = ResourceContent(metadata=data)
if data["cover_image_url"]:
imgdl = BasicImageDownloader(data["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(
f'failed to download cover for {self.url} from {data["cover_image_url"]}'
)
return pd

View file

@ -69,13 +69,4 @@ class BoardGameGeek(AbstractSite):
"cover_image_url": cover_image_url,
}
)
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd

View file

@ -76,15 +76,6 @@ class DiscogsRelease(AbstractSite):
)
if barcode:
pd.lookup_ids[IdType.GTIN] = barcode
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd
@ -122,15 +113,6 @@ class DiscogsMaster(AbstractSite):
"cover_image_url": image_url,
}
)
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd

View file

@ -236,9 +236,6 @@ class DoubanBook(AbstractSite):
if t:
pd.lookup_ids[t] = n
pd.lookup_ids[IdType.CUBN] = cubn
pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(
img_url, self.url
)
return pd

View file

@ -1,4 +1,3 @@
import logging
import re
from django.core.cache import cache
@ -10,8 +9,6 @@ from common.models.lang import detect_language
from .douban import DoubanDownloader
_logger = logging.getLogger(__name__)
def _cache_key(url):
return f"$:{url}"
@ -77,10 +74,10 @@ class DoubanDramaVersion(AbstractSite):
}
if data["opening_date"]:
d = data["opening_date"].split("-")
l = len(d) if len(d) < 6 else 6
if l > 3:
dl = len(d) if len(d) < 6 else 6
if dl > 3:
data["opening_date"] = "-".join(d[:3])
data["closing_date"] = "-".join(d[0 : 6 - l] + d[3:l])
data["closing_date"] = "-".join(d[0 : 6 - dl] + d[3:dl])
actor_elem = h.xpath(p + "//dt[text()='主演:']/following-sibling::dd[1]/a")
data["actor"] = []
for e in actor_elem:
@ -101,15 +98,6 @@ class DoubanDramaVersion(AbstractSite):
"url": show_url,
}
]
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd
@ -213,10 +201,10 @@ class DoubanDrama(AbstractSite):
data["opening_date"] = date_elem[0] if date_elem else None
if data["opening_date"]:
d = data["opening_date"].split("-")
l = len(d) if len(d) < 6 else 6
if l > 3:
dl = len(d) if len(d) < 6 else 6
if dl > 3:
data["opening_date"] = "-".join(d[:3])
data["closing_date"] = "-".join(d[0 : 6 - l] + d[3:l])
data["closing_date"] = "-".join(d[0 : 6 - dl] + d[3:dl])
data["location"] = [
s.strip()
@ -257,13 +245,4 @@ class DoubanDrama(AbstractSite):
data["localized_description"] = [{"lang": "zh-cn", "text": data["brief"]}]
pd = ResourceContent(metadata=data)
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd

View file

@ -37,7 +37,7 @@ class DoubanMovie(AbstractSite):
"\n", ""
) # strip \n bc multi-line string is not properly coded in json by douban
d = json.loads(schema_data) if schema_data else {}
except Exception as e:
except Exception:
d = {}
try:
@ -245,7 +245,6 @@ class DoubanMovie(AbstractSite):
"TVSeason" if is_series or episodes or season else "Movie"
)
tmdb_season_id = None
if imdb_code:
res_data = search_tmdb_by_imdb_id(imdb_code)
has_movie = (
@ -302,13 +301,4 @@ class DoubanMovie(AbstractSite):
]
# TODO parse sister seasons
# pd.metadata['related_resources'] = []
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd

View file

@ -133,13 +133,4 @@ class DoubanMusic(AbstractSite):
pd.lookup_ids[IdType.GTIN] = gtin
if isrc:
pd.lookup_ids[IdType.ISRC] = isrc
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd

View file

@ -119,15 +119,6 @@ class Goodreads(AbstractSite):
pd = ResourceContent(metadata=data)
pd.lookup_ids[IdType.ISBN] = ids.get(IdType.ISBN)
pd.lookup_ids[IdType.ASIN] = ids.get(IdType.ASIN)
if data["cover_image_url"]:
imgdl = BasicImageDownloader(data["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(
f'failed to download cover for {self.url} from {data["cover_image_url"]}'
)
return pd

View file

@ -152,13 +152,4 @@ class IGDB(AbstractSite):
pd.lookup_ids[IdType.Steam] = SiteManager.get_site_cls_by_id_type(
IdType.Steam
).url_to_id(steam_url)
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd

View file

@ -116,15 +116,6 @@ class IMDB(AbstractSite):
data["title"] = re.sub(r"#(\d+).(\d+)", r"S\1E\2", data["title"][8:])
pd = ResourceContent(metadata=data)
pd.lookup_ids[IdType.IMDB] = self.id_value
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd
@staticmethod

View file

@ -84,7 +84,7 @@ class RSS(AbstractSite):
def scrape(self):
if not self.url:
raise ValueError(f"no url avaialble in RSS site")
raise ValueError("no url avaialble in RSS site")
feed = self.parse_feed_from_url(self.url)
if not feed:
raise ValueError(f"no feed avaialble in {self.url}")
@ -108,17 +108,6 @@ class RSS(AbstractSite):
}
)
pd.lookup_ids[IdType.RSS] = RSS.url_to_id(self.url)
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(
pd.metadata["cover_image_url"], feed.get("link") or self.url
)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.warn(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd
def scrape_additional_data(self):

View file

@ -103,15 +103,6 @@ class Spotify(AbstractSite):
pd.lookup_ids[IdType.GTIN] = gtin
if isrc:
pd.lookup_ids[IdType.ISRC] = isrc
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd

View file

@ -172,15 +172,6 @@ class TMDB_Movie(AbstractSite):
)
if imdb_code:
pd.lookup_ids[IdType.IMDB] = imdb_code
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd
@ -285,15 +276,6 @@ class TMDB_TV(AbstractSite):
)
if imdb_code:
pd.lookup_ids[IdType.IMDB] = imdb_code
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd
@ -380,15 +362,6 @@ class TMDB_TVSeason(AbstractSite):
map(lambda ep: ep["episode_number"], d["episodes"])
)
pd.metadata["episode_count"] = len(pd.metadata["episode_number_list"])
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
# use show's IMDB (for Season 1) or 1st episode's IMDB (if not Season 1) as this season's IMDB so that it can be compatible with TVSeason data from Douban
if pd.lookup_ids.get(IdType.IMDB):
@ -486,15 +459,6 @@ class TMDB_TVEpisode(AbstractSite):
if pd.metadata["title"]
else f'S{d["season_number"]} E{d["episode_number"]}'
)
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
if pd.lookup_ids.get(IdType.IMDB):
pd.lookup_ids[IdType.IMDB] = pd.lookup_ids[IdType.IMDB]