lint fix site and import

Your Name 2023-08-11 11:55:42 -04:00 committed by Henri Dickson
parent bcd35f3526
commit 4649a109bd
16 changed files with 151 additions and 122 deletions


@@ -98,6 +98,10 @@ class AbstractSite:
     def scrape_additional_data(self):
         pass

+    @staticmethod
+    def query_str(content, query: str) -> str:
+        return content.xpath(query)[0].strip()
+
     @classmethod
     def get_model_for_resource(cls, resource):
         model = resource.get_preferred_model()
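For reference, a self-contained sketch of what the new query_str helper does, tried against a throwaway lxml document (the sample HTML and the standalone wrapper are illustrative only; in the codebase the scrapers below call it as self.query_str(content, ...)):

    from lxml import html

    def query_str(content, query: str) -> str:
        # same body as the helper added above: first XPath match, stripped;
        # raises IndexError when the query matches nothing
        return content.xpath(query)[0].strip()

    doc = html.fromstring("<html><h1> Example Title </h1></html>")
    print(query_str(doc, "//h1/text()"))  # -> "Example Title"

Because it still raises IndexError on a missing node, callers that need a fallback wrap it in try/except, as the Goodreads_Work change further down does.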


@@ -57,7 +57,8 @@ class AppleMusic(AbstractSite):
         if content is None:
             raise ParseError(self, f"localized content for {self.url}")
         elem = content.xpath("//script[@id='serialized-server-data']/text()")
-        page_data = json.loads(elem[0])[0]
+        txt: str = elem[0]  # type:ignore
+        page_data = json.loads(txt)[0]
         album_data = page_data["data"]["sections"][0]["items"][0]
         title = album_data["title"]
         brief = album_data.get("modalPresentationDescriptor")
@@ -67,11 +68,11 @@ class AppleMusic(AbstractSite):
         track_data = page_data["data"]["seoData"]
         date_elem = track_data.get("musicReleaseDate")
+        release_datetime = dateparser.parse(date_elem.strip()) if date_elem else None
         release_date = (
-            dateparser.parse(date_elem.strip()).strftime("%Y-%m-%d")
-            if date_elem
-            else None
+            release_datetime.strftime("%Y-%m-%d") if release_datetime else None
         )
         track_list = [
             f"{i}. {track['attributes']['name']}"
             for i, track in enumerate(track_data["ogSongs"], 1)
@@ -87,7 +88,10 @@ class AppleMusic(AbstractSite):
                 genre[0]
             ]  # apple treat "Music" as a genre. Thus, only the first genre is obtained.
-        image_elem = content.xpath("//source[@type='image/jpeg']/@srcset")[0]
+        images = (
+            content.xpath("//source[@type='image/jpeg']/@srcset") if content else []
+        )
+        image_elem: str = images[0] if images else ""  # type:ignore
         image_url = image_elem.split(" ")[0] if image_elem else None
         pd = ResourceContent(


@@ -4,6 +4,7 @@ import re
 import urllib.parse

 import dateparser
+import dns.resolver

 from catalog.common import *
 from catalog.models import *
@@ -32,14 +33,14 @@ class Bandcamp(AbstractSite):
         hostname = parsed_url.netloc
         try:
             answers = dns.resolver.query(hostname, "CNAME")
-            for rdata in answers:
+            for rdata in answers:  # type:ignore
                 if str(rdata.target) == "dom.bandcamp.com.":
                     return True
         except Exception:
             pass
         try:
             answers = dns.resolver.query(hostname, "A")
-            for rdata in answers:
+            for rdata in answers:  # type:ignore
                 if str(rdata.address) == "35.241.62.186":
                     return True
         except Exception:
@@ -48,32 +49,36 @@ class Bandcamp(AbstractSite):
     def scrape(self):
         content = BasicDownloader(self.url).download().html()
         try:
-            title = content.xpath("//h2[@class='trackTitle']/text()")[0].strip()
+            title = self.query_str(content, "//h2[@class='trackTitle']/text()")
             artist = [
-                content.xpath("//div[@id='name-section']/h3/span/a/text()")[0].strip()
+                self.query_str(content, "//div[@id='name-section']/h3/span/a/text()")
             ]
         except IndexError:
             raise ValueError("given url contains no valid info")
         genre = []  # TODO: parse tags
         track_list = ""
-        release_nodes = content.xpath(
-            "//div[@class='tralbumData tralbum-credits']/text()"
-        )
-        release_date = (
-            dateparser.parse(
-                re.sub(r"releas\w+ ", "", release_nodes[0].strip())
-            ).strftime("%Y-%m-%d")
-            if release_nodes
-            else None
-        )
+        try:
+            release_str = re.sub(
+                r"releas\w+ ",
+                "",
+                self.query_str(
+                    content, "//div[@class='tralbumData tralbum-credits']/text()"
+                ),
+            )
+            release_datetime = dateparser.parse(release_str) if release_str else None
+            release_date = (
+                release_datetime.strftime("%Y-%m-%d") if release_datetime else None
+            )
+        except:
+            release_date = None
         duration = None
         company = None
         brief_nodes = content.xpath("//div[@class='tralbumData tralbum-about']/text()")
-        brief = "".join(brief_nodes) if brief_nodes else None
-        cover_url = content.xpath("//div[@id='tralbumArt']/a/@href")[0].strip()
+        brief = "".join(brief_nodes) if brief_nodes else None  # type:ignore
+        cover_url = self.query_str(content, "//div[@id='tralbumArt']/a/@href")
         bandcamp_page_data = json.loads(
-            content.xpath("//meta[@name='bc-page-properties']/@content")[0].strip()
+            self.query_str(content, "//meta[@name='bc-page-properties']/@content")
         )
         bandcamp_album_id = bandcamp_page_data["item_id"]


@@ -17,7 +17,7 @@ class Bangumi(AbstractSite):
     DEFAULT_MODEL = None

     @classmethod
-    def id_to_url(self, id_value):
+    def id_to_url(cls, id_value):
         return f"https://bgm.tv/subject/{id_value}"

     def scrape(self):


@@ -29,7 +29,7 @@ class BooksTW(AbstractSite):
         isbn_elem = content.xpath(
             "//div[@class='bd']/ul/li[starts-with(text(),'ISBN')]/text()"
         )
-        isbn = isbn_elem[0].strip().split("：", 1)[1].strip() if isbn_elem else None
+        isbn = isbn_elem[0].strip().split("：", 1)[1].strip() if isbn_elem else None  # type: ignore

         # isbn_elem = content.xpath(
         #     "//div[@class='bd']/ul/li[starts-with(text(),'EISBN')]/text()"
@@ -43,26 +43,26 @@ class BooksTW(AbstractSite):
         orig_title = content.xpath("string(//h1/following-sibling::h2)")

         authors = content.xpath("string(//div/ul/li[contains(text(),'作者:')])")
-        authors = authors.strip().split("：", 1)[1].split(",") if authors else []
+        authors = authors.strip().split("：", 1)[1].split(",") if authors else []  # type: ignore
         if not authors:
             authors = [content.xpath("string(//div/ul/li[contains(.,'作者:')]/a)")]
-        authors = [s.strip() for s in authors]
+        authors = [s.strip() for s in authors]  # type: ignore
         # author_orig = content.xpath("string(//div/ul/li[contains(text(),'原文作者:')])")

         translators = content.xpath("string(//div/ul/li[contains(text(),'譯者:')])")
         translators = (
-            translators.strip().split("：", 1)[1].split(",") if translators else []
+            translators.strip().split("：", 1)[1].split(",") if translators else []  # type: ignore
         )
         translators = [s.strip() for s in translators]

         language_elem = content.xpath("//div/ul/li[starts-with(text(),'語言:')]/text()")
         language = (
-            language_elem[0].strip().split("：")[1].strip() if language_elem else None
+            language_elem[0].strip().split("：")[1].strip() if language_elem else None  # type: ignore
         )

         pub_house = content.xpath("string(//div/ul/li[contains(text(),'出版社:')])")
         pub_house = (
-            pub_house.strip().split("：", 1)[1].strip().split(" ", 1)[0]
+            pub_house.strip().split("：", 1)[1].strip().split(" ", 1)[0]  # type: ignore
             if pub_house
             else None
         )
@@ -70,7 +70,7 @@ class BooksTW(AbstractSite):
         pub_date = content.xpath("string(//div/ul/li[contains(text(),'出版日期:')])")
         pub_date = re.match(
             r"(\d+)/(\d+)/(\d+)\s*$",
-            pub_date.strip().split("：", 1)[1].strip().split(" ", 1)[0]
+            pub_date.strip().split("：", 1)[1].strip().split(" ", 1)[0]  # type: ignore
             if pub_date
             else "",
         )
@@ -82,10 +82,10 @@ class BooksTW(AbstractSite):
             pub_month = None

         spec = content.xpath("string(//div/ul/li[contains(text(),'規格:')])")
-        spec = spec.strip().split("：", 1)[1].strip().split("/") if spec else []
+        spec = spec.strip().split("：", 1)[1].strip().split("/") if spec else []  # type: ignore
         if len(spec) > 1:
             binding = spec[0].strip()
-            pages = spec[1].strip().split("頁")
+            pages = str(spec[1].strip()).split("頁")
             pages = int(pages[0]) if len(pages) > 1 else None
             if pages and (pages > 999999 or pages < 1):
                 pages = None
@@ -95,7 +95,7 @@ class BooksTW(AbstractSite):
         price = content.xpath("string(//div/ul/li[contains(text(),'定價:')])")
         price = (
-            price.strip().split("：", 1)[1].split("元")[0].strip() + " NTD"
+            price.strip().split("：", 1)[1].split("元")[0].strip() + " NTD"  # type: ignore
             if price
             else None
         )
@@ -111,7 +111,7 @@ class BooksTW(AbstractSite):
         img_url = content.xpath(
             "string(//div[contains(@class,'cover_img')]//img[contains(@class,'cover')]/@src)"
         )
-        img_url = re.sub(r"&[wh]=\d+", "", img_url) if img_url else None
+        img_url = re.sub(r"&[wh]=\d+", "", img_url) if img_url else None  # type: ignore
         data = {
             "title": title,


@@ -53,8 +53,7 @@ class Goodreads(AbstractSite):
             h = dl.download().html()
             # Next.JS version of GoodReads
             # JSON.parse(document.getElementById('__NEXT_DATA__').innerHTML)['props']['pageProps']['apolloState']
-            elem = h.xpath('//script[@id="__NEXT_DATA__"]/text()')
-            src = elem[0].strip() if elem else None
+            src = self.query_str(h, '//script[@id="__NEXT_DATA__"]/text()')
             if not src:
                 raise ParseError(self, "__NEXT_DATA__ element")
             d = json.loads(src)["props"]["pageProps"]["apolloState"]
@@ -134,16 +133,14 @@ class Goodreads_Work(AbstractSite):
     def scrape(self, response=None):
         content = BasicDownloader(self.url).download().html()
-        title_elem = content.xpath("//h1/a/text()")
-        title = title_elem[0].strip() if title_elem else None
+        title = self.query_str(content, "//h1/a/text()")
         if not title:
             raise ParseError(self, "title")
-        author_elem = content.xpath("//h2/a/text()")
-        author = author_elem[0].strip() if author_elem else None
-        first_published_elem = content.xpath("//h2/span/text()")
-        first_published = (
-            first_published_elem[0].strip() if first_published_elem else None
-        )
+        author = self.query_str(content, "//h2/a/text()")
+        try:
+            first_published = self.query_str(content, "//h2/span/text()")
+        except:
+            first_published = None
         pd = ResourceContent(
             metadata={
                 "title": title,


@@ -20,7 +20,7 @@ class GoogleBooks(AbstractSite):
     DEFAULT_MODEL = Edition

     @classmethod
-    def id_to_url(self, id_value):
+    def id_to_url(cls, id_value):
         return "https://books.google.com/books?id=" + id_value

     def scrape(self):


@@ -61,7 +61,7 @@ class IGDB(AbstractSite):
     if get_mock_mode():
         r = BasicDownloader(key).download().json()
     else:
-        r = json.loads(_wrapper.api_request(p, q))
+        r = json.loads(_wrapper.api_request(p, q))  # type: ignore
         if settings.DOWNLOADER_SAVEDIR:
             with open(
                 settings.DOWNLOADER_SAVEDIR + "/" + get_mock_file(key),


@@ -57,11 +57,16 @@ class IMDB(AbstractSite):
             season_number = res_data["tv_episode_results"][0]["season_number"]
             episode_number = res_data["tv_episode_results"][0]["episode_number"]
             url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}/episode/{episode_number}"
+        pd = None
         if url:
             tmdb = SiteManager.get_site_by_url(url)
-            pd = tmdb.scrape()
-            pd.metadata["preferred_model"] = tmdb.DEFAULT_MODEL.__name__
-            pd.metadata["required_resources"] = []  # do not auto fetch parent season
+            if tmdb:
+                pd = tmdb.scrape()
+                pd.metadata["preferred_model"] = (
+                    tmdb.DEFAULT_MODEL.__name__ if tmdb.DEFAULT_MODEL else None
+                )
+                # do not auto fetch parent season
+                pd.metadata["required_resources"] = []
         if not pd:
             # if IMDB id not found in TMDB, use real IMDB scraper
             pd = self.scrape_imdb()
@@ -69,8 +74,7 @@ class IMDB(AbstractSite):
     def scrape_imdb(self):
         h = BasicDownloader(self.url).download().html()
-        elem = h.xpath('//script[@id="__NEXT_DATA__"]/text()')
-        src = elem[0].strip() if elem else None
+        src = self.query_str(h, '//script[@id="__NEXT_DATA__"]/text()')
         if not src:
             raise ParseError(self, "__NEXT_DATA__ element")
         d = json.loads(src)["props"]["pageProps"]["aboveTheFoldData"]
@@ -120,15 +124,14 @@ class IMDB(AbstractSite):
     def get_episode_list(show_id, season_id):
         url = f"https://m.imdb.com/title/{show_id}/"
         h = BasicDownloader(url).download().html()
-        show_url = "".join(
-            h.xpath('//a[@data-testid="hero-title-block__series-link"]/@href')
-        ).split("?")[0]
+        u: str = h.xpath('//a[@data-testid="hero-title-block__series-link"]/@href')  # type: ignore
+        show_url = "".join(u).split("?")[0]
         if not show_url:
             show_url = f"/title/{show_id}/"
         url = f"https://m.imdb.com{show_url}episodes/?season={season_id}"
         h = BasicDownloader(url).download().html()
         episodes = []
-        for e in h.xpath('//div[@id="eplist"]/div/a'):
+        for e in h.xpath('//div[@id="eplist"]/div/a'):  # type: ignore
             episode_number = e.xpath(
                 './span[contains(@class,"episode-list__title")]/text()'
             )[0].strip()
@@ -166,9 +169,12 @@ class IMDB(AbstractSite):
                 ).first()
                 if not episode:
                     site = SiteManager.get_site_by_url(e["url"])
-                    episode = site.get_resource_ready().item
-                    episode.set_parent_item(season)
-                    episode.save()
+                    if site:
+                        res = site.get_resource_ready()
+                        if res and res.item:
+                            episode = res.item
+                            episode.set_parent_item(season)
+                            episode.save()
         else:
             _logger.warning(f"season {season} has no episodes fetched, creating dummy")
             cnt = int(season.episode_count or 0)


@@ -68,8 +68,8 @@ class Spotify(AbstractSite):
             else:
                 track_list.append(str(track["track_number"]) + ". " + track["name"])
         track_list = "\n".join(track_list)
-        release_date = dateparser.parse(res_data["release_date"]).strftime("%Y-%m-%d")
+        dt = dateparser.parse(res_data["release_date"])
+        release_date = dt.strftime("%Y-%m-%d") if dt else None
         gtin = None
         if res_data["external_ids"].get("upc"):
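The Spotify change above uses the same guard that the AppleMusic, Bandcamp and Steam hunks introduce: dateparser.parse() returns None when it cannot interpret the input, so strftime is only called on a non-None result. A minimal, self-contained sketch of the pattern (the helper name is made up for illustration):

    import dateparser

    def to_iso_date(raw):
        # hypothetical helper mirroring the guard used in these scrapers
        dt = dateparser.parse(raw) if raw else None
        return dt.strftime("%Y-%m-%d") if dt else None

    print(to_iso_date("Aug 11, 2023"))  # -> "2023-08-11"
    print(to_iso_date("not a date"))    # -> None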


@@ -31,30 +31,29 @@ class Steam(AbstractSite):
         headers["Cookie"] = "wants_mature_content=1; birthtime=754700401;"
         content = BasicDownloader(self.url, headers=headers).download().html()
-        title = content.xpath("//div[@class='apphub_AppName']/text()")[0]
+        title = self.query_str(content, "//div[@class='apphub_AppName']/text()")
         developer = content.xpath("//div[@id='developers_list']/a/text()")
         publisher = content.xpath(
             "//div[@class='glance_ctn']//div[@class='dev_row'][2]//a/text()"
         )
-        dt = content.xpath("//div[@class='release_date']/div[@class='date']/text()")
-        release_date = (
-            dateparser.parse(dt[0].replace(" ", "")).strftime("%Y-%m-%d")
-            if dt
-            else None
-        )
+        dts = self.query_str(
+            content, "//div[@class='release_date']/div[@class='date']/text()"
+        )
+        dt = dateparser.parse(dts.replace(" ", "")) if dts else None
+        release_date = dt.strftime("%Y-%m-%d") if dt else None
         genre = content.xpath(
             "//div[@class='details_block']/b[2]/following-sibling::a/text()"
         )
         platform = ["PC"]
-        brief = content.xpath("//div[@class='game_description_snippet']/text()")[
-            0
-        ].strip()
+        brief = self.query_str(
+            content, "//div[@class='game_description_snippet']/text()"
+        )
         # try Steam images if no image from IGDB
         if pd.cover_image is None:
-            pd.metadata["cover_image_url"] = content.xpath(
-                "//img[@class='game_header_image_full']/@src"
-            )[0].replace("header.jpg", "library_600x900.jpg")
+            pd.metadata["cover_image_url"] = self.query_str(
+                content, "//img[@class='game_header_image_full']/@src"
+            ).replace("header.jpg", "library_600x900.jpg")
             (
                 pd.cover_image,
                 pd.cover_image_extention,
@@ -62,9 +61,9 @@ class Steam(AbstractSite):
                 pd.metadata["cover_image_url"], self.url
             )
         if pd.cover_image is None:
-            pd.metadata["cover_image_url"] = content.xpath(
-                "//img[@class='game_header_image_full']/@src"
-            )[0]
+            pd.metadata["cover_image_url"] = self.query_str(
+                content, "//img[@class='game_header_image_full']/@src"
+            )
             (
                 pd.cover_image,
                 pd.cover_image_extention,


@@ -44,7 +44,7 @@ class TMDB_Movie(AbstractSite):
     DEFAULT_MODEL = Movie

     @classmethod
-    def id_to_url(self, id_value):
+    def id_to_url(cls, id_value):
         return f"https://www.themoviedb.org/movie/{id_value}"

     def scrape(self):
@@ -178,7 +178,7 @@ class TMDB_TV(AbstractSite):
     DEFAULT_MODEL = TVShow

     @classmethod
-    def id_to_url(self, id_value):
+    def id_to_url(cls, id_value):
         return f"https://www.themoviedb.org/tv/{id_value}"

     def scrape(self):
@@ -338,6 +338,8 @@ class TMDB_TVSeason(AbstractSite):
         return f"https://www.themoviedb.org/tv/{v[0]}/season/{v[1]}"

     def scrape(self):
+        if not self.id_value:
+            raise ParseError(self, "id_value")
         v = self.id_value.split("-")
         show_id = v[0]
         season_id = v[1]
@@ -346,7 +348,7 @@ class TMDB_TVSeason(AbstractSite):
         api_url = f"https://api.themoviedb.org/3/tv/{show_id}/season/{season_id}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
         d = BasicDownloader(api_url).download().json()
         if not d.get("id"):
-            raise ParseError("id")
+            raise ParseError(self, "id")
         pd = ResourceContent(
             metadata=_copy_dict(
                 d,
@@ -401,12 +403,14 @@ class TMDB_TVSeason(AbstractSite):
                 # this should not happen
                 _logger.warning("Unexpected IMDB id for TMDB tv season")
         elif pd.metadata.get("season_number") == 1:
-            res = SiteManager.get_site_by_url(
+            site = SiteManager.get_site_by_url(
                 f"https://www.themoviedb.org/tv/{show_id}"
-            ).get_resource_ready()
-            pd.lookup_ids[IdType.IMDB] = (
-                res.other_lookup_ids.get(IdType.IMDB) if res else None
             )
+            if site:
+                res = site.get_resource_ready()
+                pd.lookup_ids[IdType.IMDB] = (
+                    res.other_lookup_ids.get(IdType.IMDB) if res else None
+                )
         elif len(pd.metadata["episode_number_list"]) == 0:
             _logger.warning(
                 "Unable to lookup IMDB id for TMDB tv season with zero episodes"
@@ -416,7 +420,7 @@ class TMDB_TVSeason(AbstractSite):
             api_url2 = f"https://api.themoviedb.org/3/tv/{v[0]}/season/{v[1]}/episode/{ep}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
             d2 = BasicDownloader(api_url2).download().json()
             if not d2.get("id"):
-                raise ParseError("first episode id for season")
+                raise ParseError(self, "first episode id for season")
             pd.lookup_ids[IdType.IMDB] = d2["external_ids"].get("imdb_id")
         return pd
@@ -445,6 +449,8 @@ class TMDB_TVEpisode(AbstractSite):
         return f"https://www.themoviedb.org/tv/{v[0]}/season/{v[1]}/episode/{v[2]}"

     def scrape(self):
+        if not self.id_value:
+            raise ParseError(self, "id_value")
         v = self.id_value.split("-")
         show_id = v[0]
         season_id = v[1]
@@ -454,7 +460,7 @@ class TMDB_TVEpisode(AbstractSite):
         api_url = f"https://api.themoviedb.org/3/tv/{show_id}/season/{season_id}/episode/{episode_id}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
         d = BasicDownloader(api_url).download().json()
         if not d.get("id"):
-            raise ParseError("id")
+            raise ParseError(self, "id")
         pd = ResourceContent(
             metadata=_copy_dict(
                 d,


@@ -60,7 +60,11 @@
             <td>{{ value.1|default:"-" }}</td>
           </tr>
         {% empty %}
-          <p>No data.</p>
+          <tr>
+            <td>-</td>
+            <td></td>
+            <td></td>
+          </tr>
         {% endfor %}
       {% empty %}
         <p>No history for this item has been logged yet.</p>


@@ -8,6 +8,7 @@ import openpyxl
 import pytz
 from auditlog.context import set_actor
 from django.conf import settings
+from loguru import logger
 from markdownify import markdownify as md
 from user_messages import api as msg
@@ -18,28 +19,27 @@ from catalog.sites.douban import DoubanDownloader
 from common.utils import GenerateDateUUIDMediaFilePath
 from journal.models import *

-_logger = logging.getLogger(__name__)
 _tz_sh = pytz.timezone("Asia/Shanghai")


 def _fetch_remote_image(url):
     try:
-        print(f"fetching remote image {url}")
+        logger.info(f"fetching remote image {url}")
         imgdl = ProxiedImageDownloader(url)
         raw_img = imgdl.download().content
         ext = imgdl.extention
         f = GenerateDateUUIDMediaFilePath(
-            None, "x." + ext, settings.MARKDOWNX_MEDIA_PATH
+            None, f"x.{ext}", settings.MARKDOWNX_MEDIA_PATH
         )
         file = settings.MEDIA_ROOT + f
         local_url = settings.MEDIA_URL + f
         os.makedirs(os.path.dirname(file), exist_ok=True)
         with open(file, "wb") as binary_file:
             binary_file.write(raw_img)
-        # print(f'remote image saved as {local_url}')
+        # logger.info(f'remote image saved as {local_url}')
         return local_url
     except Exception:
-        print(f"unable to fetch remote image {url}")
+        logger.error(f"unable to fetch remote image {url}")
         return url
@@ -49,10 +49,9 @@ class DoubanImporter:
     skipped = 0
     imported = 0
     failed = []
-    user = None
     visibility = 0
     mode = 0
-    file = None
+    file = ""

     def __init__(self, user, visibility, mode):
         self.user = user
@@ -149,7 +148,7 @@ class DoubanImporter:
         for name in config:
             data[name] = []
             if name in wb:
-                print(f"{self.user} parsing {name}")
+                logger.info(f"{self.user} parsing {name}")
                 for row in wb[name].iter_rows(min_row=2, values_only=True):
                     cells = [cell for cell in row]
                     if len(cells) > 6 and cells[0]:
@@ -189,12 +188,12 @@ class DoubanImporter:
     #     return cells[3]

     def import_from_file_task(self):
-        print(f"{self.user} import start")
+        logger.info(f"{self.user} import start")
         msg.info(self.user, f"开始导入豆瓣标记和评论")
         self.update_user_import_status(1)
         with set_actor(self.user):
             self.load_sheets()
-            print(f"{self.user} sheet loaded, {self.total} lines total")
+            logger.info(f"{self.user} sheet loaded, {self.total} lines total")
             self.update_user_import_status(1)
             for name, param in self.mark_sheet_config.items():
                 self.import_mark_sheet(self.mark_data[name], param[0], name)
@@ -211,7 +210,7 @@ class DoubanImporter:
     def import_mark_sheet(self, worksheet, shelf_type, sheet_name):
         prefix = f"{self.user} {sheet_name}|"
         if worksheet is None:  # or worksheet.max_row < 2:
-            print(f"{prefix} empty sheet")
+            logger.warning(f"{prefix} empty sheet")
             return
         for cells in worksheet:
             if len(cells) < 6:
@@ -244,7 +243,7 @@ class DoubanImporter:
         """
         item = self.get_item_by_url(url)
         if not item:
-            print(f"{self.user} | match/fetch {url} failed")
+            logger.warning(f"{self.user} | match/fetch {url} failed")
             return
         mark = Mark(self.user, item)
         if self.mode == 0 and (
@@ -268,7 +267,7 @@ class DoubanImporter:
     def import_review_sheet(self, worksheet, sheet_name):
         prefix = f"{self.user} {sheet_name}|"
         if worksheet is None:  # or worksheet.max_row < 2:
-            print(f"{prefix} empty sheet")
+            logger.warning(f"{prefix} empty sheet")
             return
         for cells in worksheet:
             if len(cells) < 6:
@@ -307,17 +306,18 @@ class DoubanImporter:
         item = None
         try:
             site = SiteManager.get_site_by_url(url)
+            if not site:
+                raise ValueError(f"URL unrecognized {url}")
             item = site.get_item()
             if not item:
-                print(f"fetching {url}")
+                logger.info(f"fetching {url}")
                 site.get_resource_ready()
                 item = site.get_item()
             else:
-                # print(f"matched {url}")
+                # logger.info(f"matched {url}")
                 print(".", end="", flush=True)
         except Exception as e:
-            print(f"fetching exception: {url} {e}")
-            _logger.error(f"scrape failed: {url}", exc_info=e)
+            logger.error(f"fetching exception: {url} {e}")
         if item is None:
             self.failed.append(url)
         return item
@@ -329,23 +329,24 @@ class DoubanImporter:
         prefix = f"{self.user} |"
         url = self.guess_entity_url(entity_title, rating, time)
         if url is None:
-            print(f"{prefix} fetching review {review_url}")
+            logger.info(f"{prefix} fetching review {review_url}")
             try:
                 h = DoubanDownloader(review_url).download().html()
-                for u in h.xpath("//header[@class='main-hd']/a/@href"):
+                urls = h.xpath("//header[@class='main-hd']/a/@href")
+                for u in urls:  # type:ignore
                     if ".douban.com/subject/" in u:
                         url = u
                 if not url:
-                    print(
+                    logger.warning(
                         f"{prefix} fetching error {review_url} unable to locate entity url"
                     )
                     return
             except Exception:
-                print(f"{prefix} fetching review exception {review_url}")
+                logger.error(f"{prefix} fetching review exception {review_url}")
                 return
         item = self.get_item_by_url(url)
         if not item:
-            print(f"{prefix} match/fetch {url} failed")
+            logger.warning(f"{prefix} match/fetch {url} failed")
             return
         if (
             self.mode == 1
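The print-to-logger swap in this importer relies on loguru's ready-made module-level logger, so the removed _logger = logging.getLogger(__name__) setup (and its one remaining call site) is no longer needed. A minimal sketch of the replacement pattern, with illustrative messages only:

    from loguru import logger

    url = "https://example.com/item"
    logger.info(f"fetching {url}")               # was: print(f"fetching {url}")
    logger.warning(f"match/fetch {url} failed")  # non-fatal problems
    try:
        raise RuntimeError("boom")
    except Exception as e:
        logger.error(f"fetching exception: {url} {e}")  # errors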


@@ -3,6 +3,7 @@ from datetime import datetime
 import django_rq
 from auditlog.context import set_actor
+from django.utils import timezone
 from django.utils.timezone import make_aware
 from user_messages import api as msg
@@ -45,12 +46,12 @@ class GoodreadsImporter:
         total = 0
         visibility = user.preference.default_visibility
         with set_actor(user):
-            if match_list or match_shelf:
-                shelf = (
-                    cls.parse_shelf(match_shelf[0], user)
-                    if match_shelf
-                    else cls.parse_list(match_list[0], user)
-                )
+            shelf = None
+            if match_shelf:
+                shelf = cls.parse_shelf(match_shelf[0], user)
+            elif match_list:
+                shelf = cls.parse_list(match_list[0], user)
+            if shelf:
                 if shelf["title"] and shelf["books"]:
                     collection = Collection.objects.create(
                         title=shelf["title"],
@@ -119,7 +120,7 @@ class GoodreadsImporter:
     @classmethod
     def parse_shelf(cls, url, user):
         # return {'title': 'abc', books: [{'book': obj, 'rating': 10, 'review': 'txt'}, ...]}
-        title = None
+        title = ""
         books = []
         url_shelf = url + "&view=table"
         while url_shelf:
@@ -205,7 +206,7 @@ class GoodreadsImporter:
                 pass  # likely just download error
             next_elem = content.xpath("//a[@class='next_page']/@href")
             url_shelf = (
-                f"https://www.goodreads.com{next_elem[0].strip()}"
+                f"https://www.goodreads.com{next_elem[0].strip()}"  # type:ignore
                 if next_elem
                 else None
             )
@@ -214,8 +215,8 @@ class GoodreadsImporter:
     @classmethod
     def parse_list(cls, url, user):
         # return {'title': 'abc', books: [{'book': obj, 'rating': 10, 'review': 'txt'}, ...]}
-        title = None
-        description = None
+        title = ""
+        description = ""
         books = []
         url_shelf = url
         while url_shelf:
@@ -225,10 +226,12 @@ class GoodreadsImporter:
             if not title_elem:
                 print(f"List parsing error {url_shelf}")
                 break
-            title = title_elem[0].strip()
-            description = content.xpath('//div[@class="mediumText"]/text()')[0].strip()
+            title: str = title_elem[0].strip()  # type:ignore
+            desc_elem = content.xpath('//div[@class="mediumText"]/text()')
+            description: str = desc_elem[0].strip()  # type:ignore
             print("List title: " + title)
-            for link in content.xpath('//a[@class="bookTitle"]/@href'):
+            links = content.xpath('//a[@class="bookTitle"]/@href')
+            for link in links:  # type:ignore
                 url_book = "https://www.goodreads.com" + link
                 try:
                     book = cls.get_book(url_book, user)
@@ -244,7 +247,7 @@ class GoodreadsImporter:
                     pass  # likely just download error
             next_elem = content.xpath("//a[@class='next_page']/@href")
             url_shelf = (
-                ("https://www.goodreads.com" + next_elem[0].strip())
+                f"https://www.goodreads.com{next_elem[0].strip()}"  # type:ignore
                 if next_elem
                 else None
             )


@@ -1,5 +1,5 @@
 [tool.pyright]
-exclude = [ "media", ".venv", ".git", "playground", "**/tests.py", "neodb", "**/migrations", "**/commands", "**/importers", "**/sites" ]
+exclude = [ "media", ".venv", ".git", "playground", "**/tests.py", "neodb", "**/migrations", "**/commands", "**/sites/douban_*" ]

 [tool.djlint]
 ignore="T002,T003,H006,H019,H020,H021,H023,H030,H031"