lint fix site and import

parent bcd35f3526
commit 4649a109bd
16 changed files with 151 additions and 122 deletions

@@ -98,6 +98,10 @@ class AbstractSite:
     def scrape_additional_data(self):
         pass

+    @staticmethod
+    def query_str(content, query: str) -> str:
+        return content.xpath(query)[0].strip()
+
     @classmethod
     def get_model_for_resource(cls, resource):
         model = resource.get_preferred_model()

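Note: the new query_str helper above centralizes the xpath(...)[0].strip() pattern that the scrapers below now call via self.query_str(...). Because it indexes the first match, it raises IndexError when the query matches nothing, which callers that tolerate missing nodes wrap in try/except. A minimal sketch, not part of the diff, assuming the content object is an lxml tree (as the xpath calls in this commit suggest); DemoSite is a hypothetical stand-in for a site class:

# sketch only: exercising the new helper on a tiny document
from lxml import html

class DemoSite:
    @staticmethod
    def query_str(content, query: str) -> str:
        return content.xpath(query)[0].strip()

content = html.fromstring("<div id='name-section'><h3><span><a> Artist </a></span></h3></div>")
print(DemoSite.query_str(content, "//div[@id='name-section']/h3/span/a/text()"))  # prints "Artist"
try:
    DemoSite.query_str(content, "//h2[@class='trackTitle']/text()")
except IndexError:
    # missing nodes surface as IndexError, mirroring the try/except in the Bandcamp hunk below
    print("no match")
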
@@ -57,7 +57,8 @@ class AppleMusic(AbstractSite):
         if content is None:
             raise ParseError(self, f"localized content for {self.url}")
         elem = content.xpath("//script[@id='serialized-server-data']/text()")
-        page_data = json.loads(elem[0])[0]
+        txt: str = elem[0] # type:ignore
+        page_data = json.loads(txt)[0]
         album_data = page_data["data"]["sections"][0]["items"][0]
         title = album_data["title"]
         brief = album_data.get("modalPresentationDescriptor")
@@ -67,11 +68,11 @@ class AppleMusic(AbstractSite):

         track_data = page_data["data"]["seoData"]
         date_elem = track_data.get("musicReleaseDate")
+        release_datetime = dateparser.parse(date_elem.strip()) if date_elem else None
         release_date = (
-            dateparser.parse(date_elem.strip()).strftime("%Y-%m-%d")
-            if date_elem
-            else None
+            release_datetime.strftime("%Y-%m-%d") if release_datetime else None
         )

         track_list = [
             f"{i}. {track['attributes']['name']}"
             for i, track in enumerate(track_data["ogSongs"], 1)
@@ -87,7 +88,10 @@ class AppleMusic(AbstractSite):
             genre[0]
         ] # apple treat "Music" as a genre. Thus, only the first genre is obtained.

-        image_elem = content.xpath("//source[@type='image/jpeg']/@srcset")[0]
+        images = (
+            content.xpath("//source[@type='image/jpeg']/@srcset") if content else []
+        )
+        image_elem: str = images[0] if images else "" # type:ignore
         image_url = image_elem.split(" ")[0] if image_elem else None

         pd = ResourceContent(

@@ -4,6 +4,7 @@ import re
 import urllib.parse

 import dateparser
+import dns.resolver

 from catalog.common import *
 from catalog.models import *
@@ -32,14 +33,14 @@ class Bandcamp(AbstractSite):
         hostname = parsed_url.netloc
         try:
             answers = dns.resolver.query(hostname, "CNAME")
-            for rdata in answers:
+            for rdata in answers: # type:ignore
                 if str(rdata.target) == "dom.bandcamp.com.":
                     return True
         except Exception:
             pass
         try:
             answers = dns.resolver.query(hostname, "A")
-            for rdata in answers:
+            for rdata in answers: # type:ignore
                 if str(rdata.address) == "35.241.62.186":
                     return True
         except Exception:
@@ -48,32 +49,36 @@ class Bandcamp(AbstractSite):
     def scrape(self):
         content = BasicDownloader(self.url).download().html()
         try:
-            title = content.xpath("//h2[@class='trackTitle']/text()")[0].strip()
+            title = self.query_str(content, "//h2[@class='trackTitle']/text()")
             artist = [
-                content.xpath("//div[@id='name-section']/h3/span/a/text()")[0].strip()
+                self.query_str(content, "//div[@id='name-section']/h3/span/a/text()")
             ]
         except IndexError:
             raise ValueError("given url contains no valid info")

         genre = [] # TODO: parse tags
         track_list = ""
-        release_nodes = content.xpath(
-            "//div[@class='tralbumData tralbum-credits']/text()"
-        )
-        release_date = (
-            dateparser.parse(
-                re.sub(r"releas\w+ ", "", release_nodes[0].strip())
-            ).strftime("%Y-%m-%d")
-            if release_nodes
-            else None
-        )
+        try:
+            release_str = re.sub(
+                r"releas\w+ ",
+                "",
+                self.query_str(
+                    content, "//div[@class='tralbumData tralbum-credits']/text()"
+                ),
+            )
+            release_datetime = dateparser.parse(release_str) if release_str else None
+            release_date = (
+                release_datetime.strftime("%Y-%m-%d") if release_datetime else None
+            )
+        except:
+            release_date = None
         duration = None
         company = None
         brief_nodes = content.xpath("//div[@class='tralbumData tralbum-about']/text()")
-        brief = "".join(brief_nodes) if brief_nodes else None
-        cover_url = content.xpath("//div[@id='tralbumArt']/a/@href")[0].strip()
+        brief = "".join(brief_nodes) if brief_nodes else None # type:ignore
+        cover_url = self.query_str(content, "//div[@id='tralbumArt']/a/@href")
         bandcamp_page_data = json.loads(
-            content.xpath("//meta[@name='bc-page-properties']/@content")[0].strip()
+            self.query_str(content, "//meta[@name='bc-page-properties']/@content")
         )
         bandcamp_album_id = bandcamp_page_data["item_id"]

@@ -17,7 +17,7 @@ class Bangumi(AbstractSite):
     DEFAULT_MODEL = None

     @classmethod
-    def id_to_url(self, id_value):
+    def id_to_url(cls, id_value):
         return f"https://bgm.tv/subject/{id_value}"

     def scrape(self):

@@ -29,7 +29,7 @@ class BooksTW(AbstractSite):
         isbn_elem = content.xpath(
             "//div[@class='bd']/ul/li[starts-with(text(),'ISBN:')]/text()"
         )
-        isbn = isbn_elem[0].strip().split(":", 1)[1].strip() if isbn_elem else None
+        isbn = isbn_elem[0].strip().split(":", 1)[1].strip() if isbn_elem else None # type: ignore

         # isbn_elem = content.xpath(
         #     "//div[@class='bd']/ul/li[starts-with(text(),'EISBN')]/text()"
@@ -43,26 +43,26 @@ class BooksTW(AbstractSite):
         orig_title = content.xpath("string(//h1/following-sibling::h2)")

         authors = content.xpath("string(//div/ul/li[contains(text(),'作者:')])")
-        authors = authors.strip().split(":", 1)[1].split(",") if authors else []
+        authors = authors.strip().split(":", 1)[1].split(",") if authors else [] # type: ignore
         if not authors:
             authors = [content.xpath("string(//div/ul/li[contains(.,'作者:')]/a)")]
-        authors = [s.strip() for s in authors]
+        authors = [s.strip() for s in authors] # type: ignore
         # author_orig = content.xpath("string(//div/ul/li[contains(text(),'原文作者:')])")

         translators = content.xpath("string(//div/ul/li[contains(text(),'譯者:')])")
         translators = (
-            translators.strip().split(":", 1)[1].split(",") if translators else []
+            translators.strip().split(":", 1)[1].split(",") if translators else [] # type: ignore
         )
         translators = [s.strip() for s in translators]

         language_elem = content.xpath("//div/ul/li[starts-with(text(),'語言:')]/text()")
         language = (
-            language_elem[0].strip().split(":")[1].strip() if language_elem else None
+            language_elem[0].strip().split(":")[1].strip() if language_elem else None # type: ignore
         )

         pub_house = content.xpath("string(//div/ul/li[contains(text(),'出版社:')])")
         pub_house = (
-            pub_house.strip().split(":", 1)[1].strip().split(" ", 1)[0]
+            pub_house.strip().split(":", 1)[1].strip().split(" ", 1)[0] # type: ignore
             if pub_house
             else None
         )
@@ -70,7 +70,7 @@ class BooksTW(AbstractSite):
         pub_date = content.xpath("string(//div/ul/li[contains(text(),'出版日期:')])")
         pub_date = re.match(
             r"(\d+)/(\d+)/(\d+)\s*$",
-            pub_date.strip().split(":", 1)[1].strip().split(" ", 1)[0]
+            pub_date.strip().split(":", 1)[1].strip().split(" ", 1)[0] # type: ignore
             if pub_date
             else "",
         )
@@ -82,10 +82,10 @@ class BooksTW(AbstractSite):
             pub_month = None

         spec = content.xpath("string(//div/ul/li[contains(text(),'規格:')])")
-        spec = spec.strip().split(":", 1)[1].strip().split("/") if spec else []
+        spec = spec.strip().split(":", 1)[1].strip().split("/") if spec else [] # type: ignore
         if len(spec) > 1:
             binding = spec[0].strip()
-            pages = spec[1].strip().split("頁")
+            pages = str(spec[1].strip()).split("頁")
             pages = int(pages[0]) if len(pages) > 1 else None
             if pages and (pages > 999999 or pages < 1):
                 pages = None
@@ -95,7 +95,7 @@ class BooksTW(AbstractSite):

         price = content.xpath("string(//div/ul/li[contains(text(),'定價:')])")
         price = (
-            price.strip().split(":", 1)[1].split("元")[0].strip() + " NTD"
+            price.strip().split(":", 1)[1].split("元")[0].strip() + " NTD" # type: ignore
             if price
             else None
         )
@@ -111,7 +111,7 @@ class BooksTW(AbstractSite):
         img_url = content.xpath(
             "string(//div[contains(@class,'cover_img')]//img[contains(@class,'cover')]/@src)"
         )
-        img_url = re.sub(r"&[wh]=\d+", "", img_url) if img_url else None
+        img_url = re.sub(r"&[wh]=\d+", "", img_url) if img_url else None # type: ignore

         data = {
             "title": title,

@@ -53,8 +53,7 @@ class Goodreads(AbstractSite):
         h = dl.download().html()
         # Next.JS version of GoodReads
         # JSON.parse(document.getElementById('__NEXT_DATA__').innerHTML)['props']['pageProps']['apolloState']
-        elem = h.xpath('//script[@id="__NEXT_DATA__"]/text()')
-        src = elem[0].strip() if elem else None
+        src = self.query_str(h, '//script[@id="__NEXT_DATA__"]/text()')
         if not src:
             raise ParseError(self, "__NEXT_DATA__ element")
         d = json.loads(src)["props"]["pageProps"]["apolloState"]
@@ -134,16 +133,14 @@ class Goodreads_Work(AbstractSite):

     def scrape(self, response=None):
         content = BasicDownloader(self.url).download().html()
-        title_elem = content.xpath("//h1/a/text()")
-        title = title_elem[0].strip() if title_elem else None
+        title = self.query_str(content, "//h1/a/text()")
         if not title:
             raise ParseError(self, "title")
-        author_elem = content.xpath("//h2/a/text()")
-        author = author_elem[0].strip() if author_elem else None
-        first_published_elem = content.xpath("//h2/span/text()")
-        first_published = (
-            first_published_elem[0].strip() if first_published_elem else None
-        )
+        author = self.query_str(content, "//h2/a/text()")
+        try:
+            first_published = self.query_str(content, "//h2/span/text()")
+        except:
+            first_published = None
         pd = ResourceContent(
             metadata={
                 "title": title,

@@ -20,7 +20,7 @@ class GoogleBooks(AbstractSite):
     DEFAULT_MODEL = Edition

     @classmethod
-    def id_to_url(self, id_value):
+    def id_to_url(cls, id_value):
         return "https://books.google.com/books?id=" + id_value

     def scrape(self):

@@ -61,7 +61,7 @@ class IGDB(AbstractSite):
     if get_mock_mode():
         r = BasicDownloader(key).download().json()
     else:
-        r = json.loads(_wrapper.api_request(p, q))
+        r = json.loads(_wrapper.api_request(p, q)) # type: ignore
         if settings.DOWNLOADER_SAVEDIR:
             with open(
                 settings.DOWNLOADER_SAVEDIR + "/" + get_mock_file(key),

@@ -57,11 +57,16 @@ class IMDB(AbstractSite):
             season_number = res_data["tv_episode_results"][0]["season_number"]
             episode_number = res_data["tv_episode_results"][0]["episode_number"]
             url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}/episode/{episode_number}"
+        pd = None
         if url:
             tmdb = SiteManager.get_site_by_url(url)
-            pd = tmdb.scrape()
-            pd.metadata["preferred_model"] = tmdb.DEFAULT_MODEL.__name__
-            pd.metadata["required_resources"] = [] # do not auto fetch parent season
+            if tmdb:
+                pd = tmdb.scrape()
+                pd.metadata["preferred_model"] = (
+                    tmdb.DEFAULT_MODEL.__name__ if tmdb.DEFAULT_MODEL else None
+                )
+                # do not auto fetch parent season
+                pd.metadata["required_resources"] = []
         if not pd:
             # if IMDB id not found in TMDB, use real IMDB scraper
             pd = self.scrape_imdb()
@@ -69,8 +74,7 @@ class IMDB(AbstractSite):

     def scrape_imdb(self):
         h = BasicDownloader(self.url).download().html()
-        elem = h.xpath('//script[@id="__NEXT_DATA__"]/text()')
-        src = elem[0].strip() if elem else None
+        src = self.query_str(h, '//script[@id="__NEXT_DATA__"]/text()')
         if not src:
             raise ParseError(self, "__NEXT_DATA__ element")
         d = json.loads(src)["props"]["pageProps"]["aboveTheFoldData"]
@@ -120,15 +124,14 @@ class IMDB(AbstractSite):
     def get_episode_list(show_id, season_id):
         url = f"https://m.imdb.com/title/{show_id}/"
         h = BasicDownloader(url).download().html()
-        show_url = "".join(
-            h.xpath('//a[@data-testid="hero-title-block__series-link"]/@href')
-        ).split("?")[0]
+        u: str = h.xpath('//a[@data-testid="hero-title-block__series-link"]/@href') # type: ignore
+        show_url = "".join(u).split("?")[0]
         if not show_url:
             show_url = f"/title/{show_id}/"
         url = f"https://m.imdb.com{show_url}episodes/?season={season_id}"
         h = BasicDownloader(url).download().html()
         episodes = []
-        for e in h.xpath('//div[@id="eplist"]/div/a'):
+        for e in h.xpath('//div[@id="eplist"]/div/a'): # type: ignore
             episode_number = e.xpath(
                 './span[contains(@class,"episode-list__title")]/text()'
             )[0].strip()
@@ -166,9 +169,12 @@ class IMDB(AbstractSite):
                ).first()
                if not episode:
                    site = SiteManager.get_site_by_url(e["url"])
-                    episode = site.get_resource_ready().item
-                    episode.set_parent_item(season)
-                    episode.save()
+                    if site:
+                        res = site.get_resource_ready()
+                        if res and res.item:
+                            episode = res.item
+                            episode.set_parent_item(season)
+                            episode.save()
         else:
             _logger.warning(f"season {season} has no episodes fetched, creating dummy")
             cnt = int(season.episode_count or 0)

@@ -68,8 +68,8 @@ class Spotify(AbstractSite):
             else:
                 track_list.append(str(track["track_number"]) + ". " + track["name"])
         track_list = "\n".join(track_list)
-        release_date = dateparser.parse(res_data["release_date"]).strftime("%Y-%m-%d")
+        dt = dateparser.parse(res_data["release_date"])
+        release_date = dt.strftime("%Y-%m-%d") if dt else None

         gtin = None
         if res_data["external_ids"].get("upc"):

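Note: the Spotify hunk above applies the same guard this commit adds to the Apple Music, Bandcamp and Steam scrapers: dateparser.parse() can return None when it cannot parse the input, so the result is bound to a name and checked before strftime is called. A minimal sketch, not part of the diff; raw_date is a placeholder value:

import dateparser

# sketch only: guard the possibly-None parse result before formatting
raw_date = "2023-01-31"
dt = dateparser.parse(raw_date)
release_date = dt.strftime("%Y-%m-%d") if dt else None
print(release_date)
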
@@ -31,30 +31,29 @@ class Steam(AbstractSite):
         headers["Cookie"] = "wants_mature_content=1; birthtime=754700401;"
         content = BasicDownloader(self.url, headers=headers).download().html()

-        title = content.xpath("//div[@class='apphub_AppName']/text()")[0]
+        title = self.query_str(content, "//div[@class='apphub_AppName']/text()")
         developer = content.xpath("//div[@id='developers_list']/a/text()")
         publisher = content.xpath(
             "//div[@class='glance_ctn']//div[@class='dev_row'][2]//a/text()"
         )
-        dt = content.xpath("//div[@class='release_date']/div[@class='date']/text()")
-        release_date = (
-            dateparser.parse(dt[0].replace(" ", "")).strftime("%Y-%m-%d")
-            if dt
-            else None
+        dts = self.query_str(
+            content, "//div[@class='release_date']/div[@class='date']/text()"
         )
+        dt = dateparser.parse(dts.replace(" ", "")) if dts else None
+        release_date = dt.strftime("%Y-%m-%d") if dt else None

         genre = content.xpath(
             "//div[@class='details_block']/b[2]/following-sibling::a/text()"
         )
         platform = ["PC"]
-        brief = content.xpath("//div[@class='game_description_snippet']/text()")[
-            0
-        ].strip()
+        brief = self.query_str(
+            content, "//div[@class='game_description_snippet']/text()"
+        )
         # try Steam images if no image from IGDB
         if pd.cover_image is None:
-            pd.metadata["cover_image_url"] = content.xpath(
-                "//img[@class='game_header_image_full']/@src"
-            )[0].replace("header.jpg", "library_600x900.jpg")
+            pd.metadata["cover_image_url"] = self.query_str(
+                content, "//img[@class='game_header_image_full']/@src"
+            ).replace("header.jpg", "library_600x900.jpg")
             (
                 pd.cover_image,
                 pd.cover_image_extention,
@@ -62,9 +61,9 @@ class Steam(AbstractSite):
                 pd.metadata["cover_image_url"], self.url
             )
         if pd.cover_image is None:
-            pd.metadata["cover_image_url"] = content.xpath(
-                "//img[@class='game_header_image_full']/@src"
-            )[0]
+            pd.metadata["cover_image_url"] = self.query_str(
+                content, "//img[@class='game_header_image_full']/@src"
+            )
             (
                 pd.cover_image,
                 pd.cover_image_extention,

@@ -44,7 +44,7 @@ class TMDB_Movie(AbstractSite):
     DEFAULT_MODEL = Movie

     @classmethod
-    def id_to_url(self, id_value):
+    def id_to_url(cls, id_value):
         return f"https://www.themoviedb.org/movie/{id_value}"

     def scrape(self):
@@ -178,7 +178,7 @@ class TMDB_TV(AbstractSite):
     DEFAULT_MODEL = TVShow

     @classmethod
-    def id_to_url(self, id_value):
+    def id_to_url(cls, id_value):
         return f"https://www.themoviedb.org/tv/{id_value}"

     def scrape(self):
@@ -338,6 +338,8 @@ class TMDB_TVSeason(AbstractSite):
         return f"https://www.themoviedb.org/tv/{v[0]}/season/{v[1]}"

     def scrape(self):
+        if not self.id_value:
+            raise ParseError(self, "id_value")
         v = self.id_value.split("-")
         show_id = v[0]
         season_id = v[1]
@@ -346,7 +348,7 @@ class TMDB_TVSeason(AbstractSite):
         api_url = f"https://api.themoviedb.org/3/tv/{show_id}/season/{season_id}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
         d = BasicDownloader(api_url).download().json()
         if not d.get("id"):
-            raise ParseError("id")
+            raise ParseError(self, "id")
         pd = ResourceContent(
             metadata=_copy_dict(
                 d,
@@ -401,12 +403,14 @@ class TMDB_TVSeason(AbstractSite):
             # this should not happen
             _logger.warning("Unexpected IMDB id for TMDB tv season")
         elif pd.metadata.get("season_number") == 1:
-            res = SiteManager.get_site_by_url(
+            site = SiteManager.get_site_by_url(
                 f"https://www.themoviedb.org/tv/{show_id}"
-            ).get_resource_ready()
-            pd.lookup_ids[IdType.IMDB] = (
-                res.other_lookup_ids.get(IdType.IMDB) if res else None
             )
+            if site:
+                res = site.get_resource_ready()
+                pd.lookup_ids[IdType.IMDB] = (
+                    res.other_lookup_ids.get(IdType.IMDB) if res else None
+                )
         elif len(pd.metadata["episode_number_list"]) == 0:
             _logger.warning(
                 "Unable to lookup IMDB id for TMDB tv season with zero episodes"
@@ -416,7 +420,7 @@ class TMDB_TVSeason(AbstractSite):
             api_url2 = f"https://api.themoviedb.org/3/tv/{v[0]}/season/{v[1]}/episode/{ep}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
             d2 = BasicDownloader(api_url2).download().json()
             if not d2.get("id"):
-                raise ParseError("first episode id for season")
+                raise ParseError(self, "first episode id for season")
             pd.lookup_ids[IdType.IMDB] = d2["external_ids"].get("imdb_id")
             return pd

@@ -445,6 +449,8 @@ class TMDB_TVEpisode(AbstractSite):
         return f"https://www.themoviedb.org/tv/{v[0]}/season/{v[1]}/episode/{v[2]}"

     def scrape(self):
+        if not self.id_value:
+            raise ParseError(self, "id_value")
         v = self.id_value.split("-")
         show_id = v[0]
         season_id = v[1]
@@ -454,7 +460,7 @@ class TMDB_TVEpisode(AbstractSite):
         api_url = f"https://api.themoviedb.org/3/tv/{show_id}/season/{season_id}/episode/{episode_id}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
         d = BasicDownloader(api_url).download().json()
         if not d.get("id"):
-            raise ParseError("id")
+            raise ParseError(self, "id")
         pd = ResourceContent(
             metadata=_copy_dict(
                 d,

@@ -60,7 +60,11 @@
                 <td>{{ value.1|default:"-" }}</td>
             </tr>
         {% empty %}
-            <p>No data.</p>
+            <tr>
+                <td>-</td>
+                <td></td>
+                <td></td>
+            </tr>
         {% endfor %}
     {% empty %}
         <p>No history for this item has been logged yet.</p>

@@ -8,6 +8,7 @@ import openpyxl
 import pytz
 from auditlog.context import set_actor
 from django.conf import settings
+from loguru import logger
 from markdownify import markdownify as md
 from user_messages import api as msg

@@ -18,28 +19,27 @@ from catalog.sites.douban import DoubanDownloader
 from common.utils import GenerateDateUUIDMediaFilePath
 from journal.models import *

-_logger = logging.getLogger(__name__)
 _tz_sh = pytz.timezone("Asia/Shanghai")


 def _fetch_remote_image(url):
     try:
-        print(f"fetching remote image {url}")
+        logger.info(f"fetching remote image {url}")
         imgdl = ProxiedImageDownloader(url)
         raw_img = imgdl.download().content
         ext = imgdl.extention
         f = GenerateDateUUIDMediaFilePath(
-            None, "x." + ext, settings.MARKDOWNX_MEDIA_PATH
+            None, f"x.{ext}", settings.MARKDOWNX_MEDIA_PATH
         )
         file = settings.MEDIA_ROOT + f
         local_url = settings.MEDIA_URL + f
         os.makedirs(os.path.dirname(file), exist_ok=True)
         with open(file, "wb") as binary_file:
             binary_file.write(raw_img)
-        # print(f'remote image saved as {local_url}')
+        # logger.info(f'remote image saved as {local_url}')
         return local_url
     except Exception:
-        print(f"unable to fetch remote image {url}")
+        logger.error(f"unable to fetch remote image {url}")
         return url


@@ -49,10 +49,9 @@ class DoubanImporter:
     skipped = 0
     imported = 0
     failed = []
-    user = None
     visibility = 0
     mode = 0
-    file = None
+    file = ""

     def __init__(self, user, visibility, mode):
         self.user = user
@@ -149,7 +148,7 @@ class DoubanImporter:
         for name in config:
             data[name] = []
             if name in wb:
-                print(f"{self.user} parsing {name}")
+                logger.info(f"{self.user} parsing {name}")
                 for row in wb[name].iter_rows(min_row=2, values_only=True):
                     cells = [cell for cell in row]
                     if len(cells) > 6 and cells[0]:
@@ -189,12 +188,12 @@ class DoubanImporter:
         # return cells[3]

     def import_from_file_task(self):
-        print(f"{self.user} import start")
+        logger.info(f"{self.user} import start")
         msg.info(self.user, f"开始导入豆瓣标记和评论")
         self.update_user_import_status(1)
         with set_actor(self.user):
             self.load_sheets()
-            print(f"{self.user} sheet loaded, {self.total} lines total")
+            logger.info(f"{self.user} sheet loaded, {self.total} lines total")
             self.update_user_import_status(1)
             for name, param in self.mark_sheet_config.items():
                 self.import_mark_sheet(self.mark_data[name], param[0], name)
@@ -211,7 +210,7 @@ class DoubanImporter:
     def import_mark_sheet(self, worksheet, shelf_type, sheet_name):
         prefix = f"{self.user} {sheet_name}|"
         if worksheet is None: # or worksheet.max_row < 2:
-            print(f"{prefix} empty sheet")
+            logger.warning(f"{prefix} empty sheet")
             return
         for cells in worksheet:
             if len(cells) < 6:
@@ -244,7 +243,7 @@ class DoubanImporter:
         """
         item = self.get_item_by_url(url)
         if not item:
-            print(f"{self.user} | match/fetch {url} failed")
+            logger.warning(f"{self.user} | match/fetch {url} failed")
             return
         mark = Mark(self.user, item)
         if self.mode == 0 and (
@@ -268,7 +267,7 @@ class DoubanImporter:
     def import_review_sheet(self, worksheet, sheet_name):
         prefix = f"{self.user} {sheet_name}|"
         if worksheet is None: # or worksheet.max_row < 2:
-            print(f"{prefix} empty sheet")
+            logger.warning(f"{prefix} empty sheet")
             return
         for cells in worksheet:
             if len(cells) < 6:
@@ -307,17 +306,18 @@ class DoubanImporter:
         item = None
         try:
             site = SiteManager.get_site_by_url(url)
+            if not site:
+                raise ValueError(f"URL unrecognized {url}")
             item = site.get_item()
             if not item:
-                print(f"fetching {url}")
+                logger.info(f"fetching {url}")
                 site.get_resource_ready()
                 item = site.get_item()
             else:
-                # print(f"matched {url}")
+                # logger.info(f"matched {url}")
                 print(".", end="", flush=True)
         except Exception as e:
-            print(f"fetching exception: {url} {e}")
-            _logger.error(f"scrape failed: {url}", exc_info=e)
+            logger.error(f"fetching exception: {url} {e}")
         if item is None:
             self.failed.append(url)
         return item
@@ -329,23 +329,24 @@ class DoubanImporter:
         prefix = f"{self.user} |"
         url = self.guess_entity_url(entity_title, rating, time)
         if url is None:
-            print(f"{prefix} fetching review {review_url}")
+            logger.info(f"{prefix} fetching review {review_url}")
             try:
                 h = DoubanDownloader(review_url).download().html()
-                for u in h.xpath("//header[@class='main-hd']/a/@href"):
+                urls = h.xpath("//header[@class='main-hd']/a/@href")
+                for u in urls: # type:ignore
                     if ".douban.com/subject/" in u:
                         url = u
                 if not url:
-                    print(
+                    logger.warning(
                         f"{prefix} fetching error {review_url} unable to locate entity url"
                     )
                     return
             except Exception:
-                print(f"{prefix} fetching review exception {review_url}")
+                logger.error(f"{prefix} fetching review exception {review_url}")
                 return
         item = self.get_item_by_url(url)
         if not item:
-            print(f"{prefix} match/fetch {url} failed")
+            logger.warning(f"{prefix} match/fetch {url} failed")
             return
         if (
             self.mode == 1

@@ -3,6 +3,7 @@ from datetime import datetime

 import django_rq
 from auditlog.context import set_actor
+from django.utils import timezone
 from django.utils.timezone import make_aware
 from user_messages import api as msg

@@ -45,12 +46,12 @@ class GoodreadsImporter:
         total = 0
         visibility = user.preference.default_visibility
         with set_actor(user):
-            if match_list or match_shelf:
-                shelf = (
-                    cls.parse_shelf(match_shelf[0], user)
-                    if match_shelf
-                    else cls.parse_list(match_list[0], user)
-                )
+            shelf = None
+            if match_shelf:
+                shelf = cls.parse_shelf(match_shelf[0], user)
+            elif match_list:
+                shelf = cls.parse_list(match_list[0], user)
+            if shelf:
                 if shelf["title"] and shelf["books"]:
                     collection = Collection.objects.create(
                         title=shelf["title"],
@@ -119,7 +120,7 @@ class GoodreadsImporter:
     @classmethod
     def parse_shelf(cls, url, user):
         # return {'title': 'abc', books: [{'book': obj, 'rating': 10, 'review': 'txt'}, ...]}
-        title = None
+        title = ""
         books = []
         url_shelf = url + "&view=table"
         while url_shelf:
@@ -205,7 +206,7 @@ class GoodreadsImporter:
                 pass # likely just download error
             next_elem = content.xpath("//a[@class='next_page']/@href")
             url_shelf = (
-                f"https://www.goodreads.com{next_elem[0].strip()}"
+                f"https://www.goodreads.com{next_elem[0].strip()}" # type:ignore
                 if next_elem
                 else None
             )
@@ -214,8 +215,8 @@ class GoodreadsImporter:
     @classmethod
     def parse_list(cls, url, user):
         # return {'title': 'abc', books: [{'book': obj, 'rating': 10, 'review': 'txt'}, ...]}
-        title = None
-        description = None
+        title = ""
+        description = ""
         books = []
         url_shelf = url
         while url_shelf:
@@ -225,10 +226,12 @@ class GoodreadsImporter:
             if not title_elem:
                 print(f"List parsing error {url_shelf}")
                 break
-            title = title_elem[0].strip()
-            description = content.xpath('//div[@class="mediumText"]/text()')[0].strip()
+            title: str = title_elem[0].strip() # type:ignore
+            desc_elem = content.xpath('//div[@class="mediumText"]/text()')
+            description: str = desc_elem[0].strip() # type:ignore
             print("List title: " + title)
-            for link in content.xpath('//a[@class="bookTitle"]/@href'):
+            links = content.xpath('//a[@class="bookTitle"]/@href')
+            for link in links: # type:ignore
                 url_book = "https://www.goodreads.com" + link
                 try:
                     book = cls.get_book(url_book, user)
@@ -244,7 +247,7 @@ class GoodreadsImporter:
                 pass # likely just download error
             next_elem = content.xpath("//a[@class='next_page']/@href")
             url_shelf = (
-                ("https://www.goodreads.com" + next_elem[0].strip())
+                f"https://www.goodreads.com{next_elem[0].strip()}" # type:ignore
                 if next_elem
                 else None
            )

@@ -1,5 +1,5 @@
 [tool.pyright]
-exclude = [ "media", ".venv", ".git", "playground", "**/tests.py", "neodb", "**/migrations", "**/commands", "**/importers", "**/sites" ]
+exclude = [ "media", ".venv", ".git", "playground", "**/tests.py", "neodb", "**/migrations", "**/commands", "**/sites/douban_*" ]

 [tool.djlint]
 ignore="T002,T003,H006,H019,H020,H021,H023,H030,H031"