fix some lint issues

Route raw content.xpath() calls through the AbstractSite.query_list/query_str helpers, relax query_list's return annotation to plain list, extract a shared DoubanSearcher and add search() classmethods to the Douban site classes, replace deprecated _logger.warn calls with loguru's logger.warning, and drop **/sites/douban_* from the pyright exclude list so these modules are type-checked again.

Your Name 2025-01-04 11:23:07 -05:00 committed by Henri Dickson
parent ea4f52dfa6
commit 86b1ee19e0
9 changed files with 288 additions and 193 deletions

View file

@@ -104,7 +104,7 @@ class AbstractSite:
return content.xpath(query)[0].strip()
@staticmethod
def query_list(content, query: str) -> list[str]:
def query_list(content, query: str) -> list:
return list(content.xpath(query))
@classmethod
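The annotation change above is the first lint fix: lxml's xpath() can return text nodes, attribute strings, or element objects depending on the query, so list[str] was too narrow and the return type is relaxed to plain list. A minimal sketch of the two helpers in isolation (the class name, sample document, and query here are illustrative, not the real AbstractSite):

# Sketch of the query helpers, assuming lxml documents.
from lxml import html

class SiteSketch:
    @staticmethod
    def query_str(content, query: str) -> str:
        # first match, stripped; raises IndexError if nothing matches
        return content.xpath(query)[0].strip()

    @staticmethod
    def query_list(content, query: str) -> list:
        # may hold strings (text()/@attr) or elements, hence plain list
        return list(content.xpath(query))

doc = html.fromstring("<div id='info'><span>ISBN:</span> 9787536692930</div>")
isbn_elem = SiteSketch.query_list(doc, "//span[text()='ISBN:']/following::text()")
print(isbn_elem[0].strip() if isbn_elem else None)  # 9787536692930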

View file

@@ -79,6 +79,9 @@ class ExternalSearchResultItem:
self.display_description = brief
self.cover_image_url = cover_url
def __repr__(self):
return f"[{self.category}] {self.display_title} {self.url}"
@property
def verbose_category_name(self):
return self.category.label if self.category else ""

View file

@@ -1,6 +1,8 @@
import json
import re
from catalog.common import *
from catalog.search.models import ExternalSearchResultItem
RE_NUMBERS = re.compile(r"\d+\d*")
RE_WHITESPACES = re.compile(r"\s+")
@@ -30,3 +32,35 @@ class DoubanDownloader(ProxiedDownloader):
return RESPONSE_OK
else:
return RESPONSE_INVALID_CONTENT
class DoubanSearcher:
@classmethod
def search(cls, cat: ItemCategory, c: str, q: str, p: int = 1):
url = f"https://search.douban.com/{c}/subject_search?search_text={q}&start={15*(p-1)}"
content = DoubanDownloader(url).download().html()
j = json.loads(
content.xpath(
"//script[text()[contains(.,'window.__DATA__')]]/text()"
)[ # type:ignore
0
]
.split("window.__DATA__ = ")[1] # type:ignore
.split("};")[0] # type:ignore
+ "}"
)
results = [
ExternalSearchResultItem(
cat,
SiteName.Douban,
item["url"],
item["title"],
item["abstract"],
item["abstract_2"],
item["cover_url"],
)
for item in j["items"]
for item in j["items"]
if item.get("tpl_name") == "search_subject"
]
return results
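The Douban site classes below delegate their new search classmethods to this shared searcher. A hypothetical call (it performs a live request to search.douban.com; ItemCategory comes from catalog.common, and the query string is illustrative):

results = DoubanSearcher.search(ItemCategory.Book, "book", "三体", p=1)
for r in results:
    print(r)  # uses the __repr__ added to ExternalSearchResultItem above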

View file

@@ -3,7 +3,7 @@ from catalog.book.utils import *
from catalog.common import *
from common.models.lang import detect_language
from .douban import *
from .douban import RE_NUMBERS, RE_WHITESPACES, DoubanDownloader, DoubanSearcher
@SiteManager.register
@@ -23,46 +23,51 @@ class DoubanBook(AbstractSite):
def id_to_url(cls, id_value):
return "https://book.douban.com/subject/" + id_value + "/"
@classmethod
def search(cls, q: str, p: int = 1):
return DoubanSearcher.search(ItemCategory.Book, "book", q, p)
def scrape(self):
content = DoubanDownloader(self.url).download().html()
isbn_elem = content.xpath(
"//div[@id='info']//span[text()='ISBN:']/following::text()"
isbn_elem = self.query_list(
content, "//div[@id='info']//span[text()='ISBN:']/following::text()"
)
isbn = isbn_elem[0].strip() if isbn_elem else None
title_elem = content.xpath("/html/body//h1/span/text()")
title_elem = self.query_list(content, "/html/body//h1/span/text()")
title = (
title_elem[0].strip() if title_elem else f"Unknown Title {self.id_value}"
)
subtitle_elem = content.xpath(
"//div[@id='info']//span[text()='副标题:']/following::text()"
subtitle_elem = self.query_list(
content, "//div[@id='info']//span[text()='副标题:']/following::text()"
)
subtitle = subtitle_elem[0].strip()[:500] if subtitle_elem else None
orig_title_elem = content.xpath(
"//div[@id='info']//span[text()='原作名:']/following::text()"
orig_title_elem = self.query_list(
content, "//div[@id='info']//span[text()='原作名:']/following::text()"
)
orig_title = orig_title_elem[0].strip()[:500] if orig_title_elem else None
language_elem = content.xpath(
"//div[@id='info']//span[text()='语言:']/following::text()"
language_elem = self.query_list(
content, "//div[@id='info']//span[text()='语言:']/following::text()"
)
language = [language_elem[0].strip()] if language_elem else []
pub_house_elem = content.xpath(
"//div[@id='info']//span[text()='出版社:']/following::text()"
pub_house_elem = self.query_list(
content, "//div[@id='info']//span[text()='出版社:']/following::text()"
)
pub_house = pub_house_elem[0].strip() if pub_house_elem else None
if not pub_house:
pub_house_elem = content.xpath(
"//div[@id='info']//span[text()='出版社:']/following-sibling::a/text()"
pub_house_elem = self.query_list(
content,
"//div[@id='info']//span[text()='出版社:']/following-sibling::a/text()",
)
pub_house = pub_house_elem[0].strip() if pub_house_elem else None
pub_date_elem = content.xpath(
"//div[@id='info']//span[text()='出版年:']/following::text()"
pub_date_elem = self.query_list(
content, "//div[@id='info']//span[text()='出版年:']/following::text()"
)
pub_date = pub_date_elem[0].strip() if pub_date_elem else ""
year_month_day = RE_NUMBERS.findall(pub_date)
@@ -88,18 +93,18 @@ class DoubanBook(AbstractSite):
else pub_month
)
binding_elem = content.xpath(
"//div[@id='info']//span[text()='装帧:']/following::text()"
binding_elem = self.query_list(
content, "//div[@id='info']//span[text()='装帧:']/following::text()"
)
binding = binding_elem[0].strip() if binding_elem else None
price_elem = content.xpath(
"//div[@id='info']//span[text()='定价:']/following::text()"
price_elem = self.query_list(
content, "//div[@id='info']//span[text()='定价:']/following::text()"
)
price = price_elem[0].strip() if price_elem else None
pages_elem = content.xpath(
"//div[@id='info']//span[text()='页数:']/following::text()"
pages_elem = self.query_list(
content, "//div[@id='info']//span[text()='页数:']/following::text()"
)
pages = pages_elem[0].strip() if pages_elem else None
if pages is not None:
@@ -109,15 +114,16 @@ class DoubanBook(AbstractSite):
if pages and (pages > 999999 or pages < 1):
pages = None
brief_elem = content.xpath(
"//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()"
brief_elem = self.query_list(
content,
"//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()",
)
brief = "\n".join(p.strip() for p in brief_elem) if brief_elem else None
contents = None
try:
contents_elem = content.xpath(
"//h2/span[text()='目录']/../following-sibling::div[1]"
contents_elem = self.query_list(
content, "//h2/span[text()='目录']/../following-sibling::div[1]"
)[0]
# if the id of the next sibling contains `dir`, that would be the full contents
if "dir" in contents_elem.getnext().xpath("@id")[0]:
@@ -129,24 +135,28 @@ class DoubanBook(AbstractSite):
)
else:
contents = (
"\n".join(p.strip() for p in contents_elem.xpath("text()"))
"\n".join(
p.strip() for p in self.query_list(contents_elem, "text()")
)
if contents_elem is not None
else None
)
except Exception:
pass
img_url_elem = content.xpath("//*[@id='mainpic']/a/img/@src")
img_url_elem = self.query_list(content, "//*[@id='mainpic']/a/img/@src")
img_url = img_url_elem[0].strip() if img_url_elem else None
# there are two html formats for authors and translators
authors_elem = content.xpath(
authors_elem = self.query_list(
content,
"""//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/
preceding-sibling::a[preceding-sibling::span[text()='作者:']]/text()"""
preceding-sibling::a[preceding-sibling::span[text()='作者:']]/text()""",
)
if not authors_elem:
authors_elem = content.xpath(
"""//div[@id='info']//span[text()=' 作者']/following-sibling::a/text()"""
authors_elem = self.query_list(
content,
"""//div[@id='info']//span[text()=' 作者']/following-sibling::a/text()""",
)
if authors_elem:
authors = []
@@ -155,13 +165,15 @@ class DoubanBook(AbstractSite):
else:
authors = None
translators_elem = content.xpath(
translators_elem = self.query_list(
content,
"""//div[@id='info']//span[text()='译者:']/following-sibling::br[1]/
preceding-sibling::a[preceding-sibling::span[text()='译者:']]/text()"""
preceding-sibling::a[preceding-sibling::span[text()='译者:']]/text()""",
)
if not translators_elem:
translators_elem = content.xpath(
"""//div[@id='info']//span[text()=' 译者']/following-sibling::a/text()"""
translators_elem = self.query_list(
content,
"""//div[@id='info']//span[text()=' 译者']/following-sibling::a/text()""",
)
if translators_elem:
translators = []
@@ -170,18 +182,20 @@ class DoubanBook(AbstractSite):
else:
translators = None
cncode_elem = content.xpath(
"//div[@id='info']//span[text()='统一书号:']/following::text()"
cncode_elem = self.query_list(
content, "//div[@id='info']//span[text()='统一书号:']/following::text()"
)
cubn = cncode_elem[0].strip() if cncode_elem else None
series_elem = content.xpath(
"//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()"
series_elem = self.query_list(
content,
"//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()",
)
series = series_elem[0].strip() if series_elem else None
imprint_elem = content.xpath(
"//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()"
imprint_elem = self.query_list(
content,
"//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()",
)
imprint = imprint_elem[0].strip() if imprint_elem else None
@@ -212,8 +226,9 @@ class DoubanBook(AbstractSite):
"cover_image_url": img_url,
}
works_element = content.xpath(
'//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href'
works_element = self.query_list(
content,
'//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href',
)
if works_element:
r = re.match(r"\w+://book.douban.com/works/(\d+)", works_element[0])
@@ -234,7 +249,7 @@ class DoubanBook(AbstractSite):
]
pd = ResourceContent(metadata=data)
t, n = detect_isbn_asin(isbn)
t, n = detect_isbn_asin(isbn or "")
if t:
pd.lookup_ids[t] = n
pd.lookup_ids[IdType.CUBN] = cubn
@@ -255,11 +270,11 @@ class DoubanBook_Work(AbstractSite):
def scrape(self):
content = DoubanDownloader(self.url).download().html()
title_elem = content.xpath("//h1/text()")
title_elem = self.query_list(content, "//h1/text()")
title = title_elem[0].split("全部版本(")[0].strip() if title_elem else None
if not title:
raise ParseError(self, "title")
book_urls = content.xpath('//a[@class="pl2"]/@href')
book_urls = self.query_list(content, '//a[@class="pl2"]/@href')
related_resources = []
for url in book_urls:
site = SiteManager.get_site_by_url(url)
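One more typing fix earlier in this file: isbn is str | None at the detect_isbn_asin call, so the commit passes isbn or "" to keep the argument a plain str for the type checker. The same narrowing idiom in isolation (the classify function below is an illustrative stand-in, not the real detect_isbn_asin):

from typing import Optional

def classify(code: str) -> Optional[str]:
    # illustrative stand-in: pretend 13-digit strings are ISBNs
    return "isbn13" if len(code) == 13 and code.isdigit() else None

isbn: Optional[str] = None
kind = classify(isbn or "")  # "" when isbn is None; argument stays a str
print(kind)  # None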

View file

@@ -7,7 +7,7 @@ from catalog.common import *
from catalog.models import *
from common.models.lang import detect_language
from .douban import DoubanDownloader
from .douban import DoubanDownloader, DoubanSearcher
def _cache_key(url):
@@ -45,6 +45,8 @@ class DoubanDramaVersion(AbstractSite):
return f"https://www.douban.com/location/drama/{ids[0]}/#{ids[1]}"
def scrape(self):
if not self.id_value or not self.url:
raise ParseError(self, "id_value or url")
show_url = self.url.split("#")[0]
show_id = self.id_value.split("-")[0]
version_id = self.id_value.split("-")[1]
@@ -59,20 +61,20 @@ class DoubanDramaVersion(AbstractSite):
p = "//div[@id='" + version_id + "']"
q = p + "//dt[text()='{}']/following-sibling::dd[1]/a/span/text()"
q2 = p + "//dt[text()='{}']/following-sibling::dd[1]/text()"
title = " ".join(h.xpath(p + "//h3/text()")).strip()
title = " ".join(self.query_list(h, p + "//h3/text()")).strip()
if not title:
raise ParseError(self, "title")
data = {
"title": title,
"localized_title": [{"lang": "zh-cn", "text": title}],
"director": [x.strip() for x in h.xpath(q.format("导演"))],
"playwright": [x.strip() for x in h.xpath(q.format("编剧"))],
# "actor": [x.strip() for x in h.xpath(q.format("主演"))],
"composer": [x.strip() for x in h.xpath(q.format("作曲"))],
"language": [x.strip() for x in h.xpath(q2.format("语言"))],
"opening_date": " ".join(h.xpath(q2.format("演出日期"))).strip(),
"troupe": [x.strip() for x in h.xpath(q.format("演出团体"))],
"location": [x.strip() for x in h.xpath(q.format("演出剧院"))],
"director": [x.strip() for x in self.query_list(h, q.format("导演"))],
"playwright": [x.strip() for x in self.query_list(h, q.format("编剧"))],
# "actor": [x.strip() for x in self.query_list(h, q.format("主演"))],
"composer": [x.strip() for x in self.query_list(h, q.format("作曲"))],
"language": [x.strip() for x in self.query_list(h, q2.format("语言"))],
"opening_date": " ".join(self.query_list(h, q2.format("演出日期"))).strip(),
"troupe": [x.strip() for x in self.query_list(h, q.format("演出团体"))],
"location": [x.strip() for x in self.query_list(h, q.format("演出剧院"))],
}
if data["opening_date"]:
d = data["opening_date"].split("-")
@@ -80,7 +82,9 @@ class DoubanDramaVersion(AbstractSite):
if dl > 3:
data["opening_date"] = "-".join(d[:3])
data["closing_date"] = "-".join(d[0 : 6 - dl] + d[3:dl])
actor_elem = h.xpath(p + "//dt[text()='主演:']/following-sibling::dd[1]/a")
actor_elem = self.query_list(
h, p + "//dt[text()='主演:']/following-sibling::dd[1]/a"
)
data["actor"] = []
for e in actor_elem:
n = "".join(e.xpath("span/text()")).strip()
@@ -88,7 +92,7 @@ class DoubanDramaVersion(AbstractSite):
t = re.sub(r"^[\s\(饰]*(.+)\)[\s\/]*$", r"\1", t).strip()
t = t if t != "/" else ""
data["actor"].append({"name": n, "role": t})
img_url_elem = h.xpath("//img[@itemprop='image']/@src")
img_url_elem = self.query_list(h, "//img[@itemprop='image']/@src")
data["cover_image_url"] = img_url_elem[0].strip() if img_url_elem else None
pd = ResourceContent(metadata=data)
pd.metadata["required_resources"] = [
@@ -128,78 +132,87 @@ class DoubanDrama(AbstractSite):
h = html.fromstring(r)
data = {}
title_elem = h.xpath("/html/body//h1/span/text()")
title_elem = self.query_list(h, "/html/body//h1/span/text()")
if title_elem:
data["title"] = title_elem[0].strip()
data["orig_title"] = title_elem[1] if len(title_elem) > 1 else None
else:
raise ParseError(self, "title")
other_title_elem = h.xpath(
"//dl//dt[text()='又名:']/following::dd[@itemprop='name']/text()"
other_title_elem = self.query_list(
h, "//dl//dt[text()='又名:']/following::dd[@itemprop='name']/text()"
)
data["other_title"] = other_title_elem
plot_elem = h.xpath("//div[@class='pure-text']/div[@class='full']/text()")
if len(plot_elem) == 0:
plot_elem = h.xpath(
"//div[@class='pure-text']/div[@class='abstract']/text()"
plot_elem = self.query_list(
h, "//div[@class='pure-text']/div[@class='full']/text()"
)
if len(plot_elem) == 0:
plot_elem = h.xpath("//div[@class='pure-text']/text()")
plot_elem = self.query_list(
h, "//div[@class='pure-text']/div[@class='abstract']/text()"
)
if len(plot_elem) == 0:
plot_elem = self.query_list(h, "//div[@class='pure-text']/text()")
data["brief"] = "\n".join(plot_elem)
data["genre"] = [
s.strip()
for s in h.xpath(
"//div[@class='meta']//dl//dt[text()='类型:']/following-sibling::dd[@itemprop='genre']/text()"
for s in self.query_list(
h,
"//div[@class='meta']//dl//dt[text()='类型:']/following-sibling::dd[@itemprop='genre']/text()",
)
]
# data["version"] = [
# s.strip()
# for s in h.xpath(
# for s in self.query_list(h,
# "//dl//dt[text()='版本:']/following-sibling::dd[@class='titles']/a//text()"
# )
# ]
data["director"] = [
s.strip()
for s in h.xpath(
"//div[@class='meta']/dl//dt[text()='导演:']/following-sibling::dd/a[@itemprop='director']//text()"
for s in self.query_list(
h,
"//div[@class='meta']/dl//dt[text()='导演:']/following-sibling::dd/a[@itemprop='director']//text()",
)
]
data["composer"] = [
s.strip()
for s in h.xpath(
"//div[@class='meta']/dl//dt[text()='作曲:']/following-sibling::dd/a[@itemprop='musicBy']//text()"
for s in self.query_list(
h,
"//div[@class='meta']/dl//dt[text()='作曲:']/following-sibling::dd/a[@itemprop='musicBy']//text()",
)
]
data["choreographer"] = [
s.strip()
for s in h.xpath(
"//div[@class='meta']/dl//dt[text()='编舞:']/following-sibling::dd/a[@itemprop='choreographer']//text()"
for s in self.query_list(
h,
"//div[@class='meta']/dl//dt[text()='编舞:']/following-sibling::dd/a[@itemprop='choreographer']//text()",
)
]
data["troupe"] = [
s.strip()
for s in h.xpath(
"//div[@class='meta']/dl//dt[text()='演出团体:']/following-sibling::dd/a[@itemprop='performer']//text()"
for s in self.query_list(
h,
"//div[@class='meta']/dl//dt[text()='演出团体:']/following-sibling::dd/a[@itemprop='performer']//text()",
)
]
data["playwright"] = [
s.strip()
for s in h.xpath(
"//div[@class='meta']/dl//dt[text()='编剧:']/following-sibling::dd/a[@itemprop='author']//text()"
for s in self.query_list(
h,
"//div[@class='meta']/dl//dt[text()='编剧:']/following-sibling::dd/a[@itemprop='author']//text()",
)
]
data["actor"] = [
{"name": s.strip(), "role": ""}
for s in h.xpath(
"//div[@class='meta']/dl//dt[text()='主演:']/following-sibling::dd/a[@itemprop='actor']//text()"
for s in self.query_list(
h,
"//div[@class='meta']/dl//dt[text()='主演:']/following-sibling::dd/a[@itemprop='actor']//text()",
)
]
date_elem = h.xpath(
"//div[@class='meta']//dl//dt[text()='演出日期:']/following::dd/text()"
date_elem = self.query_list(
h, "//div[@class='meta']//dl//dt[text()='演出日期:']/following::dd/text()"
)
data["opening_date"] = date_elem[0] if date_elem else None
if data["opening_date"]:
@@ -211,12 +224,15 @@ class DoubanDrama(AbstractSite):
data["location"] = [
s.strip()
for s in h.xpath(
"//div[@class='meta']/dl//dt[text()='演出剧院:']/following-sibling::dd/a[@itemprop='location']//text()"
for s in self.query_list(
h,
"//div[@class='meta']/dl//dt[text()='演出剧院:']/following-sibling::dd/a[@itemprop='location']//text()",
)
]
versions = h.xpath("//div[@id='versions']/div[@class='fluid-mods']/div/@id")
versions = self.query_list(
h, "//div[@id='versions']/div[@class='fluid-mods']/div/@id"
)
data["related_resources"] = list(
map(
lambda v: {
@@ -229,7 +245,7 @@ class DoubanDrama(AbstractSite):
versions,
)
)
img_url_elem = h.xpath("//img[@itemprop='image']/@src")
img_url_elem = self.query_list(h, "//img[@itemprop='image']/@src")
data["cover_image_url"] = img_url_elem[0].strip() if img_url_elem else None
data["localized_title"] = (
[{"lang": "zh-cn", "text": data["title"]}]

View file

@@ -7,9 +7,7 @@ from catalog.models import *
from common.models.lang import detect_language
from common.models.misc import uniq
from .douban import DoubanDownloader
_logger = logging.getLogger(__name__)
from .douban import DoubanDownloader, DoubanSearcher
@SiteManager.register
@@ -26,18 +24,18 @@ class DoubanGame(AbstractSite):
DEFAULT_MODEL = Game
@classmethod
def id_to_url(self, id_value):
def id_to_url(cls, id_value):
return "https://www.douban.com/game/" + id_value + "/"
def scrape(self):
content = DoubanDownloader(self.url).download().html()
elem = content.xpath("//div[@id='content']/h1/text()")
elem = self.query_list(content, "//div[@id='content']/h1/text()")
title = elem[0].strip() if len(elem) else None
if not title:
raise ParseError(self, "title")
elem = content.xpath("//div[@id='comments']//h2/text()")
elem = self.query_list(content, "//div[@id='comments']//h2/text()")
title2 = elem[0].strip() if len(elem) else ""
if title2:
sp = title2.strip().rsplit("的短评", 1)
@@ -48,46 +46,52 @@ class DoubanGame(AbstractSite):
else:
orig_title = ""
other_title_elem = content.xpath(
"//dl[@class='thing-attr']//dt[text()='别名:']/following-sibling::dd[1]/text()"
other_title_elem = self.query_list(
content,
"//dl[@class='thing-attr']//dt[text()='别名:']/following-sibling::dd[1]/text()",
)
other_title = (
other_title_elem[0].strip().split(" / ") if other_title_elem else []
)
developer_elem = content.xpath(
"//dl[@class='thing-attr']//dt[text()='开发商:']/following-sibling::dd[1]/text()"
developer_elem = self.query_list(
content,
"//dl[@class='thing-attr']//dt[text()='开发商:']/following-sibling::dd[1]/text()",
)
developer = developer_elem[0].strip().split(" / ") if developer_elem else None
publisher_elem = content.xpath(
"//dl[@class='thing-attr']//dt[text()='发行商:']/following-sibling::dd[1]/text()"
publisher_elem = self.query_list(
content,
"//dl[@class='thing-attr']//dt[text()='发行商:']/following-sibling::dd[1]/text()",
)
publisher = publisher_elem[0].strip().split(" / ") if publisher_elem else None
platform_elem = content.xpath(
"//dl[@class='thing-attr']//dt[text()='平台:']/following-sibling::dd[1]/a/text()"
platform_elem = self.query_list(
content,
"//dl[@class='thing-attr']//dt[text()='平台:']/following-sibling::dd[1]/a/text()",
)
platform = platform_elem if platform_elem else None
genre_elem = content.xpath(
"//dl[@class='thing-attr']//dt[text()='类型:']/following-sibling::dd[1]/a/text()"
genre_elem = self.query_list(
content,
"//dl[@class='thing-attr']//dt[text()='类型:']/following-sibling::dd[1]/a/text()",
)
genre = None
if genre_elem:
genre = [g for g in genre_elem if g != "游戏"]
date_elem = content.xpath(
"//dl[@class='thing-attr']//dt[text()='发行日期:']/following-sibling::dd[1]/text()"
date_elem = self.query_list(
content,
"//dl[@class='thing-attr']//dt[text()='发行日期:']/following-sibling::dd[1]/text()",
)
release_date = dateparser.parse(date_elem[0].strip()) if date_elem else None
release_date = release_date.strftime("%Y-%m-%d") if release_date else None
brief_elem = content.xpath("//div[@class='mod item-desc']/p/text()")
brief_elem = self.query_list(content, "//div[@class='mod item-desc']/p/text()")
brief = "\n".join(brief_elem) if brief_elem else ""
img_url_elem = content.xpath(
"//div[@class='item-subject-info']/div[@class='pic']//img/@src"
img_url_elem = self.query_list(
content, "//div[@class='item-subject-info']/div[@class='pic']//img/@src"
)
img_url = img_url_elem[0].strip() if img_url_elem else None

View file

@@ -1,16 +1,17 @@
import json
import logging
from loguru import logger
from catalog.common import *
from catalog.movie.models import *
from catalog.tv.models import *
from common.models.lang import detect_language
from common.models.misc import int_
from .douban import *
from .douban import DoubanDownloader, DoubanSearcher
from .tmdb import TMDB_TV, TMDB_TVSeason, query_tmdb_tv_episode, search_tmdb_by_imdb_id
_logger = logging.getLogger(__name__)
@SiteManager.register
class DoubanMovie(AbstractSite):
@@ -29,11 +30,15 @@ class DoubanMovie(AbstractSite):
def id_to_url(cls, id_value):
return "https://movie.douban.com/subject/" + id_value + "/"
@classmethod
def search(cls, q: str, p: int = 1):
return DoubanSearcher.search(ItemCategory.Movie, "movie", q, p)
def scrape(self):
content = DoubanDownloader(self.url).download().html()
try:
schema_data = "".join(
content.xpath('//script[@type="application/ld+json"]/text()')
self.query_list(content, '//script[@type="application/ld+json"]/text()')
).replace(
"\n", ""
) # strip \n because multi-line strings are not properly encoded in JSON by douban
@@ -42,13 +47,13 @@ class DoubanMovie(AbstractSite):
d = {}
try:
raw_title = content.xpath("//span[@property='v:itemreviewed']/text()")[
0
].strip()
raw_title = self.query_list(
content, "//span[@property='v:itemreviewed']/text()"
)[0].strip()
except IndexError:
raise ParseError(self, "title")
orig_title = content.xpath("//img[@rel='v:image']/@alt")[0].strip()
orig_title = self.query_list(content, "//img[@rel='v:image']/@alt")[0].strip()
title = raw_title.split(orig_title)[0].strip()
# if there is no Chinese title
if title == "":
@@ -58,40 +63,46 @@ class DoubanMovie(AbstractSite):
orig_title = None
# there are two html formats for authors and translators
other_title_elem = content.xpath(
"//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]"
other_title_elem = self.query_list(
content,
"//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]",
)
other_title = (
other_title_elem[0].strip().split(" / ") if other_title_elem else None
)
imdb_elem = content.xpath(
"//div[@id='info']//span[text()='IMDb链接:']/following-sibling::a[1]/text()"
imdb_elem = self.query_list(
content,
"//div[@id='info']//span[text()='IMDb链接:']/following-sibling::a[1]/text()",
)
if not imdb_elem:
imdb_elem = content.xpath(
"//div[@id='info']//span[text()='IMDb:']/following-sibling::text()[1]"
imdb_elem = self.query_list(
content,
"//div[@id='info']//span[text()='IMDb:']/following-sibling::text()[1]",
)
imdb_code = imdb_elem[0].strip() if imdb_elem else None
director_elem = content.xpath(
"//div[@id='info']//span[text()='导演']/following-sibling::span[1]/a/text()"
director_elem = self.query_list(
content,
"//div[@id='info']//span[text()='导演']/following-sibling::span[1]/a/text()",
)
director = director_elem if director_elem else None
playwright_elem = content.xpath(
"//div[@id='info']//span[text()='编剧']/following-sibling::span[1]/a/text()"
playwright_elem = self.query_list(
content,
"//div[@id='info']//span[text()='编剧']/following-sibling::span[1]/a/text()",
)
playwright = (
list(map(lambda a: a[:200], playwright_elem)) if playwright_elem else None
)
actor_elem = content.xpath(
"//div[@id='info']//span[text()='主演']/following-sibling::span[1]/a/text()"
actor_elem = self.query_list(
content,
"//div[@id='info']//span[text()='主演']/following-sibling::span[1]/a/text()",
)
actor = list(map(lambda a: a[:200], actor_elem)) if actor_elem else None
genre_elem = content.xpath("//span[@property='v:genre']/text()")
genre_elem = self.query_list(content, "//span[@property='v:genre']/text()")
genre = []
if genre_elem:
for g in genre_elem:
@@ -102,7 +113,9 @@ class DoubanMovie(AbstractSite):
g = "惊悚"
genre.append(g)
showtime_elem = content.xpath("//span[@property='v:initialReleaseDate']/text()")
showtime_elem = self.query_list(
content, "//span[@property='v:initialReleaseDate']/text()"
)
if showtime_elem:
showtime = []
for st in showtime_elem:
@@ -122,39 +135,39 @@ class DoubanMovie(AbstractSite):
else:
showtime = None
site_elem = content.xpath(
"//div[@id='info']//span[text()='官方网站:']/following-sibling::a[1]/@href"
site_elem = self.query_list(
content,
"//div[@id='info']//span[text()='官方网站:']/following-sibling::a[1]/@href",
)
site = site_elem[0].strip()[:200] if site_elem else None
if site and not re.match(r"http.+", site):
site = None
area_elem = content.xpath(
"//div[@id='info']//span[text()='制片国家/地区:']/following-sibling::text()[1]"
area_elem = self.query_list(
content,
"//div[@id='info']//span[text()='制片国家/地区:']/following-sibling::text()[1]",
)
if area_elem:
area = [a.strip()[:100] for a in area_elem[0].split("/")]
else:
area = None
language_elem = content.xpath(
"//div[@id='info']//span[text()='语言:']/following-sibling::text()[1]"
language_elem = self.query_list(
content,
"//div[@id='info']//span[text()='语言:']/following-sibling::text()[1]",
)
if language_elem:
language = [a.strip() for a in language_elem[0].split(" / ")]
else:
language = None
year_elem = content.xpath("//span[@class='year']/text()")
year = (
int(re.search(r"\d+", year_elem[0])[0])
if year_elem and re.search(r"\d+", year_elem[0])
else None
)
year_s = self.query_str(content, "//span[@class='year']/text()")
year_r = re.search(r"\d+", year_s) if year_s else None
year = int_(year_r[0]) if year_r else None
duration_elem = content.xpath("//span[@property='v:runtime']/text()")
other_duration_elem = content.xpath(
"//span[@property='v:runtime']/following-sibling::text()[1]"
duration_elem = self.query_list(content, "//span[@property='v:runtime']/text()")
other_duration_elem = self.query_list(
content, "//span[@property='v:runtime']/following-sibling::text()[1]"
)
if duration_elem:
duration = duration_elem[0].strip()
@@ -164,19 +177,21 @@ class DoubanMovie(AbstractSite):
else:
duration = None
season_elem = content.xpath(
"//*[@id='season']/option[@selected='selected']/text()"
season_elem = self.query_list(
content, "//*[@id='season']/option[@selected='selected']/text()"
)
if not season_elem:
season_elem = content.xpath(
"//div[@id='info']//span[text()='季数:']/following-sibling::text()[1]"
season_elem = self.query_list(
content,
"//div[@id='info']//span[text()='季数:']/following-sibling::text()[1]",
)
season = int(season_elem[0].strip()) if season_elem else None
else:
season = int(season_elem[0].strip())
episodes_elem = content.xpath(
"//div[@id='info']//span[text()='集数:']/following-sibling::text()[1]"
episodes_elem = self.query_list(
content,
"//div[@id='info']//span[text()='集数:']/following-sibling::text()[1]",
)
episodes = (
int(episodes_elem[0].strip())
@@ -184,8 +199,9 @@ class DoubanMovie(AbstractSite):
else None
)
single_episode_length_elem = content.xpath(
"//div[@id='info']//span[text()='单集片长:']/following-sibling::text()[1]"
single_episode_length_elem = self.query_list(
content,
"//div[@id='info']//span[text()='单集片长:']/following-sibling::text()[1]",
)
single_episode_length = (
single_episode_length_elem[0].strip()[:100]
@@ -195,16 +211,16 @@ class DoubanMovie(AbstractSite):
is_series = d.get("@type") == "TVSeries" or episodes is not None
brief_elem = content.xpath("//span[@class='all hidden']")
brief_elem = self.query_list(content, "//span[@class='all hidden']")
if not brief_elem:
brief_elem = content.xpath("//span[@property='v:summary']")
brief_elem = self.query_list(content, "//span[@property='v:summary']")
brief = (
"\n".join([e.strip() for e in brief_elem[0].xpath("./text()")])
if brief_elem
else None
)
img_url_elem = content.xpath("//img[@rel='v:image']/@src")
img_url_elem = self.query_list(content, "//img[@rel='v:image']/@src")
img_url = img_url_elem[0].strip() if img_url_elem else None
titles = set(
@@ -261,26 +277,26 @@ class DoubanMovie(AbstractSite):
pd.metadata.get("season_number")
and pd.metadata.get("season_number") != 1
):
_logger.warn(f"{imdb_code} matched imdb tv show, force season 1")
logger.warning(f"{imdb_code} matched imdb tv show, force season 1")
pd.metadata["season_number"] = 1
elif pd.metadata["preferred_model"] == "TVSeason" and has_episode:
if res_data["tv_episode_results"][0]["episode_number"] != 1:
_logger.warning(
logger.warning(
f"Douban Movie {self.url} IMDB {imdb_code} mapping to non-first episode in a season"
)
elif res_data["tv_episode_results"][0]["season_number"] == 1:
_logger.warning(
logger.warning(
f"Douban Movie {self.url} IMDB {imdb_code} mapping to first season episode in a season"
)
elif has_movie:
if pd.metadata["preferred_model"] != "Movie":
_logger.warn(f"{imdb_code} matched imdb movie, force Movie")
logger.warning(f"{imdb_code} matched imdb movie, force Movie")
pd.metadata["preferred_model"] = "Movie"
elif has_tv or has_episode:
_logger.warn(f"{imdb_code} matched imdb tv/episode, force TVSeason")
logger.warning(f"{imdb_code} matched imdb tv/episode, force TVSeason")
pd.metadata["preferred_model"] = "TVSeason"
else:
_logger.warn(f"{imdb_code} unknown to TMDB")
logger.warning(f"{imdb_code} unknown to TMDB")
pd.lookup_ids[IdType.IMDB] = imdb_code
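The logging hunks above swap the module-level stdlib logger for loguru: Logger.warn is a deprecated alias of warning, and loguru ships a ready-made logger object, so the logging.getLogger(__name__) boilerplate is dropped. The two patterns side by side (a sketch; the imdb code is an illustrative value):

imdb_code = "tt0000001"  # illustrative value

# before: stdlib logging, with the deprecated .warn alias
import logging
_logger = logging.getLogger(__name__)
_logger.warn(f"{imdb_code} matched imdb movie, force Movie")

# after: loguru's ready-made logger object
from loguru import logger
logger.warning(f"{imdb_code} matched imdb movie, force Movie")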

View file

@@ -7,9 +7,7 @@ from catalog.models import *
from catalog.music.utils import upc_to_gtin_13
from common.models.lang import detect_language
from .douban import DoubanDownloader
_logger = logging.getLogger(__name__)
from .douban import DoubanDownloader, DoubanSearcher
@SiteManager.register
@@ -29,58 +27,63 @@ class DoubanMusic(AbstractSite):
def id_to_url(cls, id_value):
return "https://music.douban.com/subject/" + id_value + "/"
@classmethod
def search(cls, q: str, p: int = 1):
return DoubanSearcher.search(ItemCategory.Music, "music", q, p)
def scrape(self):
content = DoubanDownloader(self.url).download().html()
elem = content.xpath("//h1/span/text()")
elem = self.query_list(content, "//h1/span/text()")
title = elem[0].strip() if len(elem) else None
if not title:
raise ParseError(self, "title")
artists_elem = content.xpath(
"//div[@id='info']/span/span[@class='pl']/a/text()"
artists_elem = self.query_list(
content, "//div[@id='info']/span/span[@class='pl']/a/text()"
)
artist = (
None if not artists_elem else list(map(lambda a: a[:200], artists_elem))
)
genre_elem = content.xpath(
"//div[@id='info']//span[text()='流派:']/following::text()[1]"
genre_elem = self.query_list(
content, "//div[@id='info']//span[text()='流派:']/following::text()[1]"
)
genre = genre_elem[0].strip().split(" / ") if genre_elem else []
date_elem = content.xpath(
"//div[@id='info']//span[text()='发行时间:']/following::text()[1]"
date_elem = self.query_list(
content, "//div[@id='info']//span[text()='发行时间:']/following::text()[1]"
)
release_date = dateparser.parse(date_elem[0].strip()) if date_elem else None
release_date = release_date.strftime("%Y-%m-%d") if release_date else None
company_elem = content.xpath(
"//div[@id='info']//span[text()='出版者:']/following::text()[1]"
company_elem = self.query_list(
content, "//div[@id='info']//span[text()='出版者:']/following::text()[1]"
)
company = company_elem[0].strip() if company_elem else None
track_list_elem = content.xpath(
"//div[@class='track-list']/div[@class='indent']/div/text()"
track_list_elem = self.query_list(
content, "//div[@class='track-list']/div[@class='indent']/div/text()"
)
if track_list_elem:
track_list = "\n".join([track.strip() for track in track_list_elem])
else:
track_list = None
brief_elem = content.xpath("//span[@class='all hidden']")
brief_elem = self.query_list(content, "//span[@class='all hidden']")
if not brief_elem:
brief_elem = content.xpath("//span[@property='v:summary']")
brief_elem = self.query_list(content, "//span[@property='v:summary']")
brief = (
"\n".join([e.strip() for e in brief_elem[0].xpath("./text()")])
if brief_elem
else None
)
img_url_elem = content.xpath("//div[@id='mainpic']//img/@src")
img_url_elem = self.query_list(content, "//div[@id='mainpic']//img/@src")
img_url = img_url_elem[0].strip() if img_url_elem else None
other_elem = content.xpath(
"//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]"
other_elem = self.query_list(
content,
"//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]",
)
other_title = other_elem[0].strip().split(" / ") if other_elem else []
lang = detect_language(f"{title} {brief}")
@@ -103,28 +106,33 @@ class DoubanMusic(AbstractSite):
}
gtin = None
isrc = None
other_elem = content.xpath(
"//div[@id='info']//span[text()='专辑类型:']/following-sibling::text()[1]"
other_elem = self.query_list(
content,
"//div[@id='info']//span[text()='专辑类型:']/following-sibling::text()[1]",
)
if other_elem:
data["album_type"] = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='介质:']/following-sibling::text()[1]"
other_elem = self.query_list(
content,
"//div[@id='info']//span[text()='介质:']/following-sibling::text()[1]",
)
if other_elem:
data["media"] = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='ISRC:']/following-sibling::text()[1]"
other_elem = self.query_list(
content,
"//div[@id='info']//span[text()='ISRC:']/following-sibling::text()[1]",
)
if other_elem:
isrc = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='条形码:']/following-sibling::text()[1]"
other_elem = self.query_list(
content,
"//div[@id='info']//span[text()='条形码:']/following-sibling::text()[1]",
)
if other_elem:
gtin = upc_to_gtin_13(other_elem[0].strip())
other_elem = content.xpath(
"//div[@id='info']//span[text()='碟片数:']/following-sibling::text()[1]"
other_elem = self.query_list(
content,
"//div[@id='info']//span[text()='碟片数:']/following-sibling::text()[1]",
)
if other_elem:
data["disc_count"] = other_elem[0].strip()

View file

@@ -80,7 +80,6 @@ exclude = [
"journal/tests.py",
"neodb",
"**/migrations",
"**/sites/douban_*",
"neodb-takahe",
]
reportIncompatibleVariableOverride = false