import re

from django.core.cache import cache
from lxml import html

from catalog.common import *
from catalog.models import *
from common.models.lang import detect_language

from .douban import DoubanDownloader


def _cache_key(url):
    return f"$:{url}"
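

# Note: DoubanDrama and DoubanDramaVersion share this helper, so the page
# fetched while scraping the drama can be reused when scraping its versions
# (and vice versa) within the one-hour cache TTL set in their scrape() methods.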
@SiteManager.register
class DoubanDramaVersion(AbstractSite):
    """
    Parse one version section of a Douban Drama page.

    Versions live on the same page as the drama itself, each inside a
    <div id="1234" />. Since all versions are usually parsed around the same
    time, the page content is cached to avoid duplicate fetches.
    """

    SITE_NAME = SiteName.Douban
    ID_TYPE = IdType.DoubanDramaVersion
    URL_PATTERNS = [
        r"\w+://www.douban.com/location/drama/(\d+)/#(\d+)$",
    ]

    WIKI_PROPERTY_ID = "?"
    DEFAULT_MODEL = PerformanceProduction

    @classmethod
    def url_to_id(cls, url: str):
        m = re.match(cls.URL_PATTERNS[0], url)
        if not m:
            return None
        return m.group(1) + "-" + m.group(2)

    @classmethod
    def id_to_url(cls, id_value):
        ids = id_value.split("-")
        return f"https://www.douban.com/location/drama/{ids[0]}/#{ids[1]}"
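    # Illustrative round trip (the ids are made up):
    #   url_to_id("https://www.douban.com/location/drama/12345/#67890") -> "12345-67890"
    #   id_to_url("12345-67890") -> "https://www.douban.com/location/drama/12345/#67890"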

    def scrape(self):
        if not self.id_value or not self.url:
            raise ParseError(self, "id_value or url")
        show_url = self.url.split("#")[0]
        show_id = self.id_value.split("-")[0]
        version_id = self.id_value.split("-")[1]

        key = _cache_key(show_url)
        r = cache.get(key, None)
        if r is None:
            r = DoubanDownloader(show_url).download().content.decode("utf-8")
            cache.set(key, r, 3600)
        h = html.fromstring(r)
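
        # Two XPath templates, scoped to this version's <div>: q reads values
        # rendered as links (dd/a/span), q2 reads plain-text values; "{}" is
        # filled with the Chinese field label, e.g. q.format("导演") for director.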
        p = "//div[@id='" + version_id + "']"
        q = p + "//dt[text()='{}:']/following-sibling::dd[1]/a/span/text()"
        q2 = p + "//dt[text()='{}:']/following-sibling::dd[1]/text()"
        title = " ".join(self.query_list(h, p + "//h3/text()")).strip()
        if not title:
            raise ParseError(self, "title")
        data = {
            "title": title,
            "localized_title": [{"lang": "zh-cn", "text": title}],
            "director": [x.strip() for x in self.query_list(h, q.format("导演"))],
            "playwright": [x.strip() for x in self.query_list(h, q.format("编剧"))],
            # "actor": [x.strip() for x in self.query_list(h, q.format("主演"))],
            "composer": [x.strip() for x in self.query_list(h, q.format("作曲"))],
            "language": [x.strip() for x in self.query_list(h, q2.format("语言"))],
            "opening_date": " ".join(self.query_list(h, q2.format("演出日期"))).strip(),
            "troupe": [x.strip() for x in self.query_list(h, q.format("演出团体"))],
            "location": [x.strip() for x in self.query_list(h, q.format("演出剧院"))],
        }
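        # "演出日期" may be a single date or a range; a range is split into
        # opening_date and closing_date below. Examples (made-up dates):
        #   "2019-05-01"            -> opening "2019-05-01", no closing date
        #   "2019-05-01-2019-06-30" -> opening "2019-05-01", closing "2019-06-30"
        #   "2019-05-01-06-30"      -> opening "2019-05-01", closing "2019-06-30"
        # (missing leading parts of the closing date are copied from the opening date)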
        if data["opening_date"]:
            d = data["opening_date"].split("-")
            dl = len(d) if len(d) < 6 else 6
            if dl > 3:
                data["opening_date"] = "-".join(d[:3])
                data["closing_date"] = "-".join(d[0 : 6 - dl] + d[3:dl])
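        # Actor names come from the <span> inside each link; the text that
        # follows the link, e.g. "(饰 角色名) /", is stripped down to the role
        # name, and a bare "/" separator is treated as no role.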
        actor_elem = self.query_list(
            h, p + "//dt[text()='主演:']/following-sibling::dd[1]/a"
        )
        data["actor"] = []
        for e in actor_elem:
            n = "".join(e.xpath("span/text()")).strip()
            t = "".join(e.xpath("following-sibling::text()[1]")).strip()
            t = re.sub(r"^[\s\(饰]*(.+)\)[\s\/]*$", r"\1", t).strip()
            t = t if t != "/" else ""
            data["actor"].append({"name": n, "role": t})
        img_url_elem = self.query_list(h, "//img[@itemprop='image']/@src")
        data["cover_image_url"] = img_url_elem[0].strip() if img_url_elem else None
        pd = ResourceContent(metadata=data)
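        # Declare the parent Performance (the drama page itself) as a required
        # resource, presumably so the catalog links this production to an item
        # for the whole show rather than treating it as a standalone entry.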
        pd.metadata["required_resources"] = [
            {
                "model": "Performance",
                "id_type": IdType.DoubanDrama,
                "id_value": show_id,
                "title": f"Douban Drama {show_id}",
                "url": show_url,
            }
        ]
        return pd


@SiteManager.register
class DoubanDrama(AbstractSite):
    SITE_NAME = SiteName.Douban
    ID_TYPE = IdType.DoubanDrama
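    # Patterns cover the regular web page plus the two doubanapp dispatch URL
    # forms (query-string and path style); the captured group is the drama id.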
    URL_PATTERNS = [
        r"\w+://www.douban.com/location/drama/(\d+)/[^#]*$",
        r"\w+://www.douban.com/doubanapp/dispatch\?uri=/drama/(\d+)/",
        r"\w+://www.douban.com/doubanapp/dispatch/drama/(\d+)",
    ]
    WIKI_PROPERTY_ID = "P6443"
    DEFAULT_MODEL = Performance

    @classmethod
    def id_to_url(cls, id_value):
        return "https://www.douban.com/location/drama/" + id_value + "/"

    def scrape(self):
        key = _cache_key(self.url)
        r = cache.get(key, None)
        if r is None:
            r = DoubanDownloader(self.url).download().content.decode("utf-8")
            cache.set(key, r, 3600)
        h = html.fromstring(r)
        data = {}
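
        # The first <span> in the page <h1> is taken as the display title
        # (treated as zh-cn below); a second <span>, when present, is treated
        # as the original-language title.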
        title_elem = self.query_list(h, "/html/body//h1/span/text()")
        if title_elem:
            data["title"] = title_elem[0].strip()
            data["orig_title"] = title_elem[1] if len(title_elem) > 1 else None
        else:
            raise ParseError(self, "title")

        other_title_elem = self.query_list(
            h, "//dl//dt[text()='又名:']/following::dd[@itemprop='name']/text()"
        )
        data["other_title"] = other_title_elem
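
        # Synopsis: prefer the expanded "full" text, fall back to the collapsed
        # "abstract", then to any plain text directly under the container.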
        plot_elem = self.query_list(
            h, "//div[@class='pure-text']/div[@class='full']/text()"
        )
        if len(plot_elem) == 0:
            plot_elem = self.query_list(
                h, "//div[@class='pure-text']/div[@class='abstract']/text()"
            )
        if len(plot_elem) == 0:
            plot_elem = self.query_list(h, "//div[@class='pure-text']/text()")
        data["brief"] = "\n".join(plot_elem)

        data["genre"] = [
            s.strip()
            for s in self.query_list(
                h,
                "//div[@class='meta']//dl//dt[text()='类型:']/following-sibling::dd[@itemprop='genre']/text()",
            )
        ]
        # data["version"] = [
        #     s.strip()
        #     for s in self.query_list(h,
        #         "//dl//dt[text()='版本:']/following-sibling::dd[@class='titles']/a//text()"
        #     )
        # ]
        data["director"] = [
            s.strip()
            for s in self.query_list(
                h,
                "//div[@class='meta']/dl//dt[text()='导演:']/following-sibling::dd/a[@itemprop='director']//text()",
            )
        ]
        data["composer"] = [
            s.strip()
            for s in self.query_list(
                h,
                "//div[@class='meta']/dl//dt[text()='作曲:']/following-sibling::dd/a[@itemprop='musicBy']//text()",
            )
        ]
        data["choreographer"] = [
            s.strip()
            for s in self.query_list(
                h,
                "//div[@class='meta']/dl//dt[text()='编舞:']/following-sibling::dd/a[@itemprop='choreographer']//text()",
            )
        ]
        data["troupe"] = [
            s.strip()
            for s in self.query_list(
                h,
                "//div[@class='meta']/dl//dt[text()='演出团体:']/following-sibling::dd/a[@itemprop='performer']//text()",
            )
        ]
        data["playwright"] = [
            s.strip()
            for s in self.query_list(
                h,
                "//div[@class='meta']/dl//dt[text()='编剧:']/following-sibling::dd/a[@itemprop='author']//text()",
            )
        ]
        data["actor"] = [
            {"name": s.strip(), "role": ""}
            for s in self.query_list(
                h,
                "//div[@class='meta']/dl//dt[text()='主演:']/following-sibling::dd/a[@itemprop='actor']//text()",
            )
        ]
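
        # "演出日期:" is handled the same way as in DoubanDramaVersion.scrape():
        # a date range is split into opening_date and closing_date, with missing
        # leading parts of the closing date copied from the opening date.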
        date_elem = self.query_list(
            h, "//div[@class='meta']//dl//dt[text()='演出日期:']/following::dd/text()"
        )
        data["opening_date"] = date_elem[0] if date_elem else None
        if data["opening_date"]:
            d = data["opening_date"].split("-")
            dl = len(d) if len(d) < 6 else 6
            if dl > 3:
                data["opening_date"] = "-".join(d[:3])
                data["closing_date"] = "-".join(d[0 : 6 - dl] + d[3:dl])

        data["location"] = [
            s.strip()
            for s in self.query_list(
                h,
                "//div[@class='meta']/dl//dt[text()='演出剧院:']/following-sibling::dd/a[@itemprop='location']//text()",
            )
        ]
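
        # Each version <div> id found under #versions becomes a related
        # PerformanceProduction resource, scraped by DoubanDramaVersion above
        # via the synthetic "<show id>-<version id>" identifier.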
        versions = self.query_list(
            h, "//div[@id='versions']/div[@class='fluid-mods']/div/@id"
        )
        data["related_resources"] = list(
            map(
                lambda v: {
                    "model": "PerformanceProduction",
                    "id_type": IdType.DoubanDramaVersion,
                    "id_value": f"{self.id_value}-{v}",
                    "title": f"{data['title']} - {v}",
                    "url": f"{self.url}#{v}",
                },
                versions,
            )
        )
        img_url_elem = self.query_list(h, "//img[@itemprop='image']/@src")
        data["cover_image_url"] = img_url_elem[0].strip() if img_url_elem else None
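        # zh-cn is assumed for the main title and the brief; detect_language()
        # is used to guess the language of the original and alternative titles.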
        data["localized_title"] = (
            [{"lang": "zh-cn", "text": data["title"]}]
            + (
                [
                    {
                        "lang": detect_language(data["orig_title"]),
                        "text": data["orig_title"],
                    }
                ]
                if data["orig_title"]
                else []
            )
            + [{"lang": detect_language(t), "text": t} for t in data["other_title"]]
        )
        data["localized_description"] = [{"lang": "zh-cn", "text": data["brief"]}]

        pd = ResourceContent(metadata=data)
        return pd