import logging
import re

from django.core.cache import cache
from lxml import html

from catalog.common import *
from catalog.models import *
from common.models.lang import detect_language

from .douban import DoubanDownloader

_logger = logging.getLogger(__name__)


def _cache_key(url):
    return f"$:{url}"


@SiteManager.register
class DoubanDramaVersion(AbstractSite):
    """
    Parse the Douban Drama Version section of a Douban Drama page.

    Each version lives on the same page as the drama itself, inside its own
    <div id="1234" />. Since all versions get parsed at about the same time,
    the page content is cached to avoid duplicate fetches.
    """

    SITE_NAME = SiteName.Douban
    ID_TYPE = IdType.DoubanDramaVersion
    URL_PATTERNS = [r"\w+://www.douban.com/location/drama/(\d+)/#(\d+)$"]
    WIKI_PROPERTY_ID = "?"
    DEFAULT_MODEL = PerformanceProduction

    @classmethod
    def url_to_id(cls, url: str):
        m = re.match(cls.URL_PATTERNS[0], url)
        if not m:
            return None
        return m.group(1) + "-" + m.group(2)

    @classmethod
    def id_to_url(cls, id_value):
        ids = id_value.split("-")
        return f"https://www.douban.com/location/drama/{ids[0]}/#{ids[1]}"

    def scrape(self):
        show_url = self.url.split("#")[0]
        show_id = self.id_value.split("-")[0]
        version_id = self.id_value.split("-")[1]

        key = _cache_key(show_url)
        r = cache.get(key, None)
        if r is None:
            r = DoubanDownloader(show_url).download().content.decode("utf-8")
            cache.set(key, r, 3600)
        h = html.fromstring(r)
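
        # The show page is shared by every version, so the raw HTML above is
        # cached for an hour and reused. Below, p scopes all queries to this
        # version's <div>; q extracts linked names for a labelled <dt>, and
        # q2 extracts plain-text values.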
        p = "//div[@id='" + version_id + "']"
        q = p + "//dt[text()='{}:']/following-sibling::dd[1]/a/span/text()"
        q2 = p + "//dt[text()='{}:']/following-sibling::dd[1]/text()"

        title = " ".join(h.xpath(p + "//h3/text()")).strip()
        if not title:
            raise ParseError(self, "title")

        data = {
            "title": title,
            "localized_title": [{"lang": "zh-cn", "text": title}],
            "director": [x.strip() for x in h.xpath(q.format("导演"))],
            "playwright": [x.strip() for x in h.xpath(q.format("编剧"))],
            # "actor": [x.strip() for x in h.xpath(q.format("主演"))],
            "composer": [x.strip() for x in h.xpath(q.format("作曲"))],
            "language": [x.strip() for x in h.xpath(q2.format("语言"))],
            "opening_date": " ".join(h.xpath(q2.format("演出日期"))).strip(),
            "troupe": [x.strip() for x in h.xpath(q.format("演出团体"))],
            "location": [x.strip() for x in h.xpath(q.format("演出剧院"))],
        }
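
        # The date field may be a span like "2019-01-01-2019-03-31", or a
        # shorter form such as "2019-01-01-03-31" where the closing date
        # omits its leading parts; d[0 : 6 - l] + d[3:l] rebuilds the closing
        # date by borrowing those parts from the opening date.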
        if data["opening_date"]:
            d = data["opening_date"].split("-")
            l = len(d) if len(d) < 6 else 6
            if l > 3:
                data["opening_date"] = "-".join(d[:3])
                data["closing_date"] = "-".join(d[0 : 6 - l] + d[3:l])
        actor_elem = h.xpath(p + "//dt[text()='主演:']/following-sibling::dd[1]/a")
        data["actor"] = []
        for e in actor_elem:
            n = "".join(e.xpath("span/text()")).strip()
            t = "".join(e.xpath("following-sibling::text()[1]")).strip()
            t = re.sub(r"^[\s\(饰]*(.+)\)[\s\/]*$", r"\1", t).strip()
            t = t if t != "/" else ""
            data["actor"].append({"name": n, "role": t})

        img_url_elem = h.xpath("//img[@itemprop='image']/@src")
        data["cover_image_url"] = img_url_elem[0].strip() if img_url_elem else None

        pd = ResourceContent(metadata=data)
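        # Link this production back to its parent Performance so the show
        # resource gets resolved before this version is saved.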
        pd.metadata["required_resources"] = [
            {
                "model": "Performance",
                "id_type": IdType.DoubanDrama,
                "id_value": show_id,
                "title": f"Douban Drama {show_id}",
                "url": show_url,
            }
        ]

        if pd.metadata["cover_image_url"]:
            imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
            try:
                pd.cover_image = imgdl.download().content
                pd.cover_image_extention = imgdl.extention
            except Exception:
                _logger.debug(
                    f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
                )
        return pd


@SiteManager.register
class DoubanDrama(AbstractSite):
    SITE_NAME = SiteName.Douban
    ID_TYPE = IdType.DoubanDrama
    URL_PATTERNS = [
        r"\w+://www.douban.com/location/drama/(\d+)/[^#]*$",
        r"\w+://www.douban.com/doubanapp/dispatch\?uri=/drama/(\d+)/",
    ]
    WIKI_PROPERTY_ID = "P6443"
    DEFAULT_MODEL = Performance

    @classmethod
    def id_to_url(cls, id_value):
        return "https://www.douban.com/location/drama/" + id_value + "/"

    def scrape(self):
        key = _cache_key(self.url)
        r = cache.get(key, None)
        if r is None:
            r = DoubanDownloader(self.url).download().content.decode("utf-8")
            cache.set(key, r, 3600)
        h = html.fromstring(r)
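
        # The downloaded page is cached so DoubanDramaVersion can parse each
        # version from the same HTML without refetching it.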
        data = {}

        title_elem = h.xpath("/html/body//h1/span/text()")
        if title_elem:
            data["title"] = title_elem[0].strip()
            data["orig_title"] = title_elem[1] if len(title_elem) > 1 else None
        else:
            raise ParseError(self, "title")

        other_title_elem = h.xpath(
            "//dl//dt[text()='又名:']/following::dd[@itemprop='name']/text()"
        )
        data["other_title"] = other_title_elem
        plot_elem = h.xpath("//div[@class='pure-text']/div[@class='full']/text()")
        if len(plot_elem) == 0:
            plot_elem = h.xpath(
                "//div[@class='pure-text']/div[@class='abstract']/text()"
            )
        if len(plot_elem) == 0:
            plot_elem = h.xpath("//div[@class='pure-text']/text()")
        data["brief"] = "\n".join(plot_elem)

        data["genre"] = [
            s.strip()
            for s in h.xpath(
                "//div[@class='meta']//dl//dt[text()='类型:']/following-sibling::dd[@itemprop='genre']/text()"
            )
        ]
        # data["version"] = [
        #     s.strip()
        #     for s in h.xpath(
        #         "//dl//dt[text()='版本:']/following-sibling::dd[@class='titles']/a//text()"
        #     )
        # ]
        data["director"] = [
            s.strip()
            for s in h.xpath(
                "//div[@class='meta']/dl//dt[text()='导演:']/following-sibling::dd/a[@itemprop='director']//text()"
            )
        ]
        data["composer"] = [
            s.strip()
            for s in h.xpath(
                "//div[@class='meta']/dl//dt[text()='作曲:']/following-sibling::dd/a[@itemprop='musicBy']//text()"
            )
        ]
        data["choreographer"] = [
            s.strip()
            for s in h.xpath(
                "//div[@class='meta']/dl//dt[text()='编舞:']/following-sibling::dd/a[@itemprop='choreographer']//text()"
            )
        ]
        data["troupe"] = [
            s.strip()
            for s in h.xpath(
                "//div[@class='meta']/dl//dt[text()='演出团体:']/following-sibling::dd/a[@itemprop='performer']//text()"
            )
        ]
        data["playwright"] = [
            s.strip()
            for s in h.xpath(
                "//div[@class='meta']/dl//dt[text()='编剧:']/following-sibling::dd/a[@itemprop='author']//text()"
            )
        ]
        data["actor"] = [
            {"name": s.strip(), "role": ""}
            for s in h.xpath(
                "//div[@class='meta']/dl//dt[text()='主演:']/following-sibling::dd/a[@itemprop='actor']//text()"
            )
        ]
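
        # Opening/closing dates use the same date-span format handled in
        # DoubanDramaVersion.scrape() above.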
        date_elem = h.xpath(
            "//div[@class='meta']//dl//dt[text()='演出日期:']/following::dd/text()"
        )
        data["opening_date"] = date_elem[0] if date_elem else None
        if data["opening_date"]:
            d = data["opening_date"].split("-")
            l = len(d) if len(d) < 6 else 6
            if l > 3:
                data["opening_date"] = "-".join(d[:3])
                data["closing_date"] = "-".join(d[0 : 6 - l] + d[3:l])

        data["location"] = [
            s.strip()
            for s in h.xpath(
                "//div[@class='meta']/dl//dt[text()='演出剧院:']/following-sibling::dd/a[@itemprop='location']//text()"
            )
        ]
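
        # Each version <div> id found on the page becomes a related
        # PerformanceProduction resource, scraped by DoubanDramaVersion.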
        versions = h.xpath("//div[@id='versions']/div[@class='fluid-mods']/div/@id")
        data["related_resources"] = list(
            map(
                lambda v: {
                    "model": "PerformanceProduction",
                    "id_type": IdType.DoubanDramaVersion,
                    "id_value": f"{self.id_value}-{v}",
                    "title": f"{data['title']} - {v}",
                    "url": f"{self.url}#{v}",
                },
                versions,
            )
        )
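
        # Below, localized_title lists the Chinese title first, then the
        # original title and any aliases, with languages guessed via
        # detect_language.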
        img_url_elem = h.xpath("//img[@itemprop='image']/@src")
        data["cover_image_url"] = img_url_elem[0].strip() if img_url_elem else None
        data["localized_title"] = (
            [{"lang": "zh-cn", "text": data["title"]}]
            + (
                [
                    {
                        "lang": detect_language(data["orig_title"]),
                        "text": data["orig_title"],
                    }
                ]
                if data["orig_title"]
                else []
            )
            + [{"lang": detect_language(t), "text": t} for t in data["other_title"]]
        )
        data["localized_description"] = [{"lang": "zh-cn", "text": data["brief"]}]

        pd = ResourceContent(metadata=data)
        if pd.metadata["cover_image_url"]:
            imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
            try:
                pd.cover_image = imgdl.download().content
                pd.cover_image_extention = imgdl.extention
            except Exception:
                _logger.debug(
                    f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
                )
        return pd