from catalog.common import *
from catalog.models import *
from .douban import DoubanDownloader
import logging
# Module-level logger, named after this module per the standard logging convention.
_logger = logging.getLogger(__name__)
@SiteManager.register
class DoubanDrama(AbstractSite):
    """Site adapter that scrapes stage-drama pages from douban.com/location/drama.

    Registered with SiteManager so URLs matching URL_PATTERNS are routed here;
    scraped pages are stored as Performance catalog items.
    """

    SITE_NAME = SiteName.Douban
    ID_TYPE = IdType.DoubanDrama
    URL_PATTERNS = [r"\w+://www.douban.com/location/drama/(\d+)/"]
    # Wikidata property for the Douban drama identifier
    WIKI_PROPERTY_ID = "P6443"
    DEFAULT_MODEL = Performance

    @classmethod
    def id_to_url(cls, id_value):
        """Build the canonical page URL for a numeric Douban drama id string."""
        return "https://www.douban.com/location/drama/" + id_value + "/"

    @staticmethod
    def _strip_texts(h, query):
        """Run an XPath text query on parsed html `h` and strip each result."""
        return [s.strip() for s in h.xpath(query)]

    def scrape(self):
        """Download and parse the drama page into a ResourceContent.

        Returns:
            ResourceContent with metadata (title, other_title, brief, credits,
            opening_date, theatre, cover_image_url) and, when available, the
            downloaded cover image bytes.

        Raises:
            ParseError: if the page has no title element.
        """
        h = DoubanDownloader(self.url).download().html()
        data = {}

        # First <span> inside the page <h1> is the main title; any extra
        # spans are alternate titles.
        title_elem = h.xpath("/html/body//h1/span/text()")
        if title_elem:
            data["title"] = title_elem[0].strip()
        else:
            raise ParseError(self, "title")

        # Alternate titles: extra h1 spans plus the "又名:" (a.k.a.) rows.
        # Fix: strip the a.k.a. values too (they were previously appended
        # unstripped, unlike every other text list here), then de-duplicate.
        data["other_title"] = [s.strip() for s in title_elem[1:]]
        data["other_title"] += self._strip_texts(
            h, "//dl//dt[text()='又名:']/following::dd[@itemprop='name']/text()"
        )
        data["other_title"] = list(set(data["other_title"]))

        # Synopsis: prefer the full report div, fall back to the abstract.
        plot_elem = h.xpath("//div[@id='link-report']/text()")
        if len(plot_elem) == 0:
            plot_elem = h.xpath("//div[@class='abstract']/text()")
        data["brief"] = "\n".join(plot_elem) if len(plot_elem) > 0 else ""

        # Credit lists, all following the same dt-label / dd-value layout.
        data["genre"] = self._strip_texts(
            h, "//dl//dt[text()='类型:']/following-sibling::dd[@itemprop='genre']/text()"
        )
        data["version"] = self._strip_texts(
            h, "//dl//dt[text()='版本:']/following-sibling::dd[@class='titles']/a//text()"
        )
        data["director"] = self._strip_texts(
            h,
            "//div[@class='meta']/dl//dt[text()='导演:']/following-sibling::dd/a[@itemprop='director']//text()",
        )
        data["composer"] = self._strip_texts(
            h,
            "//div[@class='meta']/dl//dt[text()='作曲:']/following-sibling::dd/a[@itemprop='musicBy']//text()",
        )
        data["choreographer"] = self._strip_texts(
            h,
            "//div[@class='meta']/dl//dt[text()='编舞:']/following-sibling::dd/a[@itemprop='choreographer']//text()",
        )
        data["troupe"] = self._strip_texts(
            h,
            "//div[@class='meta']/dl//dt[text()='演出团体:']/following-sibling::dd/a[@itemprop='performer']//text()",
        )
        data["playwright"] = self._strip_texts(
            h,
            "//div[@class='meta']/dl//dt[text()='编剧:']/following-sibling::dd/a[@itemprop='author']//text()",
        )
        data["actor"] = self._strip_texts(
            h,
            "//div[@class='meta']/dl//dt[text()='主演:']/following-sibling::dd/a[@itemprop='actor']//text()",
        )

        # "演出日期:" = performance date; only the first value is kept.
        date_elem = h.xpath("//dl//dt[text()='演出日期:']/following::dd/text()")
        data["opening_date"] = date_elem[0] if date_elem else None

        data["theatre"] = self._strip_texts(
            h,
            "//div[@class='meta']/dl//dt[text()='演出剧院:']/following-sibling::dd/a[@itemprop='location']//text()",
        )

        img_url_elem = h.xpath("//img[@itemprop='image']/@src")
        data["cover_image_url"] = img_url_elem[0].strip() if img_url_elem else None

        pd = ResourceContent(metadata=data)
        if pd.metadata["cover_image_url"]:
            imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
            try:
                pd.cover_image = imgdl.download().content
                # NOTE: "extention" spelling matches the downloader's API.
                pd.cover_image_extention = imgdl.extention
            except Exception:
                # Cover art is best-effort; a failed download must not fail the scrape.
                _logger.debug(
                    f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
                )
        return pd