lib.itmens/catalog/sites/bangumi.py

167 lines
5.4 KiB
Python
Raw Normal View History

2022-12-09 03:09:06 +00:00
import logging
2023-10-31 04:29:28 -04:00
from catalog.book.utils import detect_isbn_asin
from catalog.common import *
from catalog.models import *
2024-07-13 00:16:47 -04:00
from common.models.lang import detect_language
2022-12-09 03:09:06 +00:00
_logger = logging.getLogger(__name__)
2022-12-15 17:29:35 -05:00
@SiteManager.register
2022-12-09 03:09:06 +00:00
class Bangumi(AbstractSite):
2022-12-16 01:08:10 -05:00
SITE_NAME = SiteName.Bangumi
2022-12-09 03:09:06 +00:00
ID_TYPE = IdType.Bangumi
URL_PATTERNS = [
r"https://bgm\.tv/subject/(\d+)",
2023-10-31 04:29:28 -04:00
r"https://bangumi\.tv/subject/(\d+)",
2022-12-09 03:09:06 +00:00
]
2022-12-29 23:57:02 -05:00
WIKI_PROPERTY_ID = ""
2022-12-09 03:09:06 +00:00
DEFAULT_MODEL = None
@classmethod
2023-08-11 11:55:42 -04:00
def id_to_url(cls, id_value):
2022-12-09 03:09:06 +00:00
return f"https://bgm.tv/subject/{id_value}"
def scrape(self):
2023-10-31 04:29:28 -04:00
api_url = f"https://api.bgm.tv/v0/subjects/{self.id_value}"
o = BasicDownloader(api_url).download().json()
showtime = None
pub_year = None
pub_month = None
year = None
dt = o.get("date")
episodes = o.get("total_episodes", 0)
match o["type"]:
case 1:
model = "Edition"
if dt:
d = dt.split("-")
pub_year = d[0]
pub_month = d[1]
case 2 | 6:
is_series = episodes > 1
model = "TVSeason" if is_series else "Movie"
if dt:
year = dt.split("-")[0]
2024-06-02 14:50:07 -04:00
showtime = [
{"time": dt, "region": "首播日期" if is_series else "发布日期"}
]
2023-10-31 04:29:28 -04:00
case 3:
model = "Album"
case 4:
model = "Game"
case _:
raise ValueError(
f"Unknown type {o['type']} for bangumi subject {self.id_value}"
)
title = o.get("name_cn") or o.get("name")
orig_title = o.get("name") if o.get("name") != title else None
brief = o.get("summary")
genre = None
platform = None
other_title = []
imdb_code = None
isbn_type = None
isbn = None
language = None
pub_house = None
authors = None
site = None
director = None
for i in o.get("infobox", []):
k = i["key"]
v = i["value"]
match k:
case "别名":
2024-07-16 22:55:29 -04:00
other_title = (
[d["v"] for d in v]
if type(v) == list
else ([v] if isinstance(v, str) else [])
)
2023-10-31 04:29:28 -04:00
case "imdb_id":
imdb_code = v
case "isbn":
isbn_type, isbn = detect_isbn_asin(v)
case "语言":
language = v
case "出版社":
pub_house = v
case "导演":
director = v
case "作者":
2024-07-16 22:55:29 -04:00
authors = (
[d["v"] for d in v]
if type(v) == list
else ([v] if isinstance(v, str) else [])
)
2023-10-31 04:29:28 -04:00
case "平台":
2024-07-16 22:55:29 -04:00
platform = (
[d["v"] for d in v]
if type(v) == list
else ([v] if isinstance(v, str) else [])
)
2023-10-31 04:29:28 -04:00
case "游戏类型":
2024-07-16 12:10:46 -04:00
genre = (
2024-07-16 22:55:29 -04:00
[d["v"] for d in v]
2024-07-16 12:10:46 -04:00
if isinstance(v, list)
else ([v] if isinstance(v, str) else [])
)
2023-10-31 04:29:28 -04:00
case "官方网站" | "website":
site = v[0] if type(v) == list else v
img_url = o["images"].get("large") or o["images"].get("common")
raw_img = None
ext = None
if img_url:
raw_img, ext = BasicImageDownloader.download_image(
img_url, None, headers={}
)
2024-07-13 00:16:47 -04:00
titles = set(
[title] + (other_title or []) + ([orig_title] if orig_title else [])
)
localized_title = [{"lang": detect_language(t), "text": t} for t in titles]
2024-07-16 00:51:05 -04:00
localized_desc = (
[{"lang": detect_language(brief), "text": brief}] if brief else []
)
2023-10-31 04:29:28 -04:00
data = {
2024-07-13 00:16:47 -04:00
"localized_title": localized_title,
"localized_description": localized_desc,
2023-10-31 04:29:28 -04:00
"preferred_model": model,
"title": title,
"orig_title": orig_title,
"other_title": other_title or None,
"author": authors,
"genre": genre,
"translator": None,
"director": director,
"language": language,
"platform": platform,
"year": year,
"showtime": showtime,
"imdb_code": imdb_code,
"pub_house": pub_house,
"pub_year": pub_year,
"pub_month": pub_month,
"binding": None,
"episode_count": episodes or None,
"official_site": site,
"site": site,
"isbn": isbn,
"brief": brief,
"cover_image_url": img_url,
"release_date": dt,
}
lookup_ids = {}
if isbn:
lookup_ids[isbn_type] = isbn
if imdb_code:
lookup_ids[IdType.IMDB] = imdb_code
return ResourceContent(
metadata={k: v for k, v in data.items() if v is not None},
cover_image=raw_img,
cover_image_extention=ext,
lookup_ids=lookup_ids,
)