2022-12-09 03:09:06 +00:00
|
|
|
|
import logging
|
2024-12-07 15:01:26 +00:00
|
|
|
|
from collections import OrderedDict
|
2025-01-20 10:31:01 -05:00
|
|
|
|
from typing import Any
|
2025-01-20 23:24:20 +08:00
|
|
|
|
|
2025-01-20 10:31:01 -05:00
|
|
|
|
import httpx
|
2025-01-20 23:24:20 +08:00
|
|
|
|
from django.conf import settings
|
2025-01-20 10:31:01 -05:00
|
|
|
|
from loguru import logger
|
2022-12-09 03:09:06 +00:00
|
|
|
|
|
2023-10-31 04:29:28 -04:00
|
|
|
|
from catalog.book.utils import detect_isbn_asin
|
2023-08-10 11:27:31 -04:00
|
|
|
|
from catalog.common import *
|
2025-01-20 23:24:20 +08:00
|
|
|
|
from catalog.game.models import GameReleaseType
|
2023-08-10 11:27:31 -04:00
|
|
|
|
from catalog.models import *
|
2024-07-13 00:16:47 -04:00
|
|
|
|
from common.models.lang import detect_language
|
2022-12-09 03:09:06 +00:00
|
|
|
|
|
|
|
|
|
# Standard-library logger named after this module.
# NOTE(review): the Bangumi class in this file logs through loguru's `logger`,
# not `_logger` — confirm this object is still needed before relying on it.
_logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
2022-12-15 17:29:35 -05:00
|
|
|
|
@SiteManager.register
|
2022-12-09 03:09:06 +00:00
|
|
|
|
class Bangumi(AbstractSite):
|
2022-12-16 01:08:10 -05:00
|
|
|
|
SITE_NAME = SiteName.Bangumi
|
2022-12-09 03:09:06 +00:00
|
|
|
|
ID_TYPE = IdType.Bangumi
|
|
|
|
|
URL_PATTERNS = [
|
|
|
|
|
r"https://bgm\.tv/subject/(\d+)",
|
2023-10-31 04:29:28 -04:00
|
|
|
|
r"https://bangumi\.tv/subject/(\d+)",
|
2024-12-07 14:31:17 +00:00
|
|
|
|
r"https://chii\.in/subject/(\d+)",
|
2022-12-09 03:09:06 +00:00
|
|
|
|
]
|
2022-12-29 23:57:02 -05:00
|
|
|
|
WIKI_PROPERTY_ID = ""
|
2022-12-09 03:09:06 +00:00
|
|
|
|
DEFAULT_MODEL = None
|
|
|
|
|
|
|
|
|
|
@classmethod
|
2025-01-20 23:24:20 +08:00
|
|
|
|
def get_category(
|
|
|
|
|
cls, o: dict[str, Any], fetch_resources: bool = False
|
|
|
|
|
) -> tuple[ItemCategory, dict[str, Any]]:
|
|
|
|
|
dt = o.get("date")
|
2023-10-31 04:29:28 -04:00
|
|
|
|
pub_year = None
|
|
|
|
|
pub_month = None
|
2025-01-20 23:24:20 +08:00
|
|
|
|
release_year = None
|
|
|
|
|
release_type = None
|
|
|
|
|
showtime = None
|
2023-10-31 04:29:28 -04:00
|
|
|
|
year = None
|
2025-01-20 23:24:20 +08:00
|
|
|
|
related_resources = []
|
2023-10-31 04:29:28 -04:00
|
|
|
|
match o["type"]:
|
|
|
|
|
case 1:
|
|
|
|
|
model = "Edition"
|
2025-01-20 23:24:20 +08:00
|
|
|
|
category = ItemCategory.Book
|
|
|
|
|
if o["series"] and fetch_resources:
|
|
|
|
|
# model = "Series" TODO
|
|
|
|
|
res = (
|
|
|
|
|
BasicDownloader(
|
|
|
|
|
f"https://api.bgm.tv/v0/subjects/{o['id']}/subjects",
|
|
|
|
|
headers={
|
|
|
|
|
"User-Agent": settings.NEODB_USER_AGENT,
|
|
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
.download()
|
|
|
|
|
.json()
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
for s in res:
|
|
|
|
|
if s["relation"] != "单行本":
|
|
|
|
|
continue
|
|
|
|
|
related_resources.append(
|
|
|
|
|
{
|
|
|
|
|
"url": cls.id_to_url(s["id"]),
|
|
|
|
|
}
|
|
|
|
|
)
|
2023-10-31 04:29:28 -04:00
|
|
|
|
if dt:
|
|
|
|
|
d = dt.split("-")
|
|
|
|
|
pub_year = d[0]
|
|
|
|
|
pub_month = d[1]
|
|
|
|
|
case 2 | 6:
|
2025-01-20 23:24:20 +08:00
|
|
|
|
is_season = o["platform"] in {
|
|
|
|
|
"TV",
|
|
|
|
|
"OVA", # may be movie in other sites
|
|
|
|
|
"WEB",
|
|
|
|
|
"电视剧",
|
|
|
|
|
"欧美剧",
|
|
|
|
|
"日剧",
|
|
|
|
|
"华语剧",
|
|
|
|
|
"综艺",
|
|
|
|
|
}
|
|
|
|
|
category = ItemCategory.TV if is_season else ItemCategory.Movie
|
|
|
|
|
model = "TVSeason" if is_season else "Movie"
|
|
|
|
|
if "舞台剧" in [
|
|
|
|
|
t["name"] for t in o["tags"]
|
|
|
|
|
]: # 只能这样判断舞台剧了,bangumi三次元分类太少
|
|
|
|
|
category = ItemCategory.Performance
|
|
|
|
|
model = "Performance"
|
2023-10-31 04:29:28 -04:00
|
|
|
|
if dt:
|
|
|
|
|
year = dt.split("-")[0]
|
2024-06-02 14:50:07 -04:00
|
|
|
|
showtime = [
|
2025-01-20 23:24:20 +08:00
|
|
|
|
{"time": dt, "region": "首播日期" if is_season else "发布日期"}
|
2024-06-02 14:50:07 -04:00
|
|
|
|
]
|
2023-10-31 04:29:28 -04:00
|
|
|
|
case 3:
|
|
|
|
|
model = "Album"
|
2025-01-20 23:24:20 +08:00
|
|
|
|
category = ItemCategory.Music
|
2023-10-31 04:29:28 -04:00
|
|
|
|
case 4:
|
|
|
|
|
model = "Game"
|
2025-01-20 23:24:20 +08:00
|
|
|
|
category = ItemCategory.Game
|
|
|
|
|
match o["platform"]:
|
|
|
|
|
case "游戏":
|
|
|
|
|
release_type = GameReleaseType.GAME
|
|
|
|
|
case "扩展包":
|
|
|
|
|
release_type = GameReleaseType.DLC
|
2023-10-31 04:29:28 -04:00
|
|
|
|
case _:
|
|
|
|
|
raise ValueError(
|
2025-01-20 10:31:01 -05:00
|
|
|
|
f"Unknown type {o['type']} for bangumi subject {o['id']}"
|
2023-10-31 04:29:28 -04:00
|
|
|
|
)
|
2025-01-20 23:24:20 +08:00
|
|
|
|
return category, {
|
|
|
|
|
"preferred_model": model,
|
|
|
|
|
"related_resources": related_resources,
|
|
|
|
|
"pub_year": pub_year,
|
|
|
|
|
"pub_month": pub_month,
|
|
|
|
|
"release_year": release_year,
|
|
|
|
|
"release_type": release_type,
|
|
|
|
|
"showtime": showtime,
|
|
|
|
|
"year": year,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@classmethod
|
|
|
|
|
def id_to_url(cls, id_value):
|
|
|
|
|
return f"https://bgm.tv/subject/{id_value}"
|
|
|
|
|
|
|
|
|
|
@classmethod
|
|
|
|
|
async def search_task(
|
|
|
|
|
cls, query: str, page: int, category: str, page_size: int
|
|
|
|
|
) -> list[ExternalSearchResultItem]:
|
|
|
|
|
results = []
|
|
|
|
|
bgm_type = {
|
|
|
|
|
"all": None,
|
|
|
|
|
"movietv": [2, 6],
|
|
|
|
|
"movie": [2, 6],
|
|
|
|
|
"tv": [2, 6],
|
|
|
|
|
"book": [1],
|
|
|
|
|
"game": [4],
|
|
|
|
|
"performance": [6],
|
|
|
|
|
"music": [3],
|
|
|
|
|
}
|
|
|
|
|
if category not in bgm_type:
|
|
|
|
|
return results
|
2025-01-20 10:31:01 -05:00
|
|
|
|
search_url = f"https://api.bgm.tv/v0/search/subjects?limit={page_size}&offset={(page - 1) * page_size}"
|
2025-01-20 23:24:20 +08:00
|
|
|
|
async with httpx.AsyncClient() as client:
|
|
|
|
|
try:
|
|
|
|
|
response = await client.post(
|
|
|
|
|
search_url,
|
|
|
|
|
headers={"User-Agent": settings.NEODB_USER_AGENT},
|
|
|
|
|
json={"keyword": query, "filter": {"type": bgm_type[category]}},
|
|
|
|
|
timeout=2,
|
|
|
|
|
)
|
|
|
|
|
r = response.json()
|
|
|
|
|
for s in r["data"]:
|
|
|
|
|
cat, _ = cls.get_category(s)
|
|
|
|
|
results.append(
|
|
|
|
|
ExternalSearchResultItem(
|
|
|
|
|
category=cat,
|
|
|
|
|
source_site=cls.SITE_NAME,
|
|
|
|
|
source_url=cls.id_to_url(s["id"]),
|
|
|
|
|
title=s["name"],
|
|
|
|
|
subtitle="",
|
|
|
|
|
brief=s.get("summary", ""),
|
|
|
|
|
cover_url=s["images"].get("common"),
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
except Exception as e:
|
|
|
|
|
logger.error(
|
|
|
|
|
"Bangumi search error", extra={"query": query, "exception": e}
|
|
|
|
|
)
|
|
|
|
|
return results
|
|
|
|
|
|
|
|
|
|
def scrape(self):
|
|
|
|
|
api_url = f"https://api.bgm.tv/v0/subjects/{self.id_value}"
|
|
|
|
|
o = (
|
|
|
|
|
BasicDownloader(
|
|
|
|
|
api_url,
|
|
|
|
|
headers={
|
|
|
|
|
"User-Agent": settings.NEODB_USER_AGENT,
|
|
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
.download()
|
|
|
|
|
.json()
|
|
|
|
|
)
|
|
|
|
|
category, data = self.get_category(o, True)
|
2023-10-31 04:29:28 -04:00
|
|
|
|
title = o.get("name_cn") or o.get("name")
|
|
|
|
|
orig_title = o.get("name") if o.get("name") != title else None
|
|
|
|
|
brief = o.get("summary")
|
2025-01-20 23:24:20 +08:00
|
|
|
|
episodes = o.get("total_episodes", 0)
|
2023-10-31 04:29:28 -04:00
|
|
|
|
genre = None
|
|
|
|
|
platform = None
|
|
|
|
|
other_title = []
|
|
|
|
|
imdb_code = None
|
|
|
|
|
isbn_type = None
|
|
|
|
|
isbn = None
|
|
|
|
|
language = None
|
|
|
|
|
pub_house = None
|
2025-01-20 23:24:20 +08:00
|
|
|
|
orig_creator = None
|
|
|
|
|
authors = []
|
2023-10-31 04:29:28 -04:00
|
|
|
|
site = None
|
|
|
|
|
director = None
|
2025-01-20 23:24:20 +08:00
|
|
|
|
playwright = None
|
|
|
|
|
actor = None
|
2024-12-07 14:31:17 +00:00
|
|
|
|
pages = None
|
|
|
|
|
price = None
|
2025-01-20 23:24:20 +08:00
|
|
|
|
opening_date = None
|
|
|
|
|
closing_date = None
|
|
|
|
|
location = None
|
2023-10-31 04:29:28 -04:00
|
|
|
|
for i in o.get("infobox", []):
|
2025-01-20 23:24:20 +08:00
|
|
|
|
k = i["key"].lower()
|
2023-10-31 04:29:28 -04:00
|
|
|
|
v = i["value"]
|
|
|
|
|
match k:
|
|
|
|
|
case "别名":
|
2024-07-16 22:55:29 -04:00
|
|
|
|
other_title = (
|
|
|
|
|
[d["v"] for d in v]
|
2024-10-28 11:14:22 -04:00
|
|
|
|
if isinstance(v, list)
|
2024-07-16 22:55:29 -04:00
|
|
|
|
else ([v] if isinstance(v, str) else [])
|
|
|
|
|
)
|
2025-01-20 23:24:20 +08:00
|
|
|
|
case "话数":
|
|
|
|
|
try:
|
|
|
|
|
episodes = int(v)
|
|
|
|
|
except ValueError:
|
|
|
|
|
pass
|
2023-10-31 04:29:28 -04:00
|
|
|
|
case "imdb_id":
|
|
|
|
|
imdb_code = v
|
2025-01-20 23:24:20 +08:00
|
|
|
|
case "isbn":
|
2023-10-31 04:29:28 -04:00
|
|
|
|
isbn_type, isbn = detect_isbn_asin(v)
|
|
|
|
|
case "语言":
|
|
|
|
|
language = v
|
|
|
|
|
case "出版社":
|
|
|
|
|
pub_house = v
|
|
|
|
|
case "导演":
|
|
|
|
|
director = v
|
2025-01-20 23:24:20 +08:00
|
|
|
|
case "编剧" | "脚本":
|
|
|
|
|
playwright = (
|
|
|
|
|
[d["v"] for d in v]
|
|
|
|
|
if isinstance(v, list)
|
|
|
|
|
else ([v] if isinstance(v, str) else [])
|
|
|
|
|
)
|
|
|
|
|
case "原作":
|
|
|
|
|
match category:
|
|
|
|
|
case ItemCategory.Book:
|
|
|
|
|
authors.append(v)
|
|
|
|
|
case ItemCategory.Performance:
|
|
|
|
|
orig_creator = (
|
|
|
|
|
[d["v"] for d in v]
|
|
|
|
|
if isinstance(v, list)
|
|
|
|
|
else ([v] if isinstance(v, str) else [])
|
|
|
|
|
)
|
|
|
|
|
case "作画":
|
|
|
|
|
authors.append(v)
|
2023-10-31 04:29:28 -04:00
|
|
|
|
case "作者":
|
2025-01-20 23:24:20 +08:00
|
|
|
|
authors.extend(
|
2024-07-16 22:55:29 -04:00
|
|
|
|
[d["v"] for d in v]
|
2024-10-28 11:14:22 -04:00
|
|
|
|
if isinstance(v, list)
|
2024-07-16 22:55:29 -04:00
|
|
|
|
else ([v] if isinstance(v, str) else [])
|
|
|
|
|
)
|
2023-10-31 04:29:28 -04:00
|
|
|
|
case "平台":
|
2024-07-16 22:55:29 -04:00
|
|
|
|
platform = (
|
|
|
|
|
[d["v"] for d in v]
|
2024-10-28 11:14:22 -04:00
|
|
|
|
if isinstance(v, list)
|
2024-07-16 22:55:29 -04:00
|
|
|
|
else ([v] if isinstance(v, str) else [])
|
|
|
|
|
)
|
2025-01-20 23:24:20 +08:00
|
|
|
|
case "游戏类型" | "类型":
|
2024-07-16 12:10:46 -04:00
|
|
|
|
genre = (
|
2024-07-16 22:55:29 -04:00
|
|
|
|
[d["v"] for d in v]
|
2024-07-16 12:10:46 -04:00
|
|
|
|
if isinstance(v, list)
|
|
|
|
|
else ([v] if isinstance(v, str) else [])
|
|
|
|
|
)
|
2023-10-31 04:29:28 -04:00
|
|
|
|
case "官方网站" | "website":
|
2024-10-28 11:14:22 -04:00
|
|
|
|
site = v[0] if isinstance(v, list) else v
|
2024-12-07 14:31:17 +00:00
|
|
|
|
case "页数":
|
|
|
|
|
pages = v
|
|
|
|
|
case "价格":
|
|
|
|
|
price = v
|
2025-01-20 23:24:20 +08:00
|
|
|
|
case "开始":
|
|
|
|
|
opening_date = v
|
|
|
|
|
case "结束":
|
|
|
|
|
closing_date = v
|
|
|
|
|
case "演出":
|
|
|
|
|
if category == ItemCategory.Performance:
|
|
|
|
|
director = v
|
|
|
|
|
case "主演":
|
|
|
|
|
actor = (
|
|
|
|
|
[{"name": d["v"], "role": None} for d in v]
|
|
|
|
|
if isinstance(v, list)
|
|
|
|
|
else (
|
|
|
|
|
[{"name": w, "role": None} for w in v.split("、")]
|
|
|
|
|
if isinstance(v, str)
|
|
|
|
|
else []
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
case "会场" | "演出地点":
|
|
|
|
|
location = v
|
2023-10-31 04:29:28 -04:00
|
|
|
|
|
|
|
|
|
img_url = o["images"].get("large") or o["images"].get("common")
|
|
|
|
|
raw_img = None
|
|
|
|
|
ext = None
|
|
|
|
|
if img_url:
|
|
|
|
|
raw_img, ext = BasicImageDownloader.download_image(
|
|
|
|
|
img_url, None, headers={}
|
|
|
|
|
)
|
2024-12-07 15:01:26 +00:00
|
|
|
|
titles = OrderedDict.fromkeys(
|
2024-07-13 00:16:47 -04:00
|
|
|
|
[title] + (other_title or []) + ([orig_title] if orig_title else [])
|
|
|
|
|
)
|
2024-12-07 15:01:26 +00:00
|
|
|
|
if o.get("name_cn"):
|
|
|
|
|
titles[o.get("name_cn")] = "zh-cn"
|
|
|
|
|
localized_title = [
|
2024-12-26 11:47:36 -05:00
|
|
|
|
{"lang": lang or detect_language(t), "text": t}
|
|
|
|
|
for t, lang in titles.items()
|
2024-12-07 15:01:26 +00:00
|
|
|
|
]
|
2024-07-16 00:51:05 -04:00
|
|
|
|
localized_desc = (
|
|
|
|
|
[{"lang": detect_language(brief), "text": brief}] if brief else []
|
|
|
|
|
)
|
2025-01-20 23:24:20 +08:00
|
|
|
|
data.update(
|
|
|
|
|
{
|
|
|
|
|
"localized_title": localized_title,
|
|
|
|
|
"localized_description": localized_desc,
|
|
|
|
|
"title": title,
|
|
|
|
|
"orig_title": orig_title,
|
|
|
|
|
"other_title": other_title or None,
|
|
|
|
|
"orig_creator": orig_creator,
|
|
|
|
|
"author": authors,
|
|
|
|
|
"genre": genre,
|
|
|
|
|
"translator": None,
|
|
|
|
|
"director": director,
|
|
|
|
|
"playwright": playwright,
|
|
|
|
|
"actor": actor,
|
|
|
|
|
"language": language,
|
|
|
|
|
"platform": platform,
|
|
|
|
|
"imdb_code": imdb_code,
|
|
|
|
|
"pub_house": pub_house,
|
|
|
|
|
"binding": None,
|
|
|
|
|
"episode_count": episodes or None,
|
|
|
|
|
"official_site": site,
|
|
|
|
|
"site": site,
|
|
|
|
|
"isbn": isbn,
|
|
|
|
|
"brief": brief,
|
|
|
|
|
"cover_image_url": img_url,
|
|
|
|
|
"pages": pages,
|
|
|
|
|
"price": price,
|
|
|
|
|
"opening_date": opening_date,
|
|
|
|
|
"closing_date": closing_date,
|
|
|
|
|
"location": location,
|
|
|
|
|
}
|
|
|
|
|
)
|
2023-10-31 04:29:28 -04:00
|
|
|
|
lookup_ids = {}
|
|
|
|
|
if isbn:
|
|
|
|
|
lookup_ids[isbn_type] = isbn
|
|
|
|
|
if imdb_code:
|
|
|
|
|
lookup_ids[IdType.IMDB] = imdb_code
|
|
|
|
|
return ResourceContent(
|
|
|
|
|
metadata={k: v for k, v in data.items() if v is not None},
|
|
|
|
|
cover_image=raw_img,
|
|
|
|
|
cover_image_extention=ext,
|
|
|
|
|
lookup_ids=lookup_ids,
|
|
|
|
|
)
|