lib.itmens/catalog/sites/tmdb.py

420 lines
15 KiB
Python
Raw Normal View History

"""
The Movie Database
"""
import re
from django.conf import settings
from catalog.common import *
from .douban import *
from catalog.movie.models import *
from catalog.tv.models import *
import logging
2022-12-08 16:59:03 +00:00
_logger = logging.getLogger(__name__)
def search_tmdb_by_imdb_id(imdb_id):
tmdb_api_url = f"https://api.themoviedb.org/3/find/{imdb_id}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&external_source=imdb_id"
res_data = BasicDownloader(tmdb_api_url).download().json()
return res_data
2022-12-16 01:08:10 -05:00
def query_tmdb_tv_episode(tv, season, episode):
tmdb_api_url = f"https://api.themoviedb.org/3/tv/{tv}/season/{season}/episode/{episode}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids"
res_data = BasicDownloader(tmdb_api_url).download().json()
return res_data
def _copy_dict(s, key_map):
d = {}
for src, dst in key_map.items():
d[dst if dst else src] = s.get(src)
return d
2022-12-15 17:29:35 -05:00
@SiteManager.register
class TMDB_Movie(AbstractSite):
2022-12-16 01:08:10 -05:00
SITE_NAME = SiteName.TMDB
ID_TYPE = IdType.TMDB_Movie
2022-12-29 23:57:02 -05:00
URL_PATTERNS = [r"\w+://www.themoviedb.org/movie/(\d+)"]
WIKI_PROPERTY_ID = "?"
DEFAULT_MODEL = Movie
@classmethod
def id_to_url(self, id_value):
return f"https://www.themoviedb.org/movie/{id_value}"
def scrape(self):
is_series = False
if is_series:
api_url = f"https://api.themoviedb.org/3/tv/{self.id_value}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
else:
api_url = f"https://api.themoviedb.org/3/movie/{self.id_value}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
res_data = BasicDownloader(api_url).download().json()
if is_series:
2022-12-29 23:57:02 -05:00
title = res_data["name"]
orig_title = res_data["original_name"]
year = (
int(res_data["first_air_date"].split("-")[0])
if res_data["first_air_date"]
else None
)
imdb_code = res_data["external_ids"]["imdb_id"]
showtime = (
2023-06-07 04:14:38 -04:00
[{"time": res_data["first_air_date"], "region": "首播日期"}]
2022-12-29 23:57:02 -05:00
if res_data["first_air_date"]
else None
)
duration = None
else:
2022-12-29 23:57:02 -05:00
title = res_data["title"]
orig_title = res_data["original_title"]
year = (
int(res_data["release_date"].split("-")[0])
if res_data["release_date"]
else None
)
showtime = (
2023-06-07 04:14:38 -04:00
[{"time": res_data["release_date"], "region": "发布日期"}]
2022-12-29 23:57:02 -05:00
if res_data["release_date"]
else None
)
imdb_code = res_data["imdb_id"]
# in minutes
2022-12-29 23:57:02 -05:00
duration = res_data["runtime"] if res_data["runtime"] else None
2022-12-29 23:57:02 -05:00
genre = [x["name"] for x in res_data["genres"]]
language = list(map(lambda x: x["name"], res_data["spoken_languages"]))
brief = res_data["overview"]
if is_series:
2022-12-29 23:57:02 -05:00
director = list(map(lambda x: x["name"], res_data["created_by"]))
else:
2022-12-29 23:57:02 -05:00
director = list(
map(
lambda x: x["name"],
filter(
lambda c: c["job"] == "Director", res_data["credits"]["crew"]
),
)
)
playwright = list(
map(
lambda x: x["name"],
filter(lambda c: c["job"] == "Screenplay", res_data["credits"]["crew"]),
)
)
actor = list(map(lambda x: x["name"], res_data["credits"]["cast"]))
area = []
other_info = {}
# other_info['TMDB评分'] = res_data['vote_average']
# other_info['分级'] = res_data['contentRating']
# other_info['Metacritic评分'] = res_data['metacriticRating']
# other_info['奖项'] = res_data['awards']
# other_info['TMDB_ID'] = id
if is_series:
2022-12-29 23:57:02 -05:00
other_info["Seasons"] = res_data["number_of_seasons"]
other_info["Episodes"] = res_data["number_of_episodes"]
# TODO: use GET /configuration to get base url
2022-12-29 23:57:02 -05:00
img_url = (
("https://image.tmdb.org/t/p/original/" + res_data["poster_path"])
if res_data["poster_path"] is not None
else None
)
pd = ResourceContent(
metadata={
"title": title,
"orig_title": orig_title,
"other_title": [],
2022-12-29 23:57:02 -05:00
"imdb_code": imdb_code,
"director": director,
"playwright": playwright,
"actor": actor,
"genre": genre,
"showtime": showtime,
"site": None,
"area": area,
"language": language,
"year": year,
"duration": duration,
"season": None,
"episodes": None,
"single_episode_length": None,
"brief": brief,
"cover_image_url": img_url,
}
)
if imdb_code:
pd.lookup_ids[IdType.IMDB] = imdb_code
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
2022-12-29 23:57:02 -05:00
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd
2022-12-15 17:29:35 -05:00
@SiteManager.register
class TMDB_TV(AbstractSite):
2022-12-27 14:52:03 -05:00
SITE_NAME = SiteName.TMDB
ID_TYPE = IdType.TMDB_TV
2022-12-29 23:57:02 -05:00
URL_PATTERNS = [
r"\w+://www.themoviedb.org/tv/(\d+)[^/]*$",
r"\w+://www.themoviedb.org/tv/(\d+)[^/]*/seasons",
]
WIKI_PROPERTY_ID = "?"
DEFAULT_MODEL = TVShow
@classmethod
def id_to_url(self, id_value):
return f"https://www.themoviedb.org/tv/{id_value}"
def scrape(self):
is_series = True
if is_series:
api_url = f"https://api.themoviedb.org/3/tv/{self.id_value}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
else:
api_url = f"https://api.themoviedb.org/3/movie/{self.id_value}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
res_data = BasicDownloader(api_url).download().json()
if is_series:
2022-12-29 23:57:02 -05:00
title = res_data["name"]
orig_title = res_data["original_name"]
year = (
int(res_data["first_air_date"].split("-")[0])
if res_data["first_air_date"]
else None
)
imdb_code = res_data["external_ids"]["imdb_id"]
showtime = (
2023-06-07 04:14:38 -04:00
[{"time": res_data["first_air_date"], "region": "首播日期"}]
2022-12-29 23:57:02 -05:00
if res_data["first_air_date"]
else None
)
duration = None
else:
2022-12-29 23:57:02 -05:00
title = res_data["title"]
orig_title = res_data["original_title"]
year = (
int(res_data["release_date"].split("-")[0])
if res_data["release_date"]
else None
)
showtime = (
2023-06-07 04:14:38 -04:00
[{"time": res_data["release_date"], "region": "发布日期"}]
2022-12-29 23:57:02 -05:00
if res_data["release_date"]
else None
)
imdb_code = res_data["imdb_id"]
# in minutes
2022-12-29 23:57:02 -05:00
duration = res_data["runtime"] if res_data["runtime"] else None
2022-12-29 23:57:02 -05:00
genre = [x["name"] for x in res_data["genres"]]
2022-12-16 01:08:10 -05:00
2022-12-29 23:57:02 -05:00
language = list(map(lambda x: x["name"], res_data["spoken_languages"]))
brief = res_data["overview"]
if is_series:
2022-12-29 23:57:02 -05:00
director = list(map(lambda x: x["name"], res_data["created_by"]))
else:
2022-12-29 23:57:02 -05:00
director = list(
map(
lambda x: x["name"],
filter(
lambda c: c["job"] == "Director", res_data["credits"]["crew"]
),
)
)
playwright = list(
map(
lambda x: x["name"],
filter(lambda c: c["job"] == "Screenplay", res_data["credits"]["crew"]),
)
)
actor = list(map(lambda x: x["name"], res_data["credits"]["cast"]))
area = []
other_info = {}
# other_info['TMDB评分'] = res_data['vote_average']
# other_info['分级'] = res_data['contentRating']
# other_info['Metacritic评分'] = res_data['metacriticRating']
# other_info['奖项'] = res_data['awards']
# other_info['TMDB_ID'] = id
if is_series:
2022-12-29 23:57:02 -05:00
other_info["Seasons"] = res_data["number_of_seasons"]
other_info["Episodes"] = res_data["number_of_episodes"]
# TODO: use GET /configuration to get base url
2022-12-29 23:57:02 -05:00
img_url = (
("https://image.tmdb.org/t/p/original/" + res_data["poster_path"])
if res_data["poster_path"] is not None
else None
)
season_links = list(
map(
lambda s: {
"model": "TVSeason",
"id_type": IdType.TMDB_TVSeason,
"id_value": f'{self.id_value}-{s["season_number"]}',
"title": s["name"],
"url": f'{self.url}/season/{s["season_number"]}',
},
res_data["seasons"],
)
)
pd = ResourceContent(
metadata={
"title": title,
"orig_title": orig_title,
"other_title": [],
2022-12-29 23:57:02 -05:00
"imdb_code": imdb_code,
"director": director,
"playwright": playwright,
"actor": actor,
"genre": genre,
"showtime": showtime,
"site": None,
"area": area,
"language": language,
"year": year,
"duration": duration,
"season_count": res_data["number_of_seasons"],
"season": None,
"episodes": None,
"single_episode_length": None,
"brief": brief,
"cover_image_url": img_url,
2023-05-22 17:54:18 -04:00
"related_resources": season_links, # FIXME crawling them for now, but many douban tv season data may have wrong imdb links
2022-12-29 23:57:02 -05:00
}
)
if imdb_code:
pd.lookup_ids[IdType.IMDB] = imdb_code
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
2022-12-29 23:57:02 -05:00
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd
2022-12-15 17:29:35 -05:00
@SiteManager.register
class TMDB_TVSeason(AbstractSite):
2022-12-27 14:52:03 -05:00
SITE_NAME = SiteName.TMDB
ID_TYPE = IdType.TMDB_TVSeason
2022-12-29 23:57:02 -05:00
URL_PATTERNS = [r"\w+://www.themoviedb.org/tv/(\d+)[^/]*/season/(\d+)[^/]*$"]
WIKI_PROPERTY_ID = "?"
DEFAULT_MODEL = TVSeason
2022-12-29 23:57:02 -05:00
ID_PATTERN = r"^(\d+)-(\d+)$"
@classmethod
def url_to_id(cls, url: str):
2022-12-29 23:57:02 -05:00
u = next(
iter([re.match(p, url) for p in cls.URL_PATTERNS if re.match(p, url)]), None
)
return u[1] + "-" + u[2] if u else None
@classmethod
def id_to_url(cls, id_value):
2022-12-29 23:57:02 -05:00
v = id_value.split("-")
return f"https://www.themoviedb.org/tv/{v[0]}/season/{v[1]}"
def scrape(self):
2022-12-29 23:57:02 -05:00
v = self.id_value.split("-")
2023-01-07 12:00:09 -05:00
show_id = v[0]
season_id = v[1]
site = TMDB_TV(TMDB_TV.id_to_url(show_id))
show_resource = site.get_resource_ready(auto_create=False, auto_link=False)
api_url = f"https://api.themoviedb.org/3/tv/{show_id}/season/{season_id}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
d = BasicDownloader(api_url).download().json()
2022-12-29 23:57:02 -05:00
if not d.get("id"):
raise ParseError("id")
pd = ResourceContent(
metadata=_copy_dict(
d,
{
"name": "title",
"overview": "brief",
"air_date": "air_date",
"season_number": 0,
"external_ids": [],
},
)
)
2023-01-07 12:00:09 -05:00
pd.metadata["title"] = (
show_resource.metadata["title"] + " " + pd.metadata["title"]
)
2022-12-29 23:57:02 -05:00
pd.metadata["required_resources"] = [
{
"model": "TVShow",
"id_type": IdType.TMDB_TV,
"id_value": show_id,
"title": f"TMDB TV Show {show_id}",
"url": f"https://www.themoviedb.org/tv/{show_id}",
2022-12-29 23:57:02 -05:00
}
]
pd.lookup_ids[IdType.IMDB] = d["external_ids"].get("imdb_id")
pd.metadata["cover_image_url"] = (
("https://image.tmdb.org/t/p/original/" + d["poster_path"])
if d["poster_path"]
else None
)
pd.metadata["title"] = (
pd.metadata["title"]
if pd.metadata["title"]
else f'Season {d["season_number"]}'
)
pd.metadata["episode_number_list"] = list(
map(lambda ep: ep["episode_number"], d["episodes"])
)
pd.metadata["episode_count"] = len(pd.metadata["episode_number_list"])
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
2022-12-29 23:57:02 -05:00
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
# use show's IMDB (for Season 1) or 1st episode's IMDB (if not Season 1) as this season's IMDB so that it can be compatible with TVSeason data from Douban
if pd.lookup_ids.get(IdType.IMDB):
# this should not happen
_logger.warning("Unexpected IMDB id for TMDB tv season")
elif pd.metadata.get("season_number") == 1:
res = SiteManager.get_site_by_url(
f"https://www.themoviedb.org/tv/{show_id}"
).get_resource_ready()
pd.lookup_ids[IdType.IMDB] = (
res.other_lookup_ids.get(IdType.IMDB) if res else None
)
elif len(pd.metadata["episode_number_list"]) == 0:
_logger.warning(
"Unable to lookup IMDB id for TMDB tv season with zero episodes"
)
else:
ep = pd.metadata["episode_number_list"][0]
api_url2 = f"https://api.themoviedb.org/3/tv/{v[0]}/season/{v[1]}/episode/{ep}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
d2 = BasicDownloader(api_url2).download().json()
if not d2.get("id"):
raise ParseError("first episode id for season")
pd.lookup_ids[IdType.IMDB] = d2["external_ids"].get("imdb_id")
return pd