import json
import logging

from catalog.common import *
from catalog.movie.models import *
from catalog.tv.models import *

from .tmdb import search_tmdb_by_imdb_id

# Module-level logger, named after this module so output can be filtered.
_logger = logging.getLogger(__name__)
@SiteManager.register
class IMDB(AbstractSite):
    """
    IMDb site manager.

    IMDb ids map to Movie, TVShow or TVEpisode. Scraping prefers resolving
    the id through TMDB's lookup API and delegating to the matching TMDB
    site class; when TMDB does not know the id, it falls back to parsing
    IMDb's own pages directly.
    """

    SITE_NAME = SiteName.IMDB
    ID_TYPE = IdType.IMDB
    URL_PATTERNS = [
        # dots escaped so stray characters (e.g. "wwwXimdb.com") don't match
        r"\w+://www\.imdb\.com/title/(tt\d+)",
        r"\w+://m\.imdb\.com/title/(tt\d+)",
    ]
    WIKI_PROPERTY_ID = "?"

    @classmethod
    def id_to_url(cls, id_value):
        """Return the canonical IMDb URL for a tt-prefixed id."""
        return "https://www.imdb.com/title/" + id_value + "/"

    def scrape(self):
        """
        Fetch metadata for this IMDb id.

        First asks TMDB to resolve the IMDb id. If TMDB knows it (as a
        movie, show, season or episode), scraping is delegated to the
        matching TMDB site class; otherwise scrape_imdb() parses IMDb
        directly. Returns a ResourceContent.
        """
        res_data = search_tmdb_by_imdb_id(self.id_value)
        url = None
        pd = None
        if (
            "movie_results" in res_data
            and len(res_data["movie_results"]) > 0
            and self.DEFAULT_MODEL in [None, Movie]
        ):
            url = (
                f"https://www.themoviedb.org/movie/{res_data['movie_results'][0]['id']}"
            )
        elif "tv_results" in res_data and len(res_data["tv_results"]) > 0:
            url = f"https://www.themoviedb.org/tv/{res_data['tv_results'][0]['id']}"
        elif "tv_season_results" in res_data and len(res_data["tv_season_results"]) > 0:
            # this should not happen given IMDB only has ids for either show or episode
            tv_id = res_data["tv_season_results"][0]["show_id"]
            season_number = res_data["tv_season_results"][0]["season_number"]
            url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}"
        elif (
            "tv_episode_results" in res_data and len(res_data["tv_episode_results"]) > 0
        ):
            tv_id = res_data["tv_episode_results"][0]["show_id"]
            season_number = res_data["tv_episode_results"][0]["season_number"]
            episode_number = res_data["tv_episode_results"][0]["episode_number"]
            url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}/episode/{episode_number}"
        if url:
            tmdb = SiteManager.get_site_by_url(url)
            if tmdb:
                pd = tmdb.scrape()
                pd.metadata["preferred_model"] = (
                    tmdb.DEFAULT_MODEL.__name__ if tmdb.DEFAULT_MODEL else None
                )
                # do not auto fetch parent season
                pd.metadata["required_resources"] = []
        if not pd:
            # if IMDB id not found in TMDB, use real IMDB scraper
            pd = self.scrape_imdb()
        return pd

    def scrape_imdb(self):
        """
        Scrape metadata straight from the IMDb title page.

        Parses the JSON embedded in the page's __NEXT_DATA__ script element
        and maps it to ResourceContent metadata. Raises ParseError when the
        element is missing. Cover download failures are logged, not raised.
        """
        h = BasicDownloader(self.url).download().html()
        src = self.query_str(h, '//script[@id="__NEXT_DATA__"]/text()')
        if not src:
            raise ParseError(self, "__NEXT_DATA__ element")
        d = json.loads(src)["props"]["pageProps"]["aboveTheFoldData"]
        data = {
            "title": d["titleText"]["text"],
            "year": d["releaseYear"]["year"] if d.get("releaseYear") else None,
            "is_series": d["titleType"]["isSeries"],
            "is_episode": d["titleType"]["isEpisode"],
            "genre": (
                [x["text"] for x in d["genres"]["genres"]] if d.get("genres") else []
            ),
            "brief": d["plot"].get("plotText") if d.get("plot") else None,
            "cover_image_url": (
                d["primaryImage"].get("url") if d.get("primaryImage") else None
            ),
        }
        if d.get("series"):
            episode_info = d["series"].get("episodeNumber")
            if episode_info:
                data["season_number"] = episode_info["seasonNumber"]
                data["episode_number"] = episode_info["episodeNumber"]
            series = d["series"].get("series")
            if series:
                data["show_imdb_id"] = series["id"]
        # TODO more data fields and localized title (in <url>releaseinfo/)
        data["preferred_model"] = (
            "TVEpisode"
            if data["is_episode"]
            else ("TVShow" if data["is_series"] else "Movie")
        )
        # rewrite placeholder titles like "Episode #3.2" to "S3E2"
        # (dot escaped in the pattern so it only matches a literal ".")
        if data["preferred_model"] == "TVEpisode" and data["title"].startswith(
            "Episode #"
        ):
            data["title"] = re.sub(r"#(\d+)\.(\d+)", r"S\1E\2", data["title"][8:])
        pd = ResourceContent(metadata=data)
        pd.lookup_ids[IdType.IMDB] = self.id_value
        if pd.metadata["cover_image_url"]:
            imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
            try:
                pd.cover_image = imgdl.download().content
                # "extention" is the attribute name used by the project API
                pd.cover_image_extention = imgdl.extention
            except Exception:
                _logger.debug(
                    f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
                )
        return pd

    @staticmethod
    def get_episode_list(show_id, season_id):
        """
        Return a list of episode descriptors for one season of a show.

        show_id may identify the show itself or one of its episodes; the
        series link on the mobile page, when present, is followed to reach
        the actual show before fetching the season's episode list. Each
        descriptor carries model/id/url/title/episode_number keys.
        """
        url = f"https://m.imdb.com/title/{show_id}/"
        h = BasicDownloader(url).download().html()
        u: str = h.xpath('//a[@data-testid="hero-title-block__series-link"]/@href')  # type: ignore
        show_url = "".join(u).split("?")[0]
        if not show_url:
            # no series link present: show_id already points at the show
            show_url = f"/title/{show_id}/"
        url = f"https://m.imdb.com{show_url}episodes/?season={season_id}"
        h = BasicDownloader(url).download().html()
        episodes = []
        for e in h.xpath('//div[@id="eplist"]/div/a'):  # type: ignore
            episode_number = e.xpath(
                './span[contains(@class,"episode-list__title")]/text()'
            )[0].strip()
            # title text looks like "3. <name>"; the leading number is the episode
            episode_number = int(episode_number.split(".")[0])
            episode_title = " ".join(
                e.xpath('.//strong[@class="episode-list__title-text"]/text()')
            ).strip()
            episode_url = e.xpath("./@href")[0]
            episode_url = "https://www.imdb.com" + episode_url
            episodes.append(
                {
                    "model": "TVEpisode",
                    "id_type": IdType.IMDB,
                    "id_value": IMDB.url_to_id(episode_url),
                    "url": episode_url,
                    "title": episode_title,
                    "episode_number": episode_number,
                }
            )
        return episodes

    @staticmethod
    def fetch_episodes_for_season(season):
        """
        Ensure TVEpisode rows exist for every episode of the given season.

        Scrapes the season's episode list from IMDb and creates any missing
        episodes via their site resources; when nothing can be fetched,
        creates dummy placeholder episodes (capped at 50) based on the
        season's episode_count. No-op when season number or imdb id is missing.
        """
        if not season.season_number or not season.imdb:
            _logger.warning(f"season {season} is missing season number or imdb id")
            return
        episodes = IMDB.get_episode_list(season.imdb, season.season_number)
        if episodes:
            if not season.episode_count or season.episode_count < len(episodes):
                season.episode_count = len(episodes)
                season.save()
            for e in episodes:
                episode = TVEpisode.objects.filter(
                    season=season, episode_number=e["episode_number"]
                ).first()
                if not episode:
                    site = SiteManager.get_site_by_url(e["url"])
                    if site:
                        res = site.get_resource_ready()
                        if res and res.item:
                            episode = res.item
                            episode.set_parent_item(season)
                            episode.season_number = season.season_number
                            episode.save()
        else:
            _logger.warning(f"season {season} has no episodes fetched, creating dummy")
            cnt = int(season.episode_count or 0)
            # cap placeholder creation so bad data can't flood the table
            if cnt > 50:
                cnt = 50
            for i in range(1, cnt + 1):
                episode = TVEpisode.objects.filter(
                    season=season, episode_number=i
                ).first()
                if not episode:
                    TVEpisode.objects.create(
                        title=f"S{season.season_number or '0'}E{i}",
                        season=season,
                        season_number=season.season_number,
                        episode_number=i,
                    )