"""
IMDB site adapter.

Resolves IMDB title ids (ttXXXXXXX) through TMDB's find-by-external-id
lookup when possible, delegating the actual scrape to the TMDB site
classes; titles unknown to TMDB are scraped directly from IMDB's
embedded __NEXT_DATA__ JSON payload.
"""
import json
import logging

from catalog.common import *
from catalog.movie.models import *
from catalog.tv.models import *

from .tmdb import search_tmdb_by_imdb_id

# Module-level logger, used for non-fatal scrape issues (e.g. cover download failures).
_logger = logging.getLogger(__name__)
@SiteManager.register
class IMDB(AbstractSite):
    """
    Site class for IMDB titles.

    Strategy: an IMDB id is first looked up via TMDB
    (``search_tmdb_by_imdb_id``); when TMDB knows the title, scraping is
    delegated to the matching TMDB site class so metadata stays consistent
    with other TMDB-sourced items. Only when TMDB has no match does
    :meth:`scrape_imdb` parse IMDB's own pages.
    """

    SITE_NAME = SiteName.IMDB
    ID_TYPE = IdType.IMDB
    URL_PATTERNS = [
        # Dots are escaped so that only real imdb.com hosts match
        # (previously "." matched any character).
        r"\w+://www\.imdb\.com/title/(tt\d+)",
        r"\w+://m\.imdb\.com/title/(tt\d+)",
    ]
    WIKI_PROPERTY_ID = "?"

    @classmethod
    def id_to_url(cls, id_value):
        """Return the canonical IMDB title URL for an IMDB id (e.g. "tt0111161")."""
        return "https://www.imdb.com/title/" + id_value + "/"

    def scrape(self):
        """
        Scrape metadata for this IMDB title.

        Checks TMDB's external-id lookup results in priority order
        (movie, TV show, season, episode) and delegates to the matching
        TMDB site class. An episode match is only accepted when it maps
        cleanly to a season (first episode) or to a special (season 0);
        otherwise a ParseError is raised. Falls back to scrape_imdb()
        when TMDB has no match at all.

        Returns:
            ResourceContent with metadata["preferred_model"] set from the
            delegated TMDB site's DEFAULT_MODEL (or from scrape_imdb).

        Raises:
            ParseError: when the id maps to a TMDB episode other than a
                season's first episode.
        """
        res_data = search_tmdb_by_imdb_id(self.id_value)
        if (
            "movie_results" in res_data
            and len(res_data["movie_results"]) > 0
            and self.DEFAULT_MODEL in [None, Movie]
        ):
            url = (
                f"https://www.themoviedb.org/movie/{res_data['movie_results'][0]['id']}"
            )
        elif "tv_results" in res_data and len(res_data["tv_results"]) > 0:
            url = f"https://www.themoviedb.org/tv/{res_data['tv_results'][0]['id']}"
        elif "tv_season_results" in res_data and len(res_data["tv_season_results"]) > 0:
            # this should not happen given IMDB only has ids for either show or episode
            tv_id = res_data["tv_season_results"][0]["show_id"]
            season_number = res_data["tv_season_results"][0]["season_number"]
            url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}"
        elif (
            "tv_episode_results" in res_data and len(res_data["tv_episode_results"]) > 0
        ):
            tv_id = res_data["tv_episode_results"][0]["show_id"]
            season_number = res_data["tv_episode_results"][0]["season_number"]
            episode_number = res_data["tv_episode_results"][0]["episode_number"]
            if season_number == 0:
                # Season 0 holds TMDB "specials": keep the full episode URL.
                url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}/episode/{episode_number}"
            elif episode_number == 1:
                # First episode of a regular season: treat the id as the season itself.
                url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}"
            else:
                raise ParseError(
                    self,
                    "IMDB id matching TMDB but not first episode, this is not supported",
                )
        else:
            # IMDB id not found in TMDB use real IMDB scraper
            return self.scrape_imdb()
        tmdb = SiteManager.get_site_by_url(url)
        pd = tmdb.scrape()
        pd.metadata["preferred_model"] = tmdb.DEFAULT_MODEL.__name__
        return pd

    def scrape_imdb(self):
        """
        Scrape IMDB directly, for titles TMDB does not know.

        Parses the JSON embedded in the page's ``__NEXT_DATA__`` script tag
        and extracts basic metadata; attempts a best-effort cover download
        (failure is logged, not raised).

        Returns:
            ResourceContent with title/year/genre/brief metadata, the IMDB
            lookup id, and (when available) the cover image bytes.

        Raises:
            ParseError: when the page has no ``__NEXT_DATA__`` payload.
        """
        h = BasicDownloader(self.url).download().html()
        elem = h.xpath('//script[@id="__NEXT_DATA__"]/text()')
        src = elem[0].strip() if elem else None
        if not src:
            raise ParseError(self, "__NEXT_DATA__ element")
        d = json.loads(src)["props"]["pageProps"]["aboveTheFoldData"]
        data = {
            "title": d["titleText"]["text"],
            "year": d["releaseYear"]["year"],
            "is_series": d["titleType"]["isSeries"],
            "is_episode": d["titleType"]["isEpisode"],
            "genre": [x["text"] for x in d["genres"]["genres"]],
            # NOTE(review): stores the raw "plotText" node as-is — presumably
            # a dict with a "plainText" key; confirm downstream consumers
            # expect the node rather than the plain string.
            "brief": d["plot"].get("plotText") if d.get("plot") else None,
            "cover_image_url": d["primaryImage"].get("url")
            if d.get("primaryImage")
            else None,
        }
        # TODO more data fields and localized title (in <url>releaseinfo/)
        data["preferred_model"] = (
            ""  # "TVSeason" not supported yet
            if data["is_episode"]
            else ("TVShow" if data["is_series"] else "Movie")
        )

        pd = ResourceContent(metadata=data)
        pd.lookup_ids[IdType.IMDB] = self.id_value
        if pd.metadata["cover_image_url"]:
            imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
            try:
                pd.cover_image = imgdl.download().content
                # "extention" (sic) matches the downloader's attribute spelling.
                pd.cover_image_extention = imgdl.extention
            except Exception:
                _logger.debug(
                    f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
                )
        return pd