From 3b53d626bca458dae004fca74f5212d19c5e6cd9 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 8 Jan 2023 16:26:05 -0500 Subject: [PATCH] all douban tv map to season and add script to find links to tv show --- catalog/sites/douban_movie.py | 108 ++++++++++-------------- catalog/sites/imdb.py | 10 ++- catalog/sites/tmdb.py | 46 ++++++----- catalog/tv/tests.py | 13 +-- legacy/management/commands/link_tv.py | 114 ++++++++++++++++++++++++++ 5 files changed, 200 insertions(+), 91 deletions(-) create mode 100644 legacy/management/commands/link_tv.py diff --git a/catalog/sites/douban_movie.py b/catalog/sites/douban_movie.py index 50e85ce6..2e7355d6 100644 --- a/catalog/sites/douban_movie.py +++ b/catalog/sites/douban_movie.py @@ -216,79 +216,61 @@ class DoubanMovie(AbstractSite): } ) pd.metadata["preferred_model"] = ( - ("TVShow" if is_series else "Movie") if not season else "TVSeason" + "TVSeason" if is_series or episodes or season else "Movie" ) + tmdb_season_id = None if imdb_code: res_data = search_tmdb_by_imdb_id(imdb_code) - tmdb_show_id = None - if "movie_results" in res_data and len(res_data["movie_results"]) > 0: - pd.metadata["preferred_model"] = "Movie" - elif "tv_results" in res_data and len(res_data["tv_results"]) > 0: - if pd.metadata["preferred_model"] == "TVSeason": - """ - determine if this Douban Movie item should map to - a single season tv show, or - first season of multi-season show - """ - tmdb_show_id = res_data["tv_results"][0]["id"] - tmdb_season_id = f"{tmdb_show_id}-1" - site = TMDB_TVSeason(TMDB_TVSeason.id_to_url(tmdb_season_id)) - tmdb_tvseason = site.get_resource_ready().item - tmdb_tv = tmdb_tvseason.show - if tmdb_tv.season_count == 1: - pd.metadata["preferred_model"] = "TVShow" - # else: - # pd.metadata["preferred_model"] = "TVSeason" - # resp = query_tmdb_tv_episode(tmdb_show_id, 1, 1) - # imdb_code = resp["external_ids"]["imdb_id"] - # _logger.warning( - # f"Douban Movie {self.url} re-mapped to imdb episode {imdb_code}" - # ) - elif ( - "tv_season_results" in res_data - and len(res_data["tv_season_results"]) > 0 - ): - pd.metadata["preferred_model"] = "TVSeason" - tmdb_show_id = res_data["tv_season_results"][0]["show_id"] - tmdb_season_id = f"{tmdb_show_id}-{season}" - elif ( + has_movie = ( + "movie_results" in res_data and len(res_data["movie_results"]) > 0 + ) + has_tv = "tv_results" in res_data and len(res_data["tv_results"]) > 0 + has_episode = ( "tv_episode_results" in res_data and len(res_data["tv_episode_results"]) > 0 - ): + ) + if pd.metadata["preferred_model"] == "TVSeason" and has_tv: + if pd.metadata.get("season") and pd.metadata.get("season") != 1: + _logger.warn(f"{imdb_code} matched imdb tv show, force season 1") + pd.metadata["season"] = 1 + elif pd.metadata["preferred_model"] == "TVSeason" and has_episode: + if res_data["tv_episode_results"][0]["episode_number"] != 1: + _logger.warning( + f"Douban Movie {self.url} IMDB {imdb_code} mapping to non-first episode in a season" + ) + elif res_data["tv_episode_results"][0]["season_number"] == 1: + _logger.warning( + f"Douban Movie {self.url} IMDB {imdb_code} mapping to first season episode in a season" + ) + elif has_movie: + if pd.metadata["preferred_model"] != "Movie": + _logger.warn(f"{imdb_code} matched imdb movie, force Movie") + pd.metadata["preferred_model"] = "Movie" + elif has_tv or has_episode: + _logger.warn(f"{imdb_code} matched imdb tv/episode, force TVSeason") pd.metadata["preferred_model"] = "TVSeason" - tmdb_show_id = res_data["tv_episode_results"][0]["show_id"] - tmdb_season_id = f"{tmdb_show_id}-{season}" - # if res_data["tv_episode_results"][0]["episode_number"] != 1: - # _logger.warning( - # f"Douban Movie {self.url} mapping to unexpected imdb episode {imdb_code}" - # ) - # resp = query_tmdb_tv_episode( - # tmdb_show_id, - # res_data["tv_episode_results"][0]["season_number"], - # 1, - # ) - # imdb_code = resp["external_ids"]["imdb_id"] - # _logger.warning( - # f"Douban Movie {self.url} re-mapped to imdb episode {imdb_code}" - # ) + else: + _logger.warn(f"{imdb_code} unknown to TMDB") pd.lookup_ids[IdType.IMDB] = imdb_code - if pd.metadata["preferred_model"] == "TVSeason": - pd.lookup_ids[IdType.TMDB_TVSeason] = tmdb_season_id - elif pd.metadata["preferred_model"] == "TVShow": - pd.lookup_ids[IdType.TMDB_TV] = tmdb_show_id - # if tmdb_show_id: - # pd.metadata["required_resources"] = [ - # { - # "model": "TVShow", - # "id_type": IdType.TMDB_TV, - # "id_value": tmdb_show_id, - # "title": title, - # "url": TMDB_TV.id_to_url(tmdb_show_id), - # } - # ] + if pd.metadata["preferred_model"] == "TVSeason": + tmdb_show_id = None + if has_tv: + tmdb_show_id = res_data["tv_results"][0]["id"] + elif has_episode: + tmdb_show_id = res_data["tv_episode_results"][0]["show_id"] + if tmdb_show_id: + pd.metadata["required_resources"] = [ + { + "model": "TVShow", + "id_type": IdType.TMDB_TV, + "id_value": tmdb_show_id, + "title": title, + "url": TMDB_TV.id_to_url(tmdb_show_id), + } + ] # TODO parse sister seasons # pd.metadata['related_resources'] = [] if pd.metadata["cover_image_url"]: diff --git a/catalog/sites/imdb.py b/catalog/sites/imdb.py index eb725bd3..541954c9 100644 --- a/catalog/sites/imdb.py +++ b/catalog/sites/imdb.py @@ -16,13 +16,17 @@ class IMDB(AbstractSite): WIKI_PROPERTY_ID = "?" @classmethod - def id_to_url(self, id_value): + def id_to_url(cls, id_value): return "https://www.imdb.com/title/" + id_value + "/" def scrape(self): self.scraped = False res_data = search_tmdb_by_imdb_id(self.id_value) - if "movie_results" in res_data and len(res_data["movie_results"]) > 0: + if ( + "movie_results" in res_data + and len(res_data["movie_results"]) > 0 + and self.DEFAULT_MODEL in [None, Movie] + ): url = ( f"https://www.themoviedb.org/movie/{res_data['movie_results'][0]['id']}" ) @@ -32,7 +36,7 @@ class IMDB(AbstractSite): # this should not happen given IMDB only has ids for either show or episode tv_id = res_data["tv_season_results"][0]["show_id"] season_number = res_data["tv_season_results"][0]["season_number"] - url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}/episode/{episode_number}" + url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}" elif ( "tv_episode_results" in res_data and len(res_data["tv_episode_results"]) > 0 ): diff --git a/catalog/sites/tmdb.py b/catalog/sites/tmdb.py index 3651205d..32593433 100644 --- a/catalog/sites/tmdb.py +++ b/catalog/sites/tmdb.py @@ -296,7 +296,7 @@ class TMDB_TV(AbstractSite): "single_episode_length": None, "brief": brief, "cover_image_url": img_url, - "related_resources": season_links, + # "related_resources": season_links, # FIXME not crawling them for now given many douban tv season data has errors } ) if imdb_code: @@ -364,9 +364,9 @@ class TMDB_TVSeason(AbstractSite): { "model": "TVShow", "id_type": IdType.TMDB_TV, - "id_value": v[0], - "title": f"TMDB TV Show {v[0]}", - "url": f"https://www.themoviedb.org/tv/{v[0]}", + "id_value": show_id, + "title": f"TMDB TV Show {show_id}", + "url": f"https://www.themoviedb.org/tv/{show_id}", } ] pd.lookup_ids[IdType.IMDB] = d["external_ids"].get("imdb_id") @@ -394,18 +394,26 @@ class TMDB_TVSeason(AbstractSite): f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}' ) - # get external id from 1st episode - # if pd.lookup_ids[IdType.IMDB]: - # _logger.warning("Unexpected IMDB id for TMDB tv season") - # elif len(pd.metadata["episode_number_list"]) == 0: - # _logger.warning( - # "Unable to lookup IMDB id for TMDB tv season with zero episodes" - # ) - # else: - # ep = pd.metadata["episode_number_list"][0] - # api_url2 = f"https://api.themoviedb.org/3/tv/{v[0]}/season/{v[1]}/episode/{ep}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits" - # d2 = BasicDownloader(api_url2).download().json() - # if not d2.get("id"): - # raise ParseError("episode id for season") - # pd.lookup_ids[IdType.IMDB] = d2["external_ids"].get("imdb_id") - # return pd + # use show's IMDB (for Season 1) or 1st episode's IMDB (if not Season 1) as this season's IMDB so that it can be compatible with TVSeason data from Douban + if pd.lookup_ids.get(IdType.IMDB): + # this should not happen + _logger.warning("Unexpected IMDB id for TMDB tv season") + elif pd.metadata.get("season_number") == 1: + res = SiteManager.get_site_by_url( + f"https://www.themoviedb.org/tv/{show_id}" + ).get_resource_ready() + pd.lookup_ids[IdType.IMDB] = ( + res.other_lookup_ids.get(IdType.IMDB) if res else None + ) + elif len(pd.metadata["episode_number_list"]) == 0: + _logger.warning( + "Unable to lookup IMDB id for TMDB tv season with zero episodes" + ) + else: + ep = pd.metadata["episode_number_list"][0] + api_url2 = f"https://api.themoviedb.org/3/tv/{v[0]}/season/{v[1]}/episode/{ep}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits" + d2 = BasicDownloader(api_url2).download().json() + if not d2.get("id"): + raise ParseError("first episode id for season") + pd.lookup_ids[IdType.IMDB] = d2["external_ids"].get("imdb_id") + return pd diff --git a/catalog/tv/tests.py b/catalog/tv/tests.py index 210d514d..d6bf5c1f 100644 --- a/catalog/tv/tests.py +++ b/catalog/tv/tests.py @@ -56,7 +56,7 @@ class TMDBTVSeasonTestCase(TestCase): self.assertEqual(site.id_value, "57243-4") site.get_resource_ready() self.assertEqual(site.ready, True) - self.assertEqual(site.resource.metadata["title"], "第 4 季") + self.assertEqual(site.resource.metadata["title"], "神秘博士 第 4 季") self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB) self.assertEqual(site.resource.item.__class__.__name__, "TVSeason") self.assertEqual(site.resource.item.imdb, "tt1159991") @@ -77,14 +77,15 @@ class DoubanMovieTVTestCase(TestCase): def test_scrape_singleseason(self): url3 = "https://movie.douban.com/subject/26895436/" p3 = SiteManager.get_site_by_url(url3).get_resource_ready() - self.assertEqual(p3.item.__class__.__name__, "TVShow") + self.assertEqual(p3.item.__class__.__name__, "TVSeason") @use_local_response def test_scrape_fix_imdb(self): + # this douban links to S6E3, we'll change it to S6E1 to keep consistant url = "https://movie.douban.com/subject/35597581/" item = SiteManager.get_site_by_url(url).get_resource_ready().item - # this douban links to S6E3, we'll reset it to S6E1 to keep consistant - self.assertEqual(item.imdb, "tt21599650") + # disable this test to make douban data less disrupted + # self.assertEqual(item.imdb, "tt21599650") class MultiTVSitesTestCase(TestCase): @@ -118,8 +119,8 @@ class MultiTVSitesTestCase(TestCase): url3 = "https://movie.douban.com/subject/26895436/" p1 = SiteManager.get_site_by_url(url1).get_resource_ready() p3 = SiteManager.get_site_by_url(url3).get_resource_ready() - self.assertEqual(p3.item.__class__.__name__, "TVShow") - self.assertEqual(p1.item.id, p3.item.id) + self.assertEqual(p3.item.__class__.__name__, "TVSeason") + self.assertEqual(p1.item, p3.item.show) @use_local_response def test_tvspecial(self): diff --git a/legacy/management/commands/link_tv.py b/legacy/management/commands/link_tv.py new file mode 100644 index 00000000..6b9bfe91 --- /dev/null +++ b/legacy/management/commands/link_tv.py @@ -0,0 +1,114 @@ +from rq.utils import first +from catalog.common import * +from catalog.models import * +from catalog.sites import * +from catalog.sites.tmdb import * +from django.core.management.base import BaseCommand +from django.core.paginator import Paginator +import pprint +from tqdm import tqdm +import logging +import csv + +_logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + """ + load imdb episode -> show mapping - https://www.imdb.com/interfaces/ + """ + + help = "Refetch Douban TV Shows" + + def add_arguments(self, parser): + parser.add_argument("--minid", help="min id to start") + + def handle(self, *args, **options): + self.stdout.write(self.style.SUCCESS(f"Loading imdb data.tsv")) + catalog = {} + episodes = {} + seasons = {} + shows = {} + c = { + "fix-show": 0, + "fix-season": 0, + "missing-tmdb": 0, + "missing-imdb": 0, + } + with open("../data.tsv", newline="") as csvfile: + reader = csv.reader(csvfile, delimiter="\t") + next(reader) + for row in reader: + episodes[row[0]] = { + "parent": row[1], + "season": int(row[2]) if row[2] != "\\N" else 0, + "episode": int(row[3]) if row[3] != "\\N" else 0, + } + shows[row[1]] = True + if row[3] == "1": + seasons[f"{row[1]}-{row[2]}"] = row[0] + + self.stdout.write(self.style.SUCCESS(f"Refreshing catalog tv seasons")) + qs = ( + TVSeason.objects.all() + .order_by("id") + .filter(primary_lookup_id_type=IdType.IMDB, show__isnull=True) + ) + if options["minid"]: + qs = qs.filter(id__gte=int(options["minid"])) + + for item in tqdm(qs): + imdb = item.primary_lookup_id_value + show_imdb = None + ep1_imdb = None + season = None + if imdb in episodes: + show_imdb = episodes[imdb]["parent"] + season = episodes[imdb]["season"] + elif imdb in shows: + show_imdb = imdb + if show_imdb: + show = catalog.get(show_imdb) + if not show: + show = ( + TVShow.objects.all() + .filter( + primary_lookup_id_type=IdType.IMDB, + primary_lookup_id_value=show_imdb, + ) + .first() + ) + if not show: + res = None + try: + res_data = search_tmdb_by_imdb_id(show_imdb) + if "tv_results" in res_data and len(res_data["tv_results"]) > 0: + url = f"https://www.themoviedb.org/tv/{res_data['tv_results'][0]['id']}" + site = SiteManager.get_site_by_url(url) + res = site.get_resource_ready() + except Exception as e: + _logger.warn(e) + show = res.item if res else None + if show and show.__class__ != TVShow: + _logger.warn(f"error {show} is not show") + show = None + if show: + catalog[show_imdb] = show + item.show = show + _logger.info(f"linked {item} with {show}") + if season and season != item.season_number: + _logger.warn( + f"fix season number for {item} from {item.season_number} to {season}" + ) + item.season_number = season + c["fix-season"] += 1 + item.save() + c["fix-show"] += 1 + else: + _logger.warn(f"Can't find {show_imdb} in TMDB for {item}") + c["missing-tmdb"] += 1 + else: + c["missing-imdb"] += 1 + + self.stdout.write(self.style.SUCCESS(f"Done")) + pprint.pp(c)