Map all Douban TV items to seasons and add a script to link seasons to their TV shows

This commit is contained in:
Your Name 2023-01-08 16:26:05 -05:00
parent 2f97406c6b
commit 3b53d626bc
5 changed files with 200 additions and 91 deletions

View file

@ -216,79 +216,61 @@ class DoubanMovie(AbstractSite):
} }
) )
pd.metadata["preferred_model"] = ( pd.metadata["preferred_model"] = (
("TVShow" if is_series else "Movie") if not season else "TVSeason" "TVSeason" if is_series or episodes or season else "Movie"
) )
tmdb_season_id = None tmdb_season_id = None
if imdb_code: if imdb_code:
res_data = search_tmdb_by_imdb_id(imdb_code) res_data = search_tmdb_by_imdb_id(imdb_code)
tmdb_show_id = None has_movie = (
if "movie_results" in res_data and len(res_data["movie_results"]) > 0: "movie_results" in res_data and len(res_data["movie_results"]) > 0
pd.metadata["preferred_model"] = "Movie" )
elif "tv_results" in res_data and len(res_data["tv_results"]) > 0: has_tv = "tv_results" in res_data and len(res_data["tv_results"]) > 0
if pd.metadata["preferred_model"] == "TVSeason": has_episode = (
"""
determine if this Douban Movie item should map to
a single season tv show, or
first season of multi-season show
"""
tmdb_show_id = res_data["tv_results"][0]["id"]
tmdb_season_id = f"{tmdb_show_id}-1"
site = TMDB_TVSeason(TMDB_TVSeason.id_to_url(tmdb_season_id))
tmdb_tvseason = site.get_resource_ready().item
tmdb_tv = tmdb_tvseason.show
if tmdb_tv.season_count == 1:
pd.metadata["preferred_model"] = "TVShow"
# else:
# pd.metadata["preferred_model"] = "TVSeason"
# resp = query_tmdb_tv_episode(tmdb_show_id, 1, 1)
# imdb_code = resp["external_ids"]["imdb_id"]
# _logger.warning(
# f"Douban Movie {self.url} re-mapped to imdb episode {imdb_code}"
# )
elif (
"tv_season_results" in res_data
and len(res_data["tv_season_results"]) > 0
):
pd.metadata["preferred_model"] = "TVSeason"
tmdb_show_id = res_data["tv_season_results"][0]["show_id"]
tmdb_season_id = f"{tmdb_show_id}-{season}"
elif (
"tv_episode_results" in res_data "tv_episode_results" in res_data
and len(res_data["tv_episode_results"]) > 0 and len(res_data["tv_episode_results"]) > 0
): )
if pd.metadata["preferred_model"] == "TVSeason" and has_tv:
if pd.metadata.get("season") and pd.metadata.get("season") != 1:
_logger.warn(f"{imdb_code} matched imdb tv show, force season 1")
pd.metadata["season"] = 1
elif pd.metadata["preferred_model"] == "TVSeason" and has_episode:
if res_data["tv_episode_results"][0]["episode_number"] != 1:
_logger.warning(
f"Douban Movie {self.url} IMDB {imdb_code} mapping to non-first episode in a season"
)
elif res_data["tv_episode_results"][0]["season_number"] == 1:
_logger.warning(
f"Douban Movie {self.url} IMDB {imdb_code} mapping to first season episode in a season"
)
elif has_movie:
if pd.metadata["preferred_model"] != "Movie":
_logger.warn(f"{imdb_code} matched imdb movie, force Movie")
pd.metadata["preferred_model"] = "Movie"
elif has_tv or has_episode:
_logger.warn(f"{imdb_code} matched imdb tv/episode, force TVSeason")
pd.metadata["preferred_model"] = "TVSeason" pd.metadata["preferred_model"] = "TVSeason"
tmdb_show_id = res_data["tv_episode_results"][0]["show_id"] else:
tmdb_season_id = f"{tmdb_show_id}-{season}" _logger.warn(f"{imdb_code} unknown to TMDB")
# if res_data["tv_episode_results"][0]["episode_number"] != 1:
# _logger.warning(
# f"Douban Movie {self.url} mapping to unexpected imdb episode {imdb_code}"
# )
# resp = query_tmdb_tv_episode(
# tmdb_show_id,
# res_data["tv_episode_results"][0]["season_number"],
# 1,
# )
# imdb_code = resp["external_ids"]["imdb_id"]
# _logger.warning(
# f"Douban Movie {self.url} re-mapped to imdb episode {imdb_code}"
# )
pd.lookup_ids[IdType.IMDB] = imdb_code pd.lookup_ids[IdType.IMDB] = imdb_code
if pd.metadata["preferred_model"] == "TVSeason":
pd.lookup_ids[IdType.TMDB_TVSeason] = tmdb_season_id
elif pd.metadata["preferred_model"] == "TVShow":
pd.lookup_ids[IdType.TMDB_TV] = tmdb_show_id
# if tmdb_show_id: if pd.metadata["preferred_model"] == "TVSeason":
# pd.metadata["required_resources"] = [ tmdb_show_id = None
# { if has_tv:
# "model": "TVShow", tmdb_show_id = res_data["tv_results"][0]["id"]
# "id_type": IdType.TMDB_TV, elif has_episode:
# "id_value": tmdb_show_id, tmdb_show_id = res_data["tv_episode_results"][0]["show_id"]
# "title": title, if tmdb_show_id:
# "url": TMDB_TV.id_to_url(tmdb_show_id), pd.metadata["required_resources"] = [
# } {
# ] "model": "TVShow",
"id_type": IdType.TMDB_TV,
"id_value": tmdb_show_id,
"title": title,
"url": TMDB_TV.id_to_url(tmdb_show_id),
}
]
# TODO parse sister seasons # TODO parse sister seasons
# pd.metadata['related_resources'] = [] # pd.metadata['related_resources'] = []
if pd.metadata["cover_image_url"]: if pd.metadata["cover_image_url"]:

View file

@ -16,13 +16,17 @@ class IMDB(AbstractSite):
WIKI_PROPERTY_ID = "?" WIKI_PROPERTY_ID = "?"
@classmethod @classmethod
def id_to_url(self, id_value): def id_to_url(cls, id_value):
return "https://www.imdb.com/title/" + id_value + "/" return "https://www.imdb.com/title/" + id_value + "/"
def scrape(self): def scrape(self):
self.scraped = False self.scraped = False
res_data = search_tmdb_by_imdb_id(self.id_value) res_data = search_tmdb_by_imdb_id(self.id_value)
if "movie_results" in res_data and len(res_data["movie_results"]) > 0: if (
"movie_results" in res_data
and len(res_data["movie_results"]) > 0
and self.DEFAULT_MODEL in [None, Movie]
):
url = ( url = (
f"https://www.themoviedb.org/movie/{res_data['movie_results'][0]['id']}" f"https://www.themoviedb.org/movie/{res_data['movie_results'][0]['id']}"
) )
@ -32,7 +36,7 @@ class IMDB(AbstractSite):
# this should not happen given IMDB only has ids for either show or episode # this should not happen given IMDB only has ids for either show or episode
tv_id = res_data["tv_season_results"][0]["show_id"] tv_id = res_data["tv_season_results"][0]["show_id"]
season_number = res_data["tv_season_results"][0]["season_number"] season_number = res_data["tv_season_results"][0]["season_number"]
url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}/episode/{episode_number}" url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}"
elif ( elif (
"tv_episode_results" in res_data and len(res_data["tv_episode_results"]) > 0 "tv_episode_results" in res_data and len(res_data["tv_episode_results"]) > 0
): ):

View file

@ -296,7 +296,7 @@ class TMDB_TV(AbstractSite):
"single_episode_length": None, "single_episode_length": None,
"brief": brief, "brief": brief,
"cover_image_url": img_url, "cover_image_url": img_url,
"related_resources": season_links, # "related_resources": season_links, # FIXME not crawling them for now given many douban tv season data has errors
} }
) )
if imdb_code: if imdb_code:
@ -364,9 +364,9 @@ class TMDB_TVSeason(AbstractSite):
{ {
"model": "TVShow", "model": "TVShow",
"id_type": IdType.TMDB_TV, "id_type": IdType.TMDB_TV,
"id_value": v[0], "id_value": show_id,
"title": f"TMDB TV Show {v[0]}", "title": f"TMDB TV Show {show_id}",
"url": f"https://www.themoviedb.org/tv/{v[0]}", "url": f"https://www.themoviedb.org/tv/{show_id}",
} }
] ]
pd.lookup_ids[IdType.IMDB] = d["external_ids"].get("imdb_id") pd.lookup_ids[IdType.IMDB] = d["external_ids"].get("imdb_id")
@ -394,18 +394,26 @@ class TMDB_TVSeason(AbstractSite):
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}' f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
) )
# get external id from 1st episode # use show's IMDB (for Season 1) or 1st episode's IMDB (if not Season 1) as this season's IMDB so that it can be compatible with TVSeason data from Douban
# if pd.lookup_ids[IdType.IMDB]: if pd.lookup_ids.get(IdType.IMDB):
# _logger.warning("Unexpected IMDB id for TMDB tv season") # this should not happen
# elif len(pd.metadata["episode_number_list"]) == 0: _logger.warning("Unexpected IMDB id for TMDB tv season")
# _logger.warning( elif pd.metadata.get("season_number") == 1:
# "Unable to lookup IMDB id for TMDB tv season with zero episodes" res = SiteManager.get_site_by_url(
# ) f"https://www.themoviedb.org/tv/{show_id}"
# else: ).get_resource_ready()
# ep = pd.metadata["episode_number_list"][0] pd.lookup_ids[IdType.IMDB] = (
# api_url2 = f"https://api.themoviedb.org/3/tv/{v[0]}/season/{v[1]}/episode/{ep}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits" res.other_lookup_ids.get(IdType.IMDB) if res else None
# d2 = BasicDownloader(api_url2).download().json() )
# if not d2.get("id"): elif len(pd.metadata["episode_number_list"]) == 0:
# raise ParseError("episode id for season") _logger.warning(
# pd.lookup_ids[IdType.IMDB] = d2["external_ids"].get("imdb_id") "Unable to lookup IMDB id for TMDB tv season with zero episodes"
# return pd )
else:
ep = pd.metadata["episode_number_list"][0]
api_url2 = f"https://api.themoviedb.org/3/tv/{v[0]}/season/{v[1]}/episode/{ep}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
d2 = BasicDownloader(api_url2).download().json()
if not d2.get("id"):
raise ParseError("first episode id for season")
pd.lookup_ids[IdType.IMDB] = d2["external_ids"].get("imdb_id")
return pd

View file

@ -56,7 +56,7 @@ class TMDBTVSeasonTestCase(TestCase):
self.assertEqual(site.id_value, "57243-4") self.assertEqual(site.id_value, "57243-4")
site.get_resource_ready() site.get_resource_ready()
self.assertEqual(site.ready, True) self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata["title"], "第 4 季") self.assertEqual(site.resource.metadata["title"], "神秘博士 第 4 季")
self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB) self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.resource.item.__class__.__name__, "TVSeason") self.assertEqual(site.resource.item.__class__.__name__, "TVSeason")
self.assertEqual(site.resource.item.imdb, "tt1159991") self.assertEqual(site.resource.item.imdb, "tt1159991")
@ -77,14 +77,15 @@ class DoubanMovieTVTestCase(TestCase):
def test_scrape_singleseason(self): def test_scrape_singleseason(self):
url3 = "https://movie.douban.com/subject/26895436/" url3 = "https://movie.douban.com/subject/26895436/"
p3 = SiteManager.get_site_by_url(url3).get_resource_ready() p3 = SiteManager.get_site_by_url(url3).get_resource_ready()
self.assertEqual(p3.item.__class__.__name__, "TVShow") self.assertEqual(p3.item.__class__.__name__, "TVSeason")
@use_local_response @use_local_response
def test_scrape_fix_imdb(self): def test_scrape_fix_imdb(self):
# this douban links to S6E3, we'll change it to S6E1 to keep consistent
url = "https://movie.douban.com/subject/35597581/" url = "https://movie.douban.com/subject/35597581/"
item = SiteManager.get_site_by_url(url).get_resource_ready().item item = SiteManager.get_site_by_url(url).get_resource_ready().item
# this douban links to S6E3, we'll reset it to S6E1 to keep consistant # disable this test to make douban data less disrupted
self.assertEqual(item.imdb, "tt21599650") # self.assertEqual(item.imdb, "tt21599650")
class MultiTVSitesTestCase(TestCase): class MultiTVSitesTestCase(TestCase):
@ -118,8 +119,8 @@ class MultiTVSitesTestCase(TestCase):
url3 = "https://movie.douban.com/subject/26895436/" url3 = "https://movie.douban.com/subject/26895436/"
p1 = SiteManager.get_site_by_url(url1).get_resource_ready() p1 = SiteManager.get_site_by_url(url1).get_resource_ready()
p3 = SiteManager.get_site_by_url(url3).get_resource_ready() p3 = SiteManager.get_site_by_url(url3).get_resource_ready()
self.assertEqual(p3.item.__class__.__name__, "TVShow") self.assertEqual(p3.item.__class__.__name__, "TVSeason")
self.assertEqual(p1.item.id, p3.item.id) self.assertEqual(p1.item, p3.item.show)
@use_local_response @use_local_response
def test_tvspecial(self): def test_tvspecial(self):

View file

@ -0,0 +1,114 @@
from rq.utils import first
from catalog.common import *
from catalog.models import *
from catalog.sites import *
from catalog.sites.tmdb import *
from django.core.management.base import BaseCommand
from django.core.paginator import Paginator
import pprint
from tqdm import tqdm
import logging
import csv
# Module-level logger, named after this module, used by the command below.
_logger = logging.getLogger(__name__)
class Command(BaseCommand):
    """Link IMDB-keyed TV seasons that have no show to their parent TV show.

    The command reads an IMDb episode -> show mapping from ``../data.tsv``
    (relative to the current working directory; presumably the
    ``title.episode`` dataset from https://www.imdb.com/interfaces/ --
    confirm the exact file), then walks every ``TVSeason`` whose primary
    lookup id is an IMDB id and whose ``show`` is unset.  For each such
    season it resolves the parent show's IMDB id, finds the ``TVShow`` in
    the local catalog or fetches it through TMDB, links the season to it
    and, when the dataset disagrees, corrects the season number.
    """

    help = "Refetch Douban TV Shows"

    def add_arguments(self, parser):
        """Register ``--minid``: the lowest ``TVSeason`` id to process."""
        parser.add_argument("--minid", help="min id to start")

    @staticmethod
    def _load_episode_mapping(path):
        """Parse the tab-separated IMDb dump at ``path``.

        Returns a ``(episodes, shows)`` pair: ``episodes`` maps the id in
        column 0 to ``(parent_id, season_number)`` taken from columns 1 and
        2, and ``shows`` is the set of every parent id seen.  A season value
        of ``\\N`` (the dump's null marker) is stored as ``0``.  The first
        row is treated as a header and skipped.
        """
        episodes = {}
        shows = set()
        with open(path, newline="", encoding="utf-8") as csvfile:
            reader = csv.reader(csvfile, delimiter="\t")
            next(reader)  # skip the header row
            for row in reader:
                parent = row[1]
                season = int(row[2]) if row[2] != "\\N" else 0
                episodes[row[0]] = (parent, season)
                shows.add(parent)
        return episodes, shows

    @staticmethod
    def _find_show(show_imdb):
        """Return the ``TVShow`` for ``show_imdb``, or ``None`` if not found.

        The local catalog is consulted first; on a miss the show is looked
        up on TMDB by its IMDB id and fetched through the matching site.
        Lookup failures are logged and treated as "not found" so that one
        bad record does not abort the whole run.
        """
        show = TVShow.objects.filter(
            primary_lookup_id_type=IdType.IMDB,
            primary_lookup_id_value=show_imdb,
        ).first()
        if show:
            return show
        res = None
        try:
            res_data = search_tmdb_by_imdb_id(show_imdb)
            if res_data.get("tv_results"):
                url = f"https://www.themoviedb.org/tv/{res_data['tv_results'][0]['id']}"
                site = SiteManager.get_site_by_url(url)
                res = site.get_resource_ready()
        except Exception as e:
            # Deliberately best-effort: log and carry on with the next item.
            _logger.warning(f"TMDB lookup failed for {show_imdb}: {e}")
        show = res.item if res else None
        if show and not isinstance(show, TVShow):
            _logger.warning(f"error {show} is not show")
            return None
        return show

    def handle(self, *args, **options):
        """Run the linking pass and print a summary of what was changed.

        The summary counts linked seasons (``fix-show``), corrected season
        numbers (``fix-season``), seasons whose show could not be found
        (``missing-tmdb``) and seasons whose IMDB id is absent from the
        dataset (``missing-imdb``).
        """
        self.stdout.write(self.style.SUCCESS("Loading imdb data.tsv"))
        episodes, shows = self._load_episode_mapping("../data.tsv")
        catalog = {}  # show IMDB id -> TVShow already resolved in this run
        c = {
            "fix-show": 0,
            "fix-season": 0,
            "missing-tmdb": 0,
            "missing-imdb": 0,
        }

        self.stdout.write(self.style.SUCCESS("Refreshing catalog tv seasons"))
        qs = (
            TVSeason.objects.all()
            .order_by("id")
            .filter(primary_lookup_id_type=IdType.IMDB, show__isnull=True)
        )
        if options["minid"]:
            qs = qs.filter(id__gte=int(options["minid"]))

        for item in tqdm(qs):
            imdb = item.primary_lookup_id_value
            show_imdb = None
            season = None
            if imdb in episodes:
                # The season is keyed by an episode id: use its parent show.
                show_imdb, season = episodes[imdb]
            elif imdb in shows:
                # The season is keyed by the show's own id.
                show_imdb = imdb
            if not show_imdb:
                c["missing-imdb"] += 1
                continue

            show = catalog.get(show_imdb) or self._find_show(show_imdb)
            if not show:
                _logger.warning(f"Can't find {show_imdb} in TMDB for {item}")
                c["missing-tmdb"] += 1
                continue

            catalog[show_imdb] = show
            item.show = show
            _logger.info(f"linked {item} with {show}")
            # A season of 0 means "unknown" in the dataset; never overwrite with it.
            if season and season != item.season_number:
                _logger.warning(
                    f"fix season number for {item} from {item.season_number} to {season}"
                )
                item.season_number = season
                c["fix-season"] += 1
            item.save()
            c["fix-show"] += 1

        self.stdout.write(self.style.SUCCESS("Done"))
        pprint.pp(c)