Map all Douban TV entries to TVSeason, and add a script that links TV seasons to their TV show

This commit is contained in:
Your Name 2023-01-08 16:26:05 -05:00
parent 2f97406c6b
commit 3b53d626bc
5 changed files with 200 additions and 91 deletions

View file

@ -216,79 +216,61 @@ class DoubanMovie(AbstractSite):
}
)
pd.metadata["preferred_model"] = (
("TVShow" if is_series else "Movie") if not season else "TVSeason"
"TVSeason" if is_series or episodes or season else "Movie"
)
tmdb_season_id = None
if imdb_code:
res_data = search_tmdb_by_imdb_id(imdb_code)
tmdb_show_id = None
if "movie_results" in res_data and len(res_data["movie_results"]) > 0:
pd.metadata["preferred_model"] = "Movie"
elif "tv_results" in res_data and len(res_data["tv_results"]) > 0:
if pd.metadata["preferred_model"] == "TVSeason":
"""
determine if this Douban Movie item should map to
a single season tv show, or
first season of multi-season show
"""
tmdb_show_id = res_data["tv_results"][0]["id"]
tmdb_season_id = f"{tmdb_show_id}-1"
site = TMDB_TVSeason(TMDB_TVSeason.id_to_url(tmdb_season_id))
tmdb_tvseason = site.get_resource_ready().item
tmdb_tv = tmdb_tvseason.show
if tmdb_tv.season_count == 1:
pd.metadata["preferred_model"] = "TVShow"
# else:
# pd.metadata["preferred_model"] = "TVSeason"
# resp = query_tmdb_tv_episode(tmdb_show_id, 1, 1)
# imdb_code = resp["external_ids"]["imdb_id"]
# _logger.warning(
# f"Douban Movie {self.url} re-mapped to imdb episode {imdb_code}"
# )
elif (
"tv_season_results" in res_data
and len(res_data["tv_season_results"]) > 0
):
pd.metadata["preferred_model"] = "TVSeason"
tmdb_show_id = res_data["tv_season_results"][0]["show_id"]
tmdb_season_id = f"{tmdb_show_id}-{season}"
elif (
has_movie = (
"movie_results" in res_data and len(res_data["movie_results"]) > 0
)
has_tv = "tv_results" in res_data and len(res_data["tv_results"]) > 0
has_episode = (
"tv_episode_results" in res_data
and len(res_data["tv_episode_results"]) > 0
):
)
if pd.metadata["preferred_model"] == "TVSeason" and has_tv:
if pd.metadata.get("season") and pd.metadata.get("season") != 1:
_logger.warn(f"{imdb_code} matched imdb tv show, force season 1")
pd.metadata["season"] = 1
elif pd.metadata["preferred_model"] == "TVSeason" and has_episode:
if res_data["tv_episode_results"][0]["episode_number"] != 1:
_logger.warning(
f"Douban Movie {self.url} IMDB {imdb_code} mapping to non-first episode in a season"
)
elif res_data["tv_episode_results"][0]["season_number"] == 1:
_logger.warning(
f"Douban Movie {self.url} IMDB {imdb_code} mapping to first season episode in a season"
)
elif has_movie:
if pd.metadata["preferred_model"] != "Movie":
_logger.warn(f"{imdb_code} matched imdb movie, force Movie")
pd.metadata["preferred_model"] = "Movie"
elif has_tv or has_episode:
_logger.warn(f"{imdb_code} matched imdb tv/episode, force TVSeason")
pd.metadata["preferred_model"] = "TVSeason"
tmdb_show_id = res_data["tv_episode_results"][0]["show_id"]
tmdb_season_id = f"{tmdb_show_id}-{season}"
# if res_data["tv_episode_results"][0]["episode_number"] != 1:
# _logger.warning(
# f"Douban Movie {self.url} mapping to unexpected imdb episode {imdb_code}"
# )
# resp = query_tmdb_tv_episode(
# tmdb_show_id,
# res_data["tv_episode_results"][0]["season_number"],
# 1,
# )
# imdb_code = resp["external_ids"]["imdb_id"]
# _logger.warning(
# f"Douban Movie {self.url} re-mapped to imdb episode {imdb_code}"
# )
else:
_logger.warn(f"{imdb_code} unknown to TMDB")
pd.lookup_ids[IdType.IMDB] = imdb_code
if pd.metadata["preferred_model"] == "TVSeason":
pd.lookup_ids[IdType.TMDB_TVSeason] = tmdb_season_id
elif pd.metadata["preferred_model"] == "TVShow":
pd.lookup_ids[IdType.TMDB_TV] = tmdb_show_id
# if tmdb_show_id:
# pd.metadata["required_resources"] = [
# {
# "model": "TVShow",
# "id_type": IdType.TMDB_TV,
# "id_value": tmdb_show_id,
# "title": title,
# "url": TMDB_TV.id_to_url(tmdb_show_id),
# }
# ]
if pd.metadata["preferred_model"] == "TVSeason":
tmdb_show_id = None
if has_tv:
tmdb_show_id = res_data["tv_results"][0]["id"]
elif has_episode:
tmdb_show_id = res_data["tv_episode_results"][0]["show_id"]
if tmdb_show_id:
pd.metadata["required_resources"] = [
{
"model": "TVShow",
"id_type": IdType.TMDB_TV,
"id_value": tmdb_show_id,
"title": title,
"url": TMDB_TV.id_to_url(tmdb_show_id),
}
]
# TODO parse sister seasons
# pd.metadata['related_resources'] = []
if pd.metadata["cover_image_url"]:

View file

@ -16,13 +16,17 @@ class IMDB(AbstractSite):
WIKI_PROPERTY_ID = "?"
@classmethod
def id_to_url(self, id_value):
def id_to_url(cls, id_value):
return "https://www.imdb.com/title/" + id_value + "/"
def scrape(self):
self.scraped = False
res_data = search_tmdb_by_imdb_id(self.id_value)
if "movie_results" in res_data and len(res_data["movie_results"]) > 0:
if (
"movie_results" in res_data
and len(res_data["movie_results"]) > 0
and self.DEFAULT_MODEL in [None, Movie]
):
url = (
f"https://www.themoviedb.org/movie/{res_data['movie_results'][0]['id']}"
)
@ -32,7 +36,7 @@ class IMDB(AbstractSite):
# this should not happen given IMDB only has ids for either show or episode
tv_id = res_data["tv_season_results"][0]["show_id"]
season_number = res_data["tv_season_results"][0]["season_number"]
url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}/episode/{episode_number}"
url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}"
elif (
"tv_episode_results" in res_data and len(res_data["tv_episode_results"]) > 0
):

View file

@ -296,7 +296,7 @@ class TMDB_TV(AbstractSite):
"single_episode_length": None,
"brief": brief,
"cover_image_url": img_url,
"related_resources": season_links,
# "related_resources": season_links, # FIXME not crawling them for now given many douban tv season data has errors
}
)
if imdb_code:
@ -364,9 +364,9 @@ class TMDB_TVSeason(AbstractSite):
{
"model": "TVShow",
"id_type": IdType.TMDB_TV,
"id_value": v[0],
"title": f"TMDB TV Show {v[0]}",
"url": f"https://www.themoviedb.org/tv/{v[0]}",
"id_value": show_id,
"title": f"TMDB TV Show {show_id}",
"url": f"https://www.themoviedb.org/tv/{show_id}",
}
]
pd.lookup_ids[IdType.IMDB] = d["external_ids"].get("imdb_id")
@ -394,18 +394,26 @@ class TMDB_TVSeason(AbstractSite):
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
# get external id from 1st episode
# if pd.lookup_ids[IdType.IMDB]:
# _logger.warning("Unexpected IMDB id for TMDB tv season")
# elif len(pd.metadata["episode_number_list"]) == 0:
# _logger.warning(
# "Unable to lookup IMDB id for TMDB tv season with zero episodes"
# )
# else:
# ep = pd.metadata["episode_number_list"][0]
# api_url2 = f"https://api.themoviedb.org/3/tv/{v[0]}/season/{v[1]}/episode/{ep}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
# d2 = BasicDownloader(api_url2).download().json()
# if not d2.get("id"):
# raise ParseError("episode id for season")
# pd.lookup_ids[IdType.IMDB] = d2["external_ids"].get("imdb_id")
# return pd
# use show's IMDB (for Season 1) or 1st episode's IMDB (if not Season 1) as this season's IMDB so that it can be compatible with TVSeason data from Douban
if pd.lookup_ids.get(IdType.IMDB):
# this should not happen
_logger.warning("Unexpected IMDB id for TMDB tv season")
elif pd.metadata.get("season_number") == 1:
res = SiteManager.get_site_by_url(
f"https://www.themoviedb.org/tv/{show_id}"
).get_resource_ready()
pd.lookup_ids[IdType.IMDB] = (
res.other_lookup_ids.get(IdType.IMDB) if res else None
)
elif len(pd.metadata["episode_number_list"]) == 0:
_logger.warning(
"Unable to lookup IMDB id for TMDB tv season with zero episodes"
)
else:
ep = pd.metadata["episode_number_list"][0]
api_url2 = f"https://api.themoviedb.org/3/tv/{v[0]}/season/{v[1]}/episode/{ep}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
d2 = BasicDownloader(api_url2).download().json()
if not d2.get("id"):
raise ParseError("first episode id for season")
pd.lookup_ids[IdType.IMDB] = d2["external_ids"].get("imdb_id")
return pd

View file

@ -56,7 +56,7 @@ class TMDBTVSeasonTestCase(TestCase):
self.assertEqual(site.id_value, "57243-4")
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata["title"], "第 4 季")
self.assertEqual(site.resource.metadata["title"], "神秘博士 第 4 季")
self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.resource.item.__class__.__name__, "TVSeason")
self.assertEqual(site.resource.item.imdb, "tt1159991")
@ -77,14 +77,15 @@ class DoubanMovieTVTestCase(TestCase):
def test_scrape_singleseason(self):
url3 = "https://movie.douban.com/subject/26895436/"
p3 = SiteManager.get_site_by_url(url3).get_resource_ready()
self.assertEqual(p3.item.__class__.__name__, "TVShow")
self.assertEqual(p3.item.__class__.__name__, "TVSeason")
@use_local_response
def test_scrape_fix_imdb(self):
# this douban links to S6E3, we'll change it to S6E1 to keep consistent
url = "https://movie.douban.com/subject/35597581/"
item = SiteManager.get_site_by_url(url).get_resource_ready().item
# this douban links to S6E3, we'll reset it to S6E1 to keep consistent
self.assertEqual(item.imdb, "tt21599650")
# disable this test to make douban data less disrupted
# self.assertEqual(item.imdb, "tt21599650")
class MultiTVSitesTestCase(TestCase):
@ -118,8 +119,8 @@ class MultiTVSitesTestCase(TestCase):
url3 = "https://movie.douban.com/subject/26895436/"
p1 = SiteManager.get_site_by_url(url1).get_resource_ready()
p3 = SiteManager.get_site_by_url(url3).get_resource_ready()
self.assertEqual(p3.item.__class__.__name__, "TVShow")
self.assertEqual(p1.item.id, p3.item.id)
self.assertEqual(p3.item.__class__.__name__, "TVSeason")
self.assertEqual(p1.item, p3.item.show)
@use_local_response
def test_tvspecial(self):

View file

@ -0,0 +1,114 @@
from rq.utils import first
from catalog.common import *
from catalog.models import *
from catalog.sites import *
from catalog.sites.tmdb import *
from django.core.management.base import BaseCommand
from django.core.paginator import Paginator
import pprint
from tqdm import tqdm
import logging
import csv
_logger = logging.getLogger(__name__)
class Command(BaseCommand):
    """
    Link IMDB-keyed TVSeason items that have no parent show to a TVShow.

    An episode -> show mapping is loaded from a tab-separated IMDb data file
    (see https://www.imdb.com/interfaces/). Each row is read as
    ``episode id, parent show id, season number, ...``; presumably this is
    IMDb's ``title.episode`` dataset -- TODO confirm. For every matching
    TVSeason the parent TVShow is looked up in the local catalog first and
    fetched through TMDB otherwise; the season number is corrected when the
    IMDb data disagrees with the stored one.
    """

    help = "Refetch Douban TV Shows"

    # Location of the IMDb data file, relative to the working directory.
    DATA_FILE = "../data.tsv"

    def add_arguments(self, parser):
        """Register the optional ``--minid`` argument (lowest TVSeason id to process)."""
        parser.add_argument("--minid", help="min id to start")

    @staticmethod
    def _load_episode_map(path):
        """
        Parse the IMDb data file at ``path``.

        Returns a tuple ``(episodes, shows)``:
        - ``episodes`` maps an episode IMDB id to ``(show IMDB id, season)``,
          where ``season`` is 0 when the file has ``\\N`` in that column;
        - ``shows`` is the set of every show IMDB id seen as a parent.

        Raises ``ValueError`` if a season column is neither ``\\N`` nor an
        integer, and ``OSError`` if the file cannot be opened.
        """
        episodes = {}
        shows = set()
        # Explicit encoding: do not depend on the platform's default locale.
        with open(path, newline="", encoding="utf-8") as tsv_file:
            reader = csv.reader(tsv_file, delimiter="\t")
            next(reader, None)  # skip the header row; tolerate an empty file
            for row in reader:
                if len(row) < 3:
                    # blank or truncated line: nothing usable in it
                    continue
                episode_id, parent_id, season = row[0], row[1], row[2]
                # A tuple per episode keeps memory use down on large files.
                episodes[episode_id] = (
                    parent_id,
                    int(season) if season != "\\N" else 0,
                )
                shows.add(parent_id)
        return episodes, shows

    @staticmethod
    def _resolve_show(show_imdb):
        """
        Return the TVShow for ``show_imdb``, or ``None`` if it cannot be found.

        The local catalog is queried first. On a miss, TMDB is searched by
        IMDB id and the first TV result is fetched through ``SiteManager``.
        Any error during the TMDB lookup is logged and treated as a miss so
        that one bad record does not abort the whole batch.
        """
        show = TVShow.objects.filter(
            primary_lookup_id_type=IdType.IMDB,
            primary_lookup_id_value=show_imdb,
        ).first()
        if show:
            return show
        res = None
        try:
            res_data = search_tmdb_by_imdb_id(show_imdb)
            if "tv_results" in res_data and len(res_data["tv_results"]) > 0:
                url = f"https://www.themoviedb.org/tv/{res_data['tv_results'][0]['id']}"
                site = SiteManager.get_site_by_url(url)
                res = site.get_resource_ready()
        except Exception as e:  # deliberate best-effort: log and move on
            _logger.warning(e)
        show = res.item if res else None
        if show and not isinstance(show, TVShow):
            # the fetched resource resolved to some other kind of item
            _logger.warning(f"error {show} is not show")
            return None
        return show

    def handle(self, *args, **options):
        """
        Run the command: load the IMDb mapping, then link each orphaned season.

        Prints a summary of counters when done:
        ``fix-show`` (seasons linked), ``fix-season`` (season numbers
        corrected), ``missing-tmdb`` (show not resolvable) and
        ``missing-imdb`` (season's IMDB id absent from the data file).
        """
        self.stdout.write(self.style.SUCCESS("Loading imdb data.tsv"))
        episodes, shows = self._load_episode_map(self.DATA_FILE)
        show_cache = {}  # show IMDB id -> TVShow already resolved in this run
        c = {
            "fix-show": 0,
            "fix-season": 0,
            "missing-tmdb": 0,
            "missing-imdb": 0,
        }
        self.stdout.write(self.style.SUCCESS("Refreshing catalog tv seasons"))
        qs = TVSeason.objects.filter(
            primary_lookup_id_type=IdType.IMDB, show__isnull=True
        ).order_by("id")
        min_id = options.get("minid")
        if min_id:
            qs = qs.filter(id__gte=int(min_id))
        for item in tqdm(qs):
            imdb = item.primary_lookup_id_value
            show_imdb = None
            season = None
            if imdb in episodes:
                # the season is keyed by an episode id: take its parent show
                show_imdb, season = episodes[imdb]
            elif imdb in shows:
                # the season is keyed by the show's own id
                show_imdb = imdb
            if not show_imdb:
                c["missing-imdb"] += 1
                continue
            show = show_cache.get(show_imdb) or self._resolve_show(show_imdb)
            if not show:
                _logger.warning(f"Can't find {show_imdb} in TMDB for {item}")
                c["missing-tmdb"] += 1
                continue
            show_cache[show_imdb] = show
            item.show = show
            _logger.info(f"linked {item} with {show}")
            # season == 0 means "unknown" in the data file; never overwrite with it
            if season and season != item.season_number:
                _logger.warning(
                    f"fix season number for {item} from {item.season_number} to {season}"
                )
                item.season_number = season
                c["fix-season"] += 1
            item.save()
            c["fix-show"] += 1
        self.stdout.write(self.style.SUCCESS("Done"))
        pprint.pp(c)