Map all Douban TV items to seasons and add a script to link seasons to their TV shows
This commit is contained in:
parent
2f97406c6b
commit
3b53d626bc
5 changed files with 200 additions and 91 deletions
|
@ -216,79 +216,61 @@ class DoubanMovie(AbstractSite):
|
|||
}
|
||||
)
|
||||
pd.metadata["preferred_model"] = (
|
||||
("TVShow" if is_series else "Movie") if not season else "TVSeason"
|
||||
"TVSeason" if is_series or episodes or season else "Movie"
|
||||
)
|
||||
|
||||
tmdb_season_id = None
|
||||
if imdb_code:
|
||||
res_data = search_tmdb_by_imdb_id(imdb_code)
|
||||
tmdb_show_id = None
|
||||
if "movie_results" in res_data and len(res_data["movie_results"]) > 0:
|
||||
pd.metadata["preferred_model"] = "Movie"
|
||||
elif "tv_results" in res_data and len(res_data["tv_results"]) > 0:
|
||||
if pd.metadata["preferred_model"] == "TVSeason":
|
||||
"""
|
||||
determine if this Douban Movie item should map to
|
||||
a single season tv show, or
|
||||
first season of multi-season show
|
||||
"""
|
||||
tmdb_show_id = res_data["tv_results"][0]["id"]
|
||||
tmdb_season_id = f"{tmdb_show_id}-1"
|
||||
site = TMDB_TVSeason(TMDB_TVSeason.id_to_url(tmdb_season_id))
|
||||
tmdb_tvseason = site.get_resource_ready().item
|
||||
tmdb_tv = tmdb_tvseason.show
|
||||
if tmdb_tv.season_count == 1:
|
||||
pd.metadata["preferred_model"] = "TVShow"
|
||||
# else:
|
||||
# pd.metadata["preferred_model"] = "TVSeason"
|
||||
# resp = query_tmdb_tv_episode(tmdb_show_id, 1, 1)
|
||||
# imdb_code = resp["external_ids"]["imdb_id"]
|
||||
# _logger.warning(
|
||||
# f"Douban Movie {self.url} re-mapped to imdb episode {imdb_code}"
|
||||
# )
|
||||
elif (
|
||||
"tv_season_results" in res_data
|
||||
and len(res_data["tv_season_results"]) > 0
|
||||
):
|
||||
pd.metadata["preferred_model"] = "TVSeason"
|
||||
tmdb_show_id = res_data["tv_season_results"][0]["show_id"]
|
||||
tmdb_season_id = f"{tmdb_show_id}-{season}"
|
||||
elif (
|
||||
has_movie = (
|
||||
"movie_results" in res_data and len(res_data["movie_results"]) > 0
|
||||
)
|
||||
has_tv = "tv_results" in res_data and len(res_data["tv_results"]) > 0
|
||||
has_episode = (
|
||||
"tv_episode_results" in res_data
|
||||
and len(res_data["tv_episode_results"]) > 0
|
||||
):
|
||||
)
|
||||
if pd.metadata["preferred_model"] == "TVSeason" and has_tv:
|
||||
if pd.metadata.get("season") and pd.metadata.get("season") != 1:
|
||||
_logger.warn(f"{imdb_code} matched imdb tv show, force season 1")
|
||||
pd.metadata["season"] = 1
|
||||
elif pd.metadata["preferred_model"] == "TVSeason" and has_episode:
|
||||
if res_data["tv_episode_results"][0]["episode_number"] != 1:
|
||||
_logger.warning(
|
||||
f"Douban Movie {self.url} IMDB {imdb_code} mapping to non-first episode in a season"
|
||||
)
|
||||
elif res_data["tv_episode_results"][0]["season_number"] == 1:
|
||||
_logger.warning(
|
||||
f"Douban Movie {self.url} IMDB {imdb_code} mapping to first season episode in a season"
|
||||
)
|
||||
elif has_movie:
|
||||
if pd.metadata["preferred_model"] != "Movie":
|
||||
_logger.warn(f"{imdb_code} matched imdb movie, force Movie")
|
||||
pd.metadata["preferred_model"] = "Movie"
|
||||
elif has_tv or has_episode:
|
||||
_logger.warn(f"{imdb_code} matched imdb tv/episode, force TVSeason")
|
||||
pd.metadata["preferred_model"] = "TVSeason"
|
||||
tmdb_show_id = res_data["tv_episode_results"][0]["show_id"]
|
||||
tmdb_season_id = f"{tmdb_show_id}-{season}"
|
||||
# if res_data["tv_episode_results"][0]["episode_number"] != 1:
|
||||
# _logger.warning(
|
||||
# f"Douban Movie {self.url} mapping to unexpected imdb episode {imdb_code}"
|
||||
# )
|
||||
# resp = query_tmdb_tv_episode(
|
||||
# tmdb_show_id,
|
||||
# res_data["tv_episode_results"][0]["season_number"],
|
||||
# 1,
|
||||
# )
|
||||
# imdb_code = resp["external_ids"]["imdb_id"]
|
||||
# _logger.warning(
|
||||
# f"Douban Movie {self.url} re-mapped to imdb episode {imdb_code}"
|
||||
# )
|
||||
else:
|
||||
_logger.warn(f"{imdb_code} unknown to TMDB")
|
||||
|
||||
pd.lookup_ids[IdType.IMDB] = imdb_code
|
||||
if pd.metadata["preferred_model"] == "TVSeason":
|
||||
pd.lookup_ids[IdType.TMDB_TVSeason] = tmdb_season_id
|
||||
elif pd.metadata["preferred_model"] == "TVShow":
|
||||
pd.lookup_ids[IdType.TMDB_TV] = tmdb_show_id
|
||||
|
||||
# if tmdb_show_id:
|
||||
# pd.metadata["required_resources"] = [
|
||||
# {
|
||||
# "model": "TVShow",
|
||||
# "id_type": IdType.TMDB_TV,
|
||||
# "id_value": tmdb_show_id,
|
||||
# "title": title,
|
||||
# "url": TMDB_TV.id_to_url(tmdb_show_id),
|
||||
# }
|
||||
# ]
|
||||
if pd.metadata["preferred_model"] == "TVSeason":
|
||||
tmdb_show_id = None
|
||||
if has_tv:
|
||||
tmdb_show_id = res_data["tv_results"][0]["id"]
|
||||
elif has_episode:
|
||||
tmdb_show_id = res_data["tv_episode_results"][0]["show_id"]
|
||||
if tmdb_show_id:
|
||||
pd.metadata["required_resources"] = [
|
||||
{
|
||||
"model": "TVShow",
|
||||
"id_type": IdType.TMDB_TV,
|
||||
"id_value": tmdb_show_id,
|
||||
"title": title,
|
||||
"url": TMDB_TV.id_to_url(tmdb_show_id),
|
||||
}
|
||||
]
|
||||
# TODO parse sister seasons
|
||||
# pd.metadata['related_resources'] = []
|
||||
if pd.metadata["cover_image_url"]:
|
||||
|
|
|
@ -16,13 +16,17 @@ class IMDB(AbstractSite):
|
|||
WIKI_PROPERTY_ID = "?"
|
||||
|
||||
@classmethod
|
||||
def id_to_url(self, id_value):
|
||||
def id_to_url(cls, id_value):
|
||||
return "https://www.imdb.com/title/" + id_value + "/"
|
||||
|
||||
def scrape(self):
|
||||
self.scraped = False
|
||||
res_data = search_tmdb_by_imdb_id(self.id_value)
|
||||
if "movie_results" in res_data and len(res_data["movie_results"]) > 0:
|
||||
if (
|
||||
"movie_results" in res_data
|
||||
and len(res_data["movie_results"]) > 0
|
||||
and self.DEFAULT_MODEL in [None, Movie]
|
||||
):
|
||||
url = (
|
||||
f"https://www.themoviedb.org/movie/{res_data['movie_results'][0]['id']}"
|
||||
)
|
||||
|
@ -32,7 +36,7 @@ class IMDB(AbstractSite):
|
|||
# this should not happen given IMDB only has ids for either show or episode
|
||||
tv_id = res_data["tv_season_results"][0]["show_id"]
|
||||
season_number = res_data["tv_season_results"][0]["season_number"]
|
||||
url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}/episode/{episode_number}"
|
||||
url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}"
|
||||
elif (
|
||||
"tv_episode_results" in res_data and len(res_data["tv_episode_results"]) > 0
|
||||
):
|
||||
|
|
|
@ -296,7 +296,7 @@ class TMDB_TV(AbstractSite):
|
|||
"single_episode_length": None,
|
||||
"brief": brief,
|
||||
"cover_image_url": img_url,
|
||||
"related_resources": season_links,
|
||||
# "related_resources": season_links, # FIXME not crawling them for now given many douban tv season data has errors
|
||||
}
|
||||
)
|
||||
if imdb_code:
|
||||
|
@ -364,9 +364,9 @@ class TMDB_TVSeason(AbstractSite):
|
|||
{
|
||||
"model": "TVShow",
|
||||
"id_type": IdType.TMDB_TV,
|
||||
"id_value": v[0],
|
||||
"title": f"TMDB TV Show {v[0]}",
|
||||
"url": f"https://www.themoviedb.org/tv/{v[0]}",
|
||||
"id_value": show_id,
|
||||
"title": f"TMDB TV Show {show_id}",
|
||||
"url": f"https://www.themoviedb.org/tv/{show_id}",
|
||||
}
|
||||
]
|
||||
pd.lookup_ids[IdType.IMDB] = d["external_ids"].get("imdb_id")
|
||||
|
@ -394,18 +394,26 @@ class TMDB_TVSeason(AbstractSite):
|
|||
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
|
||||
)
|
||||
|
||||
# get external id from 1st episode
|
||||
# if pd.lookup_ids[IdType.IMDB]:
|
||||
# _logger.warning("Unexpected IMDB id for TMDB tv season")
|
||||
# elif len(pd.metadata["episode_number_list"]) == 0:
|
||||
# _logger.warning(
|
||||
# "Unable to lookup IMDB id for TMDB tv season with zero episodes"
|
||||
# )
|
||||
# else:
|
||||
# ep = pd.metadata["episode_number_list"][0]
|
||||
# api_url2 = f"https://api.themoviedb.org/3/tv/{v[0]}/season/{v[1]}/episode/{ep}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
|
||||
# d2 = BasicDownloader(api_url2).download().json()
|
||||
# if not d2.get("id"):
|
||||
# raise ParseError("episode id for season")
|
||||
# pd.lookup_ids[IdType.IMDB] = d2["external_ids"].get("imdb_id")
|
||||
# return pd
|
||||
# use show's IMDB (for Season 1) or 1st episode's IMDB (if not Season 1) as this season's IMDB so that it can be compatible with TVSeason data from Douban
|
||||
if pd.lookup_ids.get(IdType.IMDB):
|
||||
# this should not happen
|
||||
_logger.warning("Unexpected IMDB id for TMDB tv season")
|
||||
elif pd.metadata.get("season_number") == 1:
|
||||
res = SiteManager.get_site_by_url(
|
||||
f"https://www.themoviedb.org/tv/{show_id}"
|
||||
).get_resource_ready()
|
||||
pd.lookup_ids[IdType.IMDB] = (
|
||||
res.other_lookup_ids.get(IdType.IMDB) if res else None
|
||||
)
|
||||
elif len(pd.metadata["episode_number_list"]) == 0:
|
||||
_logger.warning(
|
||||
"Unable to lookup IMDB id for TMDB tv season with zero episodes"
|
||||
)
|
||||
else:
|
||||
ep = pd.metadata["episode_number_list"][0]
|
||||
api_url2 = f"https://api.themoviedb.org/3/tv/{v[0]}/season/{v[1]}/episode/{ep}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
|
||||
d2 = BasicDownloader(api_url2).download().json()
|
||||
if not d2.get("id"):
|
||||
raise ParseError("first episode id for season")
|
||||
pd.lookup_ids[IdType.IMDB] = d2["external_ids"].get("imdb_id")
|
||||
return pd
|
||||
|
|
|
@ -56,7 +56,7 @@ class TMDBTVSeasonTestCase(TestCase):
|
|||
self.assertEqual(site.id_value, "57243-4")
|
||||
site.get_resource_ready()
|
||||
self.assertEqual(site.ready, True)
|
||||
self.assertEqual(site.resource.metadata["title"], "第 4 季")
|
||||
self.assertEqual(site.resource.metadata["title"], "神秘博士 第 4 季")
|
||||
self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
|
||||
self.assertEqual(site.resource.item.__class__.__name__, "TVSeason")
|
||||
self.assertEqual(site.resource.item.imdb, "tt1159991")
|
||||
|
@ -77,14 +77,15 @@ class DoubanMovieTVTestCase(TestCase):
|
|||
def test_scrape_singleseason(self):
|
||||
url3 = "https://movie.douban.com/subject/26895436/"
|
||||
p3 = SiteManager.get_site_by_url(url3).get_resource_ready()
|
||||
self.assertEqual(p3.item.__class__.__name__, "TVShow")
|
||||
self.assertEqual(p3.item.__class__.__name__, "TVSeason")
|
||||
|
||||
@use_local_response
|
||||
def test_scrape_fix_imdb(self):
|
||||
# this douban links to S6E3, we'll change it to S6E1 to keep consistent
|
||||
url = "https://movie.douban.com/subject/35597581/"
|
||||
item = SiteManager.get_site_by_url(url).get_resource_ready().item
|
||||
# this douban links to S6E3, we'll reset it to S6E1 to keep consistent
|
||||
self.assertEqual(item.imdb, "tt21599650")
|
||||
# disable this test to make douban data less disrupted
|
||||
# self.assertEqual(item.imdb, "tt21599650")
|
||||
|
||||
|
||||
class MultiTVSitesTestCase(TestCase):
|
||||
|
@ -118,8 +119,8 @@ class MultiTVSitesTestCase(TestCase):
|
|||
url3 = "https://movie.douban.com/subject/26895436/"
|
||||
p1 = SiteManager.get_site_by_url(url1).get_resource_ready()
|
||||
p3 = SiteManager.get_site_by_url(url3).get_resource_ready()
|
||||
self.assertEqual(p3.item.__class__.__name__, "TVShow")
|
||||
self.assertEqual(p1.item.id, p3.item.id)
|
||||
self.assertEqual(p3.item.__class__.__name__, "TVSeason")
|
||||
self.assertEqual(p1.item, p3.item.show)
|
||||
|
||||
@use_local_response
|
||||
def test_tvspecial(self):
|
||||
|
|
114
legacy/management/commands/link_tv.py
Normal file
114
legacy/management/commands/link_tv.py
Normal file
|
@ -0,0 +1,114 @@
|
|||
from rq.utils import first
|
||||
from catalog.common import *
|
||||
from catalog.models import *
|
||||
from catalog.sites import *
|
||||
from catalog.sites.tmdb import *
|
||||
from django.core.management.base import BaseCommand
|
||||
from django.core.paginator import Paginator
|
||||
import pprint
|
||||
from tqdm import tqdm
|
||||
import logging
|
||||
import csv
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """
    Link TVSeason items that have no parent show to their TVShow.

    The IMDB episode dataset (an episode -> show mapping, see
    https://www.imdb.com/interfaces/) is loaded into memory.  Every
    TVSeason whose primary lookup id is an IMDB id and whose ``show`` is
    unset is then resolved to the IMDB id of its show, the matching TVShow
    is looked up locally (or fetched through TMDB when missing), and the
    season is linked to it.  If the dataset reports a different season
    number than the one stored on the item, the stored number is corrected.
    """

    help = "Link TV seasons to their TV shows using the IMDB episode dataset"

    # Location used when --datafile is not given (the originally hard-coded path).
    DEFAULT_DATAFILE = "../data.tsv"

    def add_arguments(self, parser):
        """Register the command line options of this command."""
        parser.add_argument("--minid", type=int, help="min id to start")
        parser.add_argument(
            "--datafile",
            default=self.DEFAULT_DATAFILE,
            help="path to the IMDB episode tsv file",
        )

    @staticmethod
    def _load_imdb_episodes(path):
        """
        Read the IMDB episode tsv file at ``path``.

        Returns a tuple ``(episodes, shows)`` where ``episodes`` maps an
        episode IMDB id to ``(show_imdb_id, season_number)`` (season number
        is 0 when the file has ``\\N``) and ``shows`` is the set of all show
        IMDB ids seen as a parent.
        """
        episodes = {}
        shows = set()
        with open(path, newline="", encoding="utf-8") as tsvfile:
            reader = csv.reader(tsvfile, delimiter="\t")
            next(reader, None)  # skip the header row; tolerate an empty file
            for row in reader:
                if len(row) < 3:
                    # malformed / truncated line, nothing usable in it
                    continue
                episode_id, show_id, season = row[0], row[1], row[2]
                # tuples instead of dicts keep memory usage down on this
                # very large dataset
                episodes[episode_id] = (
                    show_id,
                    int(season) if season != "\\N" else 0,
                )
                shows.add(show_id)
        return episodes, shows

    @staticmethod
    def _find_show(show_imdb):
        """
        Return the TVShow whose IMDB id is ``show_imdb``, or None.

        The local catalog is checked first; when the show is not there it is
        searched on TMDB by IMDB id and fetched.  Any error during the TMDB
        lookup is logged and treated as "not found" (best effort).
        """
        show = TVShow.objects.filter(
            primary_lookup_id_type=IdType.IMDB,
            primary_lookup_id_value=show_imdb,
        ).first()
        if show:
            return show

        res = None
        try:
            res_data = search_tmdb_by_imdb_id(show_imdb)
            if "tv_results" in res_data and res_data["tv_results"]:
                url = f"https://www.themoviedb.org/tv/{res_data['tv_results'][0]['id']}"
                res = SiteManager.get_site_by_url(url).get_resource_ready()
        except Exception as e:
            # deliberate best effort: one failing lookup must not abort the run
            _logger.warning("TMDB lookup failed for %s: %s", show_imdb, e)

        show = res.item if res else None
        if show and not isinstance(show, TVShow):
            _logger.warning("error %s is not show", show)
            return None
        return show

    def handle(self, *args, **options):
        """
        Run the command.

        Options read: ``minid`` (only process seasons with id >= minid) and
        ``datafile`` (path of the IMDB episode tsv file).  A summary of
        counters is written to the command's stdout at the end.
        """
        datafile = options.get("datafile") or self.DEFAULT_DATAFILE
        self.stdout.write(self.style.SUCCESS(f"Loading imdb data from {datafile}"))
        episodes, shows = self._load_imdb_episodes(datafile)

        catalog = {}  # show IMDB id -> TVShow already resolved during this run
        counts = {
            "fix-show": 0,
            "fix-season": 0,
            "missing-tmdb": 0,
            "missing-imdb": 0,
        }

        self.stdout.write(self.style.SUCCESS("Refreshing catalog tv seasons"))
        qs = (
            TVSeason.objects.all()
            .order_by("id")
            .filter(primary_lookup_id_type=IdType.IMDB, show__isnull=True)
        )
        if options.get("minid"):
            qs = qs.filter(id__gte=int(options["minid"]))

        for item in tqdm(qs):
            imdb = item.primary_lookup_id_value
            show_imdb = None
            season = None
            if imdb in episodes:
                # the season is identified by the IMDB id of one of its episodes
                show_imdb, season = episodes[imdb]
            elif imdb in shows:
                # the season carries the IMDB id of the show itself
                show_imdb = imdb

            if not show_imdb:
                counts["missing-imdb"] += 1
                continue

            show = catalog.get(show_imdb) or self._find_show(show_imdb)
            if not show:
                _logger.warning("Can't find %s in TMDB for %s", show_imdb, item)
                counts["missing-tmdb"] += 1
                continue

            catalog[show_imdb] = show
            item.show = show
            _logger.info("linked %s with %s", item, show)
            if season and season != item.season_number:
                _logger.warning(
                    "fix season number for %s from %s to %s",
                    item,
                    item.season_number,
                    season,
                )
                item.season_number = season
                counts["fix-season"] += 1
            item.save()
            counts["fix-show"] += 1

        self.stdout.write(self.style.SUCCESS("Done"))
        # write through self.stdout so the summary honours output redirection
        self.stdout.write(pprint.pformat(counts, sort_dicts=False))
|
Loading…
Add table
Reference in a new issue