all douban tv map to season and add script to find links to tv show
This commit is contained in:
parent
2f97406c6b
commit
3b53d626bc
5 changed files with 200 additions and 91 deletions
|
@ -216,79 +216,61 @@ class DoubanMovie(AbstractSite):
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
pd.metadata["preferred_model"] = (
|
pd.metadata["preferred_model"] = (
|
||||||
("TVShow" if is_series else "Movie") if not season else "TVSeason"
|
"TVSeason" if is_series or episodes or season else "Movie"
|
||||||
)
|
)
|
||||||
|
|
||||||
tmdb_season_id = None
|
tmdb_season_id = None
|
||||||
if imdb_code:
|
if imdb_code:
|
||||||
res_data = search_tmdb_by_imdb_id(imdb_code)
|
res_data = search_tmdb_by_imdb_id(imdb_code)
|
||||||
tmdb_show_id = None
|
has_movie = (
|
||||||
if "movie_results" in res_data and len(res_data["movie_results"]) > 0:
|
"movie_results" in res_data and len(res_data["movie_results"]) > 0
|
||||||
pd.metadata["preferred_model"] = "Movie"
|
)
|
||||||
elif "tv_results" in res_data and len(res_data["tv_results"]) > 0:
|
has_tv = "tv_results" in res_data and len(res_data["tv_results"]) > 0
|
||||||
if pd.metadata["preferred_model"] == "TVSeason":
|
has_episode = (
|
||||||
"""
|
|
||||||
determine if this Douban Movie item should map to
|
|
||||||
a single season tv show, or
|
|
||||||
first season of multi-season show
|
|
||||||
"""
|
|
||||||
tmdb_show_id = res_data["tv_results"][0]["id"]
|
|
||||||
tmdb_season_id = f"{tmdb_show_id}-1"
|
|
||||||
site = TMDB_TVSeason(TMDB_TVSeason.id_to_url(tmdb_season_id))
|
|
||||||
tmdb_tvseason = site.get_resource_ready().item
|
|
||||||
tmdb_tv = tmdb_tvseason.show
|
|
||||||
if tmdb_tv.season_count == 1:
|
|
||||||
pd.metadata["preferred_model"] = "TVShow"
|
|
||||||
# else:
|
|
||||||
# pd.metadata["preferred_model"] = "TVSeason"
|
|
||||||
# resp = query_tmdb_tv_episode(tmdb_show_id, 1, 1)
|
|
||||||
# imdb_code = resp["external_ids"]["imdb_id"]
|
|
||||||
# _logger.warning(
|
|
||||||
# f"Douban Movie {self.url} re-mapped to imdb episode {imdb_code}"
|
|
||||||
# )
|
|
||||||
elif (
|
|
||||||
"tv_season_results" in res_data
|
|
||||||
and len(res_data["tv_season_results"]) > 0
|
|
||||||
):
|
|
||||||
pd.metadata["preferred_model"] = "TVSeason"
|
|
||||||
tmdb_show_id = res_data["tv_season_results"][0]["show_id"]
|
|
||||||
tmdb_season_id = f"{tmdb_show_id}-{season}"
|
|
||||||
elif (
|
|
||||||
"tv_episode_results" in res_data
|
"tv_episode_results" in res_data
|
||||||
and len(res_data["tv_episode_results"]) > 0
|
and len(res_data["tv_episode_results"]) > 0
|
||||||
):
|
)
|
||||||
|
if pd.metadata["preferred_model"] == "TVSeason" and has_tv:
|
||||||
|
if pd.metadata.get("season") and pd.metadata.get("season") != 1:
|
||||||
|
_logger.warn(f"{imdb_code} matched imdb tv show, force season 1")
|
||||||
|
pd.metadata["season"] = 1
|
||||||
|
elif pd.metadata["preferred_model"] == "TVSeason" and has_episode:
|
||||||
|
if res_data["tv_episode_results"][0]["episode_number"] != 1:
|
||||||
|
_logger.warning(
|
||||||
|
f"Douban Movie {self.url} IMDB {imdb_code} mapping to non-first episode in a season"
|
||||||
|
)
|
||||||
|
elif res_data["tv_episode_results"][0]["season_number"] == 1:
|
||||||
|
_logger.warning(
|
||||||
|
f"Douban Movie {self.url} IMDB {imdb_code} mapping to first season episode in a season"
|
||||||
|
)
|
||||||
|
elif has_movie:
|
||||||
|
if pd.metadata["preferred_model"] != "Movie":
|
||||||
|
_logger.warn(f"{imdb_code} matched imdb movie, force Movie")
|
||||||
|
pd.metadata["preferred_model"] = "Movie"
|
||||||
|
elif has_tv or has_episode:
|
||||||
|
_logger.warn(f"{imdb_code} matched imdb tv/episode, force TVSeason")
|
||||||
pd.metadata["preferred_model"] = "TVSeason"
|
pd.metadata["preferred_model"] = "TVSeason"
|
||||||
tmdb_show_id = res_data["tv_episode_results"][0]["show_id"]
|
else:
|
||||||
tmdb_season_id = f"{tmdb_show_id}-{season}"
|
_logger.warn(f"{imdb_code} unknown to TMDB")
|
||||||
# if res_data["tv_episode_results"][0]["episode_number"] != 1:
|
|
||||||
# _logger.warning(
|
|
||||||
# f"Douban Movie {self.url} mapping to unexpected imdb episode {imdb_code}"
|
|
||||||
# )
|
|
||||||
# resp = query_tmdb_tv_episode(
|
|
||||||
# tmdb_show_id,
|
|
||||||
# res_data["tv_episode_results"][0]["season_number"],
|
|
||||||
# 1,
|
|
||||||
# )
|
|
||||||
# imdb_code = resp["external_ids"]["imdb_id"]
|
|
||||||
# _logger.warning(
|
|
||||||
# f"Douban Movie {self.url} re-mapped to imdb episode {imdb_code}"
|
|
||||||
# )
|
|
||||||
|
|
||||||
pd.lookup_ids[IdType.IMDB] = imdb_code
|
pd.lookup_ids[IdType.IMDB] = imdb_code
|
||||||
if pd.metadata["preferred_model"] == "TVSeason":
|
|
||||||
pd.lookup_ids[IdType.TMDB_TVSeason] = tmdb_season_id
|
|
||||||
elif pd.metadata["preferred_model"] == "TVShow":
|
|
||||||
pd.lookup_ids[IdType.TMDB_TV] = tmdb_show_id
|
|
||||||
|
|
||||||
# if tmdb_show_id:
|
if pd.metadata["preferred_model"] == "TVSeason":
|
||||||
# pd.metadata["required_resources"] = [
|
tmdb_show_id = None
|
||||||
# {
|
if has_tv:
|
||||||
# "model": "TVShow",
|
tmdb_show_id = res_data["tv_results"][0]["id"]
|
||||||
# "id_type": IdType.TMDB_TV,
|
elif has_episode:
|
||||||
# "id_value": tmdb_show_id,
|
tmdb_show_id = res_data["tv_episode_results"][0]["show_id"]
|
||||||
# "title": title,
|
if tmdb_show_id:
|
||||||
# "url": TMDB_TV.id_to_url(tmdb_show_id),
|
pd.metadata["required_resources"] = [
|
||||||
# }
|
{
|
||||||
# ]
|
"model": "TVShow",
|
||||||
|
"id_type": IdType.TMDB_TV,
|
||||||
|
"id_value": tmdb_show_id,
|
||||||
|
"title": title,
|
||||||
|
"url": TMDB_TV.id_to_url(tmdb_show_id),
|
||||||
|
}
|
||||||
|
]
|
||||||
# TODO parse sister seasons
|
# TODO parse sister seasons
|
||||||
# pd.metadata['related_resources'] = []
|
# pd.metadata['related_resources'] = []
|
||||||
if pd.metadata["cover_image_url"]:
|
if pd.metadata["cover_image_url"]:
|
||||||
|
|
|
@ -16,13 +16,17 @@ class IMDB(AbstractSite):
|
||||||
WIKI_PROPERTY_ID = "?"
|
WIKI_PROPERTY_ID = "?"
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def id_to_url(self, id_value):
|
def id_to_url(cls, id_value):
|
||||||
return "https://www.imdb.com/title/" + id_value + "/"
|
return "https://www.imdb.com/title/" + id_value + "/"
|
||||||
|
|
||||||
def scrape(self):
|
def scrape(self):
|
||||||
self.scraped = False
|
self.scraped = False
|
||||||
res_data = search_tmdb_by_imdb_id(self.id_value)
|
res_data = search_tmdb_by_imdb_id(self.id_value)
|
||||||
if "movie_results" in res_data and len(res_data["movie_results"]) > 0:
|
if (
|
||||||
|
"movie_results" in res_data
|
||||||
|
and len(res_data["movie_results"]) > 0
|
||||||
|
and self.DEFAULT_MODEL in [None, Movie]
|
||||||
|
):
|
||||||
url = (
|
url = (
|
||||||
f"https://www.themoviedb.org/movie/{res_data['movie_results'][0]['id']}"
|
f"https://www.themoviedb.org/movie/{res_data['movie_results'][0]['id']}"
|
||||||
)
|
)
|
||||||
|
@ -32,7 +36,7 @@ class IMDB(AbstractSite):
|
||||||
# this should not happen given IMDB only has ids for either show or episode
|
# this should not happen given IMDB only has ids for either show or episode
|
||||||
tv_id = res_data["tv_season_results"][0]["show_id"]
|
tv_id = res_data["tv_season_results"][0]["show_id"]
|
||||||
season_number = res_data["tv_season_results"][0]["season_number"]
|
season_number = res_data["tv_season_results"][0]["season_number"]
|
||||||
url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}/episode/{episode_number}"
|
url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}"
|
||||||
elif (
|
elif (
|
||||||
"tv_episode_results" in res_data and len(res_data["tv_episode_results"]) > 0
|
"tv_episode_results" in res_data and len(res_data["tv_episode_results"]) > 0
|
||||||
):
|
):
|
||||||
|
|
|
@ -296,7 +296,7 @@ class TMDB_TV(AbstractSite):
|
||||||
"single_episode_length": None,
|
"single_episode_length": None,
|
||||||
"brief": brief,
|
"brief": brief,
|
||||||
"cover_image_url": img_url,
|
"cover_image_url": img_url,
|
||||||
"related_resources": season_links,
|
# "related_resources": season_links, # FIXME not crawling them for now given many douban tv season data has errors
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
if imdb_code:
|
if imdb_code:
|
||||||
|
@ -364,9 +364,9 @@ class TMDB_TVSeason(AbstractSite):
|
||||||
{
|
{
|
||||||
"model": "TVShow",
|
"model": "TVShow",
|
||||||
"id_type": IdType.TMDB_TV,
|
"id_type": IdType.TMDB_TV,
|
||||||
"id_value": v[0],
|
"id_value": show_id,
|
||||||
"title": f"TMDB TV Show {v[0]}",
|
"title": f"TMDB TV Show {show_id}",
|
||||||
"url": f"https://www.themoviedb.org/tv/{v[0]}",
|
"url": f"https://www.themoviedb.org/tv/{show_id}",
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
pd.lookup_ids[IdType.IMDB] = d["external_ids"].get("imdb_id")
|
pd.lookup_ids[IdType.IMDB] = d["external_ids"].get("imdb_id")
|
||||||
|
@ -394,18 +394,26 @@ class TMDB_TVSeason(AbstractSite):
|
||||||
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
|
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
|
||||||
)
|
)
|
||||||
|
|
||||||
# get external id from 1st episode
|
# use show's IMDB (for Season 1) or 1st episode's IMDB (if not Season 1) as this season's IMDB so that it can be compatible with TVSeason data from Douban
|
||||||
# if pd.lookup_ids[IdType.IMDB]:
|
if pd.lookup_ids.get(IdType.IMDB):
|
||||||
# _logger.warning("Unexpected IMDB id for TMDB tv season")
|
# this should not happen
|
||||||
# elif len(pd.metadata["episode_number_list"]) == 0:
|
_logger.warning("Unexpected IMDB id for TMDB tv season")
|
||||||
# _logger.warning(
|
elif pd.metadata.get("season_number") == 1:
|
||||||
# "Unable to lookup IMDB id for TMDB tv season with zero episodes"
|
res = SiteManager.get_site_by_url(
|
||||||
# )
|
f"https://www.themoviedb.org/tv/{show_id}"
|
||||||
# else:
|
).get_resource_ready()
|
||||||
# ep = pd.metadata["episode_number_list"][0]
|
pd.lookup_ids[IdType.IMDB] = (
|
||||||
# api_url2 = f"https://api.themoviedb.org/3/tv/{v[0]}/season/{v[1]}/episode/{ep}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
|
res.other_lookup_ids.get(IdType.IMDB) if res else None
|
||||||
# d2 = BasicDownloader(api_url2).download().json()
|
)
|
||||||
# if not d2.get("id"):
|
elif len(pd.metadata["episode_number_list"]) == 0:
|
||||||
# raise ParseError("episode id for season")
|
_logger.warning(
|
||||||
# pd.lookup_ids[IdType.IMDB] = d2["external_ids"].get("imdb_id")
|
"Unable to lookup IMDB id for TMDB tv season with zero episodes"
|
||||||
# return pd
|
)
|
||||||
|
else:
|
||||||
|
ep = pd.metadata["episode_number_list"][0]
|
||||||
|
api_url2 = f"https://api.themoviedb.org/3/tv/{v[0]}/season/{v[1]}/episode/{ep}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
|
||||||
|
d2 = BasicDownloader(api_url2).download().json()
|
||||||
|
if not d2.get("id"):
|
||||||
|
raise ParseError("first episode id for season")
|
||||||
|
pd.lookup_ids[IdType.IMDB] = d2["external_ids"].get("imdb_id")
|
||||||
|
return pd
|
||||||
|
|
|
@ -56,7 +56,7 @@ class TMDBTVSeasonTestCase(TestCase):
|
||||||
self.assertEqual(site.id_value, "57243-4")
|
self.assertEqual(site.id_value, "57243-4")
|
||||||
site.get_resource_ready()
|
site.get_resource_ready()
|
||||||
self.assertEqual(site.ready, True)
|
self.assertEqual(site.ready, True)
|
||||||
self.assertEqual(site.resource.metadata["title"], "第 4 季")
|
self.assertEqual(site.resource.metadata["title"], "神秘博士 第 4 季")
|
||||||
self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
|
self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
|
||||||
self.assertEqual(site.resource.item.__class__.__name__, "TVSeason")
|
self.assertEqual(site.resource.item.__class__.__name__, "TVSeason")
|
||||||
self.assertEqual(site.resource.item.imdb, "tt1159991")
|
self.assertEqual(site.resource.item.imdb, "tt1159991")
|
||||||
|
@ -77,14 +77,15 @@ class DoubanMovieTVTestCase(TestCase):
|
||||||
def test_scrape_singleseason(self):
|
def test_scrape_singleseason(self):
|
||||||
url3 = "https://movie.douban.com/subject/26895436/"
|
url3 = "https://movie.douban.com/subject/26895436/"
|
||||||
p3 = SiteManager.get_site_by_url(url3).get_resource_ready()
|
p3 = SiteManager.get_site_by_url(url3).get_resource_ready()
|
||||||
self.assertEqual(p3.item.__class__.__name__, "TVShow")
|
self.assertEqual(p3.item.__class__.__name__, "TVSeason")
|
||||||
|
|
||||||
@use_local_response
|
@use_local_response
|
||||||
def test_scrape_fix_imdb(self):
|
def test_scrape_fix_imdb(self):
|
||||||
|
# this douban links to S6E3, we'll change it to S6E1 to keep consistent
|
||||||
url = "https://movie.douban.com/subject/35597581/"
|
url = "https://movie.douban.com/subject/35597581/"
|
||||||
item = SiteManager.get_site_by_url(url).get_resource_ready().item
|
item = SiteManager.get_site_by_url(url).get_resource_ready().item
|
||||||
# this douban links to S6E3, we'll reset it to S6E1 to keep consistent
|
# disable this test to make douban data less disrupted
|
||||||
self.assertEqual(item.imdb, "tt21599650")
|
# self.assertEqual(item.imdb, "tt21599650")
|
||||||
|
|
||||||
|
|
||||||
class MultiTVSitesTestCase(TestCase):
|
class MultiTVSitesTestCase(TestCase):
|
||||||
|
@ -118,8 +119,8 @@ class MultiTVSitesTestCase(TestCase):
|
||||||
url3 = "https://movie.douban.com/subject/26895436/"
|
url3 = "https://movie.douban.com/subject/26895436/"
|
||||||
p1 = SiteManager.get_site_by_url(url1).get_resource_ready()
|
p1 = SiteManager.get_site_by_url(url1).get_resource_ready()
|
||||||
p3 = SiteManager.get_site_by_url(url3).get_resource_ready()
|
p3 = SiteManager.get_site_by_url(url3).get_resource_ready()
|
||||||
self.assertEqual(p3.item.__class__.__name__, "TVShow")
|
self.assertEqual(p3.item.__class__.__name__, "TVSeason")
|
||||||
self.assertEqual(p1.item.id, p3.item.id)
|
self.assertEqual(p1.item, p3.item.show)
|
||||||
|
|
||||||
@use_local_response
|
@use_local_response
|
||||||
def test_tvspecial(self):
|
def test_tvspecial(self):
|
||||||
|
|
114
legacy/management/commands/link_tv.py
Normal file
114
legacy/management/commands/link_tv.py
Normal file
|
@ -0,0 +1,114 @@
|
||||||
|
from rq.utils import first
|
||||||
|
from catalog.common import *
|
||||||
|
from catalog.models import *
|
||||||
|
from catalog.sites import *
|
||||||
|
from catalog.sites.tmdb import *
|
||||||
|
from django.core.management.base import BaseCommand
|
||||||
|
from django.core.paginator import Paginator
|
||||||
|
import pprint
|
||||||
|
from tqdm import tqdm
|
||||||
|
import logging
|
||||||
|
import csv
|
||||||
|
|
||||||
|
_logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class Command(BaseCommand):
    """Link catalog TV seasons that have no parent show to their TV show.

    The command reads an IMDb episode dataset (``../data.tsv``, see
    https://www.imdb.com/interfaces/) whose rows are read as
    ``episode id, parent show id, season number, episode number``.
    For every ``TVSeason`` whose primary lookup id is an IMDB id and whose
    ``show`` is unset, it determines the IMDB id of the parent show, finds
    (or fetches via TMDB) the matching ``TVShow`` and stores the link.
    When the dataset reports a non-zero season number different from the
    one stored on the season, the stored season number is corrected too.
    """

    help = "Refetch Douban TV Shows"

    def add_arguments(self, parser):
        """Register the optional ``--minid`` argument (lowest season id to process)."""
        parser.add_argument("--minid", help="min id to start")

    @staticmethod
    def _load_imdb_episodes(path):
        """Parse the IMDb episode TSV at *path*.

        Returns a ``(episodes, shows)`` tuple: ``episodes`` maps an episode
        IMDB id to a dict with ``parent``, ``season`` and ``episode`` keys
        (``\\N`` placeholders become ``0``); ``shows`` is the set of all
        parent show IMDB ids seen in the file.
        """
        episodes = {}
        shows = set()
        # Explicit encoding so parsing does not depend on the host locale.
        with open(path, newline="", encoding="utf-8") as csvfile:
            reader = csv.reader(csvfile, delimiter="\t")
            next(reader, None)  # skip the header row; tolerate an empty file
            for row in reader:
                episodes[row[0]] = {
                    "parent": row[1],
                    "season": int(row[2]) if row[2] != "\\N" else 0,
                    "episode": int(row[3]) if row[3] != "\\N" else 0,
                }
                shows.add(row[1])
        return episodes, shows

    @staticmethod
    def _find_show(show_imdb):
        """Return the ``TVShow`` for *show_imdb*, or ``None`` when unavailable.

        Looks in the local catalog first; if nothing is found, searches TMDB
        by IMDB id and fetches the first TV result. Any failure during the
        remote lookup is logged and treated as "not found" so one bad item
        does not abort the whole run.
        """
        show = TVShow.objects.filter(
            primary_lookup_id_type=IdType.IMDB,
            primary_lookup_id_value=show_imdb,
        ).first()
        if show:
            return show

        res = None
        try:
            res_data = search_tmdb_by_imdb_id(show_imdb)
            if "tv_results" in res_data and len(res_data["tv_results"]) > 0:
                url = f"https://www.themoviedb.org/tv/{res_data['tv_results'][0]['id']}"
                site = SiteManager.get_site_by_url(url)
                res = site.get_resource_ready()
        except Exception as e:
            # Deliberate best-effort: log and carry on with the next season.
            _logger.warning(e)

        show = res.item if res else None
        if show and not isinstance(show, TVShow):
            _logger.warning(f"error {show} is not show")
            return None
        return show

    def handle(self, *args, **options):
        """Run the linking pass and print a summary of the counters."""
        self.stdout.write(self.style.SUCCESS("Loading imdb data.tsv"))
        episodes, shows = self._load_imdb_episodes("../data.tsv")
        catalog = {}  # show IMDB id -> TVShow already resolved in this run
        c = {
            "fix-show": 0,
            "fix-season": 0,
            "missing-tmdb": 0,
            "missing-imdb": 0,
        }

        self.stdout.write(self.style.SUCCESS("Refreshing catalog tv seasons"))
        qs = (
            TVSeason.objects.all()
            .order_by("id")
            .filter(primary_lookup_id_type=IdType.IMDB, show__isnull=True)
        )
        if options["minid"]:
            qs = qs.filter(id__gte=int(options["minid"]))

        for item in tqdm(qs):
            imdb = item.primary_lookup_id_value
            show_imdb = None
            season = None
            if imdb in episodes:
                # The season is keyed by one of its episodes' IMDB ids.
                show_imdb = episodes[imdb]["parent"]
                season = episodes[imdb]["season"]
            elif imdb in shows:
                # The season is keyed by the show's own IMDB id.
                show_imdb = imdb

            if not show_imdb:
                c["missing-imdb"] += 1
                continue

            show = catalog.get(show_imdb) or self._find_show(show_imdb)
            if not show:
                _logger.warning(f"Can't find {show_imdb} in TMDB for {item}")
                c["missing-tmdb"] += 1
                continue

            catalog[show_imdb] = show
            item.show = show
            _logger.info(f"linked {item} with {show}")
            # A season of 0 means the dataset had no season number; keep ours.
            if season and season != item.season_number:
                _logger.warning(
                    f"fix season number for {item} from {item.season_number} to {season}"
                )
                item.season_number = season
                c["fix-season"] += 1
            item.save()
            c["fix-show"] += 1

        self.stdout.write(self.style.SUCCESS("Done"))
        pprint.pp(c)
|
Loading…
Add table
Reference in a new issue