add Douban Book works; add Goodreads works; auto-link season to show

This commit is contained in:
Your Name 2022-12-08 05:53:00 +00:00
parent 9e219bfac9
commit 3a95e5fc5c
16 changed files with 15041 additions and 67 deletions


@@ -47,15 +47,14 @@ class Edition(Item):
def update_linked_items_from_extenal_page(self, page):
"""add Work from page.metadata['work'] if not yet"""
w = page.metadata.get('work', None)
if w:
work = Work.objects.filter(primary_lookup_id_type=w['lookup_id_type'], primary_lookup_id_value=w['lookup_id_value']).first()
if work:
if any(edition == self for edition in work.editions.all()):
return
else:
work = Work.objects.create(primary_lookup_id_type=w['lookup_id_type'], primary_lookup_id_value=w['lookup_id_value'], title=w['title'])
work.editions.add(self)
links = page.required_pages + page.related_pages
for w in links:
if w['model'] == 'Work':
work = Work.objects.filter(primary_lookup_id_type=w['id_type'], primary_lookup_id_value=w['id_value']).first()
if work and work not in self.works.all():
self.works.add(work)
# if not work:
# logger.info(f'Unable to find link for {w["url"]}')
class Work(Item):
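The rewritten loop above consumes the `required_pages` / `related_pages` entries that each scraper now emits. A minimal sketch of the entry shape it expects; keys are taken from the diff, values are illustrative (borrowed from the Goodreads test below):

# One linked-page entry as read by update_linked_items_from_extenal_page();
# the same keys are built by the Douban and Goodreads scrapers further down.
work_link = {
    'model': 'Work',                  # target model name checked by the loop
    'id_type': IdType.Goodreads_Work,
    'id_value': '1383900',            # illustrative value from the test below
    'title': 'Hyperion',
    'url': 'https://www.goodreads.com/work/editions/1383900',
}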


@@ -78,8 +78,7 @@ class GoodreadsTestCase(TestCase):
self.assertEqual(site.ready, True)
self.assertEqual(site.page.metadata.get('title'), 'Hyperion')
self.assertEqual(site.page.metadata.get('isbn'), isbn)
self.assertEqual(site.page.metadata['work']['lookup_id_value'], '1383900')
self.assertEqual(site.page.metadata['work']['title'], 'Hyperion')
self.assertEqual(site.page.required_pages[0]['id_value'], '1383900')
edition = Edition.objects.get(primary_lookup_id_type=IdType.ISBN, primary_lookup_id_value=isbn)
page = edition.external_pages.all().first()
self.assertEqual(page.id_type, IdType.Goodreads)
@@ -105,19 +104,19 @@ class GoodreadsTestCase(TestCase):
@use_local_response
def test_work(self):
# url = 'https://www.goodreads.com/work/editions/153313'
url = 'https://www.goodreads.com/work/editions/153313'
p = SiteList.get_site_by_url(url).get_page_ready()
self.assertEqual(p.item.title, '1984')
url1 = 'https://www.goodreads.com/book/show/3597767-rok-1984'
url2 = 'https://www.goodreads.com/book/show/40961427-1984'
p1 = SiteList.get_site_by_url(url1).get_page_ready()
p2 = SiteList.get_site_by_url(url2).get_page_ready()
w1 = p1.item.works.all().first()
w2 = p2.item.works.all().first()
self.assertEqual(w1.title, 'Nineteen Eighty-Four')
self.assertEqual(w2.title, 'Nineteen Eighty-Four')
self.assertEqual(w1, w2)
class DoubanTestCase(TestCase):
class DoubanBookTestCase(TestCase):
def setUp(self):
pass


@@ -1,5 +1,6 @@
from polymorphic.models import PolymorphicModel
from django.db import models
from catalog.common import jsondata
from django.utils.translation import gettext_lazy as _
from django.utils import timezone
from django.core.files.uploadedfile import SimpleUploadedFile
@@ -220,6 +221,8 @@ class ExternalPage(models.Model):
scraped_time = models.DateTimeField(null=True)
created_time = models.DateTimeField(auto_now_add=True)
edited_time = models.DateTimeField(auto_now=True)
required_pages = jsondata.ArrayField(null=False, blank=False, default=list)
related_pages = jsondata.ArrayField(null=False, blank=False, default=list)
class Meta:
unique_together = [['id_type', 'id_value']]
@@ -237,7 +240,7 @@ class ExternalPage(models.Model):
@property
def ready(self):
return bool(self.metadata)
return bool(self.metadata and self.scraped_time)
def get_all_lookup_ids(self):
d = self.other_lookup_ids.copy()
@@ -254,11 +257,3 @@ class ExternalPage(models.Model):
else:
raise ValueError(f'preferred model {model} does not exist')
return None
def get_dependent_urls(self):
ll = self.metadata.get('dependent_urls')
return ll if ll else []
def get_related_urls(self):
ll = self.metadata.get('related_urls')
return ll if ll else []
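With `required_pages` and `related_pages` persisted as array fields (replacing the ad-hoc `get_dependent_urls` / `get_related_urls` helpers removed above), callers read links straight off the page. A short usage sketch, assuming a page has already been scraped; the id value is illustrative:

page = ExternalPage.objects.filter(id_type=IdType.Goodreads, id_value='77566').first()
if page and page.ready:  # ready now requires scraped_time as well as metadata
    for link in page.required_pages + page.related_pages:
        print(link['model'], link['url'])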


@@ -2,6 +2,10 @@ from typing import *
import re
from .models import ExternalPage
from dataclasses import dataclass, field
import logging
logger = logging.getLogger(__name__)
@dataclass
@@ -50,6 +54,10 @@ class AbstractSite:
self.page = ExternalPage(id_type=self.ID_TYPE, id_value=self.id_value, url=self.url)
return self.page
def bypass_scrape(self, data_from_link) -> PageData | None:
"""subclass may implement this to use data from linked page and bypass actual scrape"""
return None
def scrape(self) -> PageData:
"""subclass should implement this, return PageData object"""
data = PageData()
@@ -76,7 +84,7 @@ class AbstractSite:
def ready(self):
return bool(self.page and self.page.ready)
def get_page_ready(self, auto_save=True, auto_create=True, auto_link=True):
def get_page_ready(self, auto_save=True, auto_create=True, auto_link=True, data_from_link=None):
"""return a page scraped, or scrape if not yet"""
if auto_link:
auto_create = True
@@ -87,7 +95,9 @@ class AbstractSite:
if not self.page:
return None
if not p.ready:
pagedata = self.scrape()
pagedata = self.bypass_scrape(data_from_link)
if not pagedata:
pagedata = self.scrape()
p.update_content(pagedata)
if not p.ready:
logger.error(f'unable to get page {self.url} ready')
@@ -100,14 +110,16 @@ class AbstractSite:
p.item.merge_data_from_extenal_pages()
p.item.save()
if auto_link:
# TODO rewrite this
for linked_pages in p.required_pages:
linked_site = SiteList.get_site_by_url(linked_pages['url'])
if linked_site:
linked_site.get_page_ready(auto_link=False)
else:
logger.error(f'unable to get site for {linked_pages["url"]}')
p.item.update_linked_items_from_extenal_page(p)
p.item.save()
return p
def get_dependent_pages_ready(self, urls):
# set depth = 2 so that e.g. a Douban season can find an IMDB episode and then a TMDB series
pass
class SiteList:
registry = {}
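Taken together, `get_page_ready` resolves a page, saves its item, then walks `required_pages` to pull linked pages in; `bypass_scrape` lets a linked site reuse data the referring page already carries instead of fetching. A sketch of the intended flow (the Douban URL is illustrative, and note the auto-link loop above does not yet pass `data_from_link` itself):

site = SiteList.get_site_by_url('https://book.douban.com/subject/1089243/')  # illustrative URL
page = site.get_page_ready()  # scrape, save item, then resolve required_pages
link = page.required_pages[0]
linked_site = SiteList.get_site_by_url(link['url'])
# a site implementing bypass_scrape() can then skip its own network fetch:
linked_page = linked_site.get_page_ready(auto_link=False, data_from_link=link)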


@@ -108,16 +108,16 @@ class DoubanBook(AbstractSite, ScraperMixin):
translators = None
self.data['translators'] = translators
self.data['work'] = {}
work_link = self.parse_str('//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href')
if work_link:
# TODO move logic to a different class
r = re.match(r'\w+://book.douban.com/works/(\d+)', work_link)
self.data['work']['lookup_id_type'] = IdType.DoubanBook_Work
self.data['work']['lookup_id_value'] = r[1] if r else None
self.data['work']['title'] = self.data['title']
self.data['work']['url'] = work_link
self.data['required_pages'] = [{
'model': 'Work',
'id_type': IdType.DoubanBook_Work,
'id_value': r[1] if r else None,
'title': self.data['title'],
'url': work_link,
}]
pd = PageData(metadata=self.data)
pd.lookup_ids[IdType.ISBN] = self.data.get('isbn')
pd.lookup_ids[IdType.CUBN] = self.data.get('cubn')
@@ -129,3 +129,34 @@ class DoubanBook(AbstractSite, ScraperMixin):
except Exception:
logger.debug(f'failed to download cover for {self.url} from {self.data["cover_image_url"]}')
return pd
@SiteList.register
class DoubanBook_Work(AbstractSite):
ID_TYPE = IdType.DoubanBook_Work
URL_PATTERNS = [r"\w+://book\.douban\.com/works/(\d+)"]
WIKI_PROPERTY_ID = '?'
DEFAULT_MODEL = Work
@classmethod
def id_to_url(cls, id_value):
return "https://book.douban.com/works/" + id_value + "/"
def bypass_scrape(self, data_from_link):
if not data_from_link:
return None
pd = PageData(metadata={
'title': data_from_link['title'],
})
return pd
def scrape(self):
content = html.fromstring(DoubanDownloader(self.url).download().text.strip())
title_elem = content.xpath("//h1/text()")
title = title_elem[0].split('全部版本(')[0].strip() if title_elem else None
if not title:
raise ParseError(self, 'title')
pd = PageData(metadata={
'title': title,
})
return pd
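A quick check of the bypass path for the new site; the works id and title are illustrative:

site = SiteList.get_site_by_url('https://book.douban.com/works/1008677/')  # illustrative id
pd = site.bypass_scrape({'title': '1984'})
assert pd.metadata['title'] == '1984'  # no network fetch when link data suffices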


@@ -6,6 +6,7 @@ from catalog.tv.models import *
import logging
from django.db import models
from django.utils.translation import gettext_lazy as _
from .tmdb import TMDB_TV, search_tmdb_by_imdb_id
logger = logging.getLogger(__name__)
@@ -238,19 +239,33 @@ class DoubanMovie(AbstractSite):
})
pd.metadata['preferred_model'] = ('TVSeason' if season else 'TVShow') if is_series else 'Movie'
# tmdb_api_url = f"https://api.themoviedb.org/3/find/{self.imdb_code}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&external_source=imdb_id"
# res_data = BasicDownloader(tmdb_api_url).download().json()
# if 'movie_results' in res_data and len(res_data['movie_results']) > 0:
# pd.metadata['preferred_model'] = 'Movie'
# elif 'tv_results' in res_data and len(res_data['tv_results']) > 0:
# pd.metadata['preferred_model'] = 'TVShow'
# elif 'tv_season_results' in res_data and len(res_data['tv_season_results']) > 0:
# pd.metadata['preferred_model'] = 'TVSeason'
# elif 'tv_episode_results' in res_data and len(res_data['tv_episode_results']) > 0:
# pd.metadata['preferred_model'] = 'TVSeason'
if imdb_code:
res_data = search_tmdb_by_imdb_id(imdb_code)
tmdb_show_id = None
if 'movie_results' in res_data and len(res_data['movie_results']) > 0:
pd.metadata['preferred_model'] = 'Movie'
elif 'tv_results' in res_data and len(res_data['tv_results']) > 0:
pd.metadata['preferred_model'] = 'TVShow'
elif 'tv_season_results' in res_data and len(res_data['tv_season_results']) > 0:
pd.metadata['preferred_model'] = 'TVSeason'
tmdb_show_id = res_data['tv_season_results'][0]['show_id']
elif 'tv_episode_results' in res_data and len(res_data['tv_episode_results']) > 0:
pd.metadata['preferred_model'] = 'TVSeason'
tmdb_show_id = res_data['tv_episode_results'][0]['show_id']
if res_data['tv_episode_results'][0]['episode_number'] != 1:
logger.error(f'Douban Movie {self.url} mapping to unexpected imdb episode {imdb_code}')
# TODO correct the IMDB id
pd.lookup_ids[IdType.IMDB] = imdb_code
if tmdb_show_id:
pd.metadata['required_pages'] = [{
'model': 'TVShow',
'id_type': IdType.TMDB_TV,
'id_value': tmdb_show_id,
'title': title,
'url': TMDB_TV.id_to_url(tmdb_show_id),
}]
# TODO parse sister seasons
# pd.metadata['related_pages'] = []
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
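The branch ladder above keys off whichever bucket of TMDB's /find response is non-empty; condensed, the mapping looks like this (bucket names per TMDB API v3, helper name hypothetical):

def preferred_model_and_show(res_data):
    """Map a TMDB /find response to (preferred_model, tmdb_show_id)."""
    if res_data.get('movie_results'):
        return 'Movie', None
    if res_data.get('tv_results'):
        return 'TVShow', None
    if res_data.get('tv_season_results'):
        return 'TVSeason', res_data['tv_season_results'][0]['show_id']
    if res_data.get('tv_episode_results'):
        return 'TVSeason', res_data['tv_episode_results'][0]['show_id']
    return None, None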


@@ -1,12 +1,12 @@
import re
from catalog.book.models import Edition
from catalog.book.models import Edition, Work
from catalog.common import *
from lxml import html
import json
import logging
logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)
class GoodreadsDownloader(RetryDownloader):
@@ -64,14 +64,15 @@ class Goodreads(AbstractSite):
data['asin'] = asin
data['pages'] = b['details'].get('numPages')
data['cover_image_url'] = b['imageUrl']
data['work'] = {}
w = next(filter(lambda x: x.get('details'), o['Work']), None)
if w:
data['work']['lookup_id_type'] = IdType.Goodreads_Work
data['work']['lookup_id_value'] = str(w['legacyId'])
data['work']['title'] = w['details']['originalTitle']
data['work']['url'] = w['details']['webUrl']
data['required_pages'] = [{
'model': 'Work',
'id_type': IdType.Goodreads_Work,
'id_value': str(w['legacyId']),
'title': w['details']['originalTitle'],
'url': w['editions']['webUrl'],
}]
pd = PageData(metadata=data)
pd.lookup_ids[IdType.ISBN] = data.get('isbn')
pd.lookup_ids[IdType.ASIN] = data.get('asin')
@@ -81,5 +82,34 @@ class Goodreads(AbstractSite):
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
logger.debug(f'failed to download cover for {self.url} from {data["cover_image_url"]}')
_logger.debug(f'failed to download cover for {self.url} from {data["cover_image_url"]}')
return pd
@SiteList.register
class Goodreads_Work(AbstractSite):
ID_TYPE = IdType.Goodreads_Work
WIKI_PROPERTY_ID = ''
DEFAULT_MODEL = Work
URL_PATTERNS = [r".+goodreads.com/work/editions/(\d+)"]
@classmethod
def id_to_url(cls, id_value):
return "https://www.goodreads.com/work/editions/" + id_value
def scrape(self, response=None):
content = html.fromstring(BasicDownloader(self.url).download().text.strip())
title_elem = content.xpath("//h1/a/text()")
title = title_elem[0].strip() if title_elem else None
if not title:
raise ParseError(self, 'title')
author_elem = content.xpath("//h2/a/text()")
author = author_elem[0].strip() if author_elem else None
first_published_elem = content.xpath("//h2/span/text()")
first_published = first_published_elem[0].strip() if first_published_elem else None
pd = PageData(metadata={
'title': title,
'author': author,
'first_published': first_published
})
return pd
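A hedged round trip for the new work site, using the work id from `test_work` above:

url = Goodreads_Work.id_to_url('153313')  # -> https://www.goodreads.com/work/editions/153313
site = SiteList.get_site_by_url(url)
page = site.get_page_ready(auto_link=False)
print(page.metadata.get('title'), page.metadata.get('first_published'))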


@@ -1,6 +1,5 @@
from django.conf import settings
from catalog.common import *
from .douban import *
from .tmdb import search_tmdb_by_imdb_id
from catalog.movie.models import *
from catalog.tv.models import *
import logging
@@ -21,8 +20,7 @@ class IMDB(AbstractSite):
def scrape(self):
self.scraped = False
api_url = f"https://api.themoviedb.org/3/find/{self.id_value}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&external_source=imdb_id"
res_data = BasicDownloader(api_url).download().json()
res_data = search_tmdb_by_imdb_id(self.id_value)
if 'movie_results' in res_data and len(res_data['movie_results']) > 0:
url = f"https://www.themoviedb.org/movie/{res_data['movie_results'][0]['id']}"
elif 'tv_results' in res_data and len(res_data['tv_results']) > 0:


@@ -14,6 +14,12 @@ import logging
logger = logging.getLogger(__name__)
def search_tmdb_by_imdb_id(imdb_id):
tmdb_api_url = f"https://api.themoviedb.org/3/find/{imdb_id}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&external_source=imdb_id"
res_data = BasicDownloader(tmdb_api_url).download().json()
return res_data
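# /find fills at most one of its *_results buckets for a given external id;
# e.g. search_tmdb_by_imdb_id('tt0436992') (the show id asserted in the tv
# tests below) comes back under 'tv_results', as in the local fixture added
# in this commit.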
def _copy_dict(s, key_map):
d = {}
for src, dst in key_map.items():
@@ -61,7 +67,7 @@ class TMDB_Movie(AbstractSite):
@classmethod
def id_to_url(cls, id_value):
return "https://www.themoviedb.org/movie/" + id_value
return f"https://www.themoviedb.org/movie/{id_value}"
def scrape(self):
is_series = False
@@ -162,7 +168,7 @@ class TMDB_TV(AbstractSite):
@classmethod
def id_to_url(cls, id_value):
return "https://www.themoviedb.org/tv/" + id_value
return f"https://www.themoviedb.org/tv/{id_value}"
def scrape(self):
is_series = True
@@ -221,6 +227,12 @@ class TMDB_TV(AbstractSite):
# TODO: use GET /configuration to get base url
img_url = ('https://image.tmdb.org/t/p/original/' + res_data['poster_path']) if res_data['poster_path'] is not None else None
season_links = [{
    'model': 'TVSeason',
    'id_type': IdType.TMDB_TVSeason,
    'id_value': f'{self.id_value}-{s["season_number"]}',
    'title': s['name'],
    'url': f'{self.url}/season/{s["season_number"]}',
} for s in res_data['seasons']]
pd = PageData(metadata={
'title': title,
'orig_title': orig_title,
@@ -241,9 +253,11 @@ class TMDB_TV(AbstractSite):
'single_episode_length': None,
'brief': brief,
'cover_image_url': img_url,
'related_pages': season_links,
})
if imdb_code:
pd.lookup_ids[IdType.IMDB] = imdb_code
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
@@ -279,6 +293,13 @@ class TMDB_TVSeason(AbstractSite):
if not d.get('id'):
raise ParseError(self, 'id')
pd = PageData(metadata=_copy_dict(d, {'name': 'title', 'overview': 'brief', 'air_date': 'air_date', 'season_number': 0, 'external_ids': 0}))
pd.metadata['required_pages'] = [{
'model': 'TVShow',
'id_type': IdType.TMDB_TV,
'id_value': v[0],
'title': f'TMDB TV Show {v[0]}',
'url': f"https://www.themoviedb.org/tv/{v[0]}",
}]
pd.lookup_ids[IdType.IMDB] = d['external_ids'].get('imdb_id')
pd.metadata['cover_image_url'] = ('https://image.tmdb.org/t/p/original/' + d['poster_path']) if d['poster_path'] else None
pd.metadata['title'] = pd.metadata['title'] if pd.metadata['title'] else f'Season {d["season_number"]}'
@@ -295,7 +316,7 @@ class TMDB_TVSeason(AbstractSite):
# get external id from 1st episode
if pd.lookup_ids[IdType.IMDB]:
logger.warning("Unexpected IMDB id for TMDB tv season")
elif len(pd.metadata['episode_number_list']) == 0:
logger.warning("Unable to lookup IMDB id for TMDB tv season with zero episodes")
else:
ep = pd.metadata['episode_number_list'][0]
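A season link's `id_value` is the compound `{tv_id}-{season_number}`, which `TMDB_TVSeason` later splits apart again (the `v[0]` read above). The convention spelled out, with a hypothetical helper name; the show id comes from the fixture added in this commit:

def split_season_id(id_value):
    """'86941-1' -> ('86941', '1'): TMDB show id plus season number."""
    show_id, season_number = id_value.split('-')
    return show_id, season_number

assert split_season_id('86941-1') == ('86941', '1')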


@@ -39,14 +39,23 @@ class TVSeason(Item):
douban_movie = PrimaryLookupIdDescriptor(IdType.DoubanMovie)
imdb = PrimaryLookupIdDescriptor(IdType.IMDB)
tmdb_tvseason = PrimaryLookupIdDescriptor(IdType.TMDB_TVSeason)
series = models.ForeignKey(TVShow, null=True, on_delete=models.SET_NULL, related_name='seasons')
show = models.ForeignKey(TVShow, null=True, on_delete=models.SET_NULL, related_name='seasons')
season_number = models.PositiveIntegerField()
episode_count = jsondata.IntegerField(blank=True, default=None)
METADATA_COPY_LIST = ['title', 'brief', 'season_number', 'episode_count']
def update_linked_items_from_extenal_page(self, page):
"""add Work from page.metadata['work'] if not yet"""
links = page.required_pages + page.related_pages
for w in links:
if w['model'] == 'TVShow':
p = ExternalPage.objects.filter(id_type=w['id_type'], id_value=w['id_value']).first()
if p and p.item and self.show != p.item:
self.show = p.item
class TVEpisode(Item):
series = models.ForeignKey(TVShow, null=True, on_delete=models.SET_NULL, related_name='episodes')
show = models.ForeignKey(TVShow, null=True, on_delete=models.SET_NULL, related_name='episodes')
season = models.ForeignKey(TVSeason, null=True, on_delete=models.SET_NULL, related_name='episodes')
episode_number = models.PositiveIntegerField()
imdb = PrimaryLookupIdDescriptor(IdType.IMDB)
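With `series` renamed to `show` and the new hook in place, a scraped season finds its parent through the already-saved TMDB_TV page. A sketch of the resulting relation; lookup values are illustrative, while the IMDB id matches the test expectations below:

season = TVSeason.objects.filter(primary_lookup_id_type=IdType.TMDB_TVSeason,
                                 primary_lookup_id_value='57243-4').first()  # illustrative value
if season and season.show:
    print(season.show.imdb)             # e.g. 'tt0436992' per the tests below
    print(season.show.seasons.count())  # reverse accessor from related_name='seasons'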


@@ -60,6 +60,8 @@ class TMDBTVSeasonTestCase(TestCase):
self.assertEqual(site.page.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.page.item.__class__.__name__, 'TVSeason')
self.assertEqual(site.page.item.imdb, 'tt1159991')
self.assertIsNotNone(site.page.item.show)
self.assertEqual(site.page.item.show.imdb, 'tt0436992')
class DoubanMovieTVTestCase(TestCase):
@@ -68,6 +70,8 @@ class DoubanMovieTVTestCase(TestCase):
url3 = 'https://movie.douban.com/subject/3627919/'
p3 = SiteList.get_site_by_url(url3).get_page_ready()
self.assertEqual(p3.item.__class__.__name__, 'TVSeason')
self.assertIsNotNone(p3.item.show)
self.assertEqual(p3.item.show.imdb, 'tt0436992')
@use_local_response
def test_scrape_singleseason(self):


@@ -0,0 +1 @@
{"movie_results":[],"person_results":[],"tv_results":[{"adult":false,"backdrop_path":"/8IC1q0lHFwi5m8VtChLzIfmpaZH.jpg","id":86941,"name":"北海鲸梦","original_language":"en","original_name":"The North Water","overview":"改编自伊恩·麦奎尔的同名获奖小说聚焦19世纪一次灾难性的捕鲸活动。故事围绕帕特里克·萨姆纳展开他是一名声名狼藉的前战地医生后成为捕鲸船上的医生在船上遇到了鱼叉手亨利·德拉克斯一个残忍、不道德的杀手。萨姆纳没有逃离过去的恐惧而是被迫在北极荒原上为生存而进行残酷的斗争...","poster_path":"/9CM0ca8pX1os3SJ24hsIc0nN8ph.jpg","media_type":"tv","genre_ids":[18,9648],"popularity":11.318,"first_air_date":"2021-07-14","vote_average":7.5,"vote_count":75,"origin_country":["US"]}],"tv_episode_results":[],"tv_season_results":[]}

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long