new data model: rename some classes

Your Name 2022-12-08 16:08:59 +00:00
parent 997e5fad0d
commit 7d04d29613
20 changed files with 164 additions and 164 deletions
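
In short: the scraping layer's "page" vocabulary becomes "resource" throughout the catalog app. ExternalPage is renamed to ExternalResource, PageData to ResourceContent, AbstractSite.get_page()/get_page_ready() to get_resource()/get_resource_ready(), the external_pages relation to external_resources, and the required_pages/related_pages arrays to required_resources/related_resources; the "extenal" typo in merge_data_from_extenal_pages() and update_linked_items_from_extenal_page() is fixed along the way. A minimal before/after sketch of the caller side, assuming a URL recognized by one of the registered site classes:

    # before this commit
    site = SiteList.get_site_by_url(url)
    page = site.get_page_ready()          # scrape, save and auto-link
    print(page.metadata['title'], page.item)

    # after this commit
    site = SiteList.get_site_by_url(url)
    resource = site.get_resource_ready()  # same flow, renamed
    print(resource.metadata['title'], resource.item)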


@@ -13,7 +13,7 @@ only has Edition level ("volume") data
 Douban:
 old editions has only CUBN(Chinese Unified Book Number)
-work data seems asymmetric (a book page links to a work page, but may not listed on that work page as one of the editions)
+work data seems asymmetric (a book links to a work, but may not listed in that work as one of its editions)
 """
@@ -45,9 +45,9 @@ class Edition(Item):
     def isbn10(self, value):
         self.isbn = isbn_10_to_13(value)

-    def update_linked_items_from_extenal_page(self, page):
-        """add Work from page.metadata['work'] if not yet"""
-        links = page.required_pages + page.related_pages
+    def update_linked_items_from_external_resource(self, resource):
+        """add Work from resource.metadata['work'] if not yet"""
+        links = resource.required_resources + resource.related_resources
         for w in links:
             if w['model'] == 'Work':
                 work = Work.objects.filter(primary_lookup_id_type=w['id_type'], primary_lookup_id_value=w['id_value']).first()
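
For context, the link dicts consumed by update_linked_items_from_external_resource() are the required_resources / related_resources entries built by the site scrapers further down in this diff. A sketch of one such entry, with values borrowed from the Goodreads test below (so illustrative, not normative):

    {
        'model': 'Work',
        'id_type': IdType.Goodreads_Work,
        'id_value': '1383900',
        'title': 'Hyperion',
        'url': 'https://www.goodreads.com/work/editions/1383900',
    }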


@@ -71,19 +71,19 @@ class GoodreadsTestCase(TestCase):
         site = SiteList.get_site_by_url(t_url)
         self.assertEqual(site.ready, False)
         self.assertEqual(site.url, t_url2)
-        site.get_page()
+        site.get_resource()
         self.assertEqual(site.ready, False)
-        self.assertIsNotNone(site.page)
-        site.get_page_ready()
+        self.assertIsNotNone(site.resource)
+        site.get_resource_ready()
         self.assertEqual(site.ready, True)
-        self.assertEqual(site.page.metadata.get('title'), 'Hyperion')
-        self.assertEqual(site.page.metadata.get('isbn'), isbn)
-        self.assertEqual(site.page.required_pages[0]['id_value'], '1383900')
+        self.assertEqual(site.resource.metadata.get('title'), 'Hyperion')
+        self.assertEqual(site.resource.metadata.get('isbn'), isbn)
+        self.assertEqual(site.resource.required_resources[0]['id_value'], '1383900')
         edition = Edition.objects.get(primary_lookup_id_type=IdType.ISBN, primary_lookup_id_value=isbn)
-        page = edition.external_pages.all().first()
-        self.assertEqual(page.id_type, IdType.Goodreads)
-        self.assertEqual(page.id_value, '77566')
-        self.assertNotEqual(page.cover, '/media/item/default.svg')
+        resource = edition.external_resources.all().first()
+        self.assertEqual(resource.id_type, IdType.Goodreads)
+        self.assertEqual(resource.id_value, '77566')
+        self.assertNotEqual(resource.cover, '/media/item/default.svg')
         self.assertEqual(edition.isbn, '9780553283686')
         self.assertEqual(edition.title, 'Hyperion')
@@ -91,26 +91,26 @@ class GoodreadsTestCase(TestCase):
         site = SiteList.get_site_by_url(t_url)
         self.assertEqual(site.ready, False)
         self.assertEqual(site.url, t_url2)
-        site.get_page()
-        self.assertEqual(site.ready, True, 'previous page should still exist with data')
+        site.get_resource()
+        self.assertEqual(site.ready, True, 'previous resource should still exist with data')

     @use_local_response
     def test_asin(self):
         t_url = 'https://www.goodreads.com/book/show/45064996-hyperion'
         site = SiteList.get_site_by_url(t_url)
-        site.get_page_ready()
-        self.assertEqual(site.page.item.title, 'Hyperion')
-        self.assertEqual(site.page.item.asin, 'B004G60EHS')
+        site.get_resource_ready()
+        self.assertEqual(site.resource.item.title, 'Hyperion')
+        self.assertEqual(site.resource.item.asin, 'B004G60EHS')

     @use_local_response
     def test_work(self):
         url = 'https://www.goodreads.com/work/editions/153313'
-        p = SiteList.get_site_by_url(url).get_page_ready()
+        p = SiteList.get_site_by_url(url).get_resource_ready()
         self.assertEqual(p.item.title, '1984')
         url1 = 'https://www.goodreads.com/book/show/3597767-rok-1984'
         url2 = 'https://www.goodreads.com/book/show/40961427-1984'
-        p1 = SiteList.get_site_by_url(url1).get_page_ready()
-        p2 = SiteList.get_site_by_url(url2).get_page_ready()
+        p1 = SiteList.get_site_by_url(url1).get_resource_ready()
+        p2 = SiteList.get_site_by_url(url2).get_resource_ready()
         w1 = p1.item.works.all().first()
         w2 = p2.item.works.all().first()
         self.assertEqual(w1, w2)
@@ -137,22 +137,22 @@ class DoubanBookTestCase(TestCase):
         t_url = 'https://book.douban.com/subject/35902899/'
         site = SiteList.get_site_by_url(t_url)
         self.assertEqual(site.ready, False)
-        site.get_page_ready()
+        site.get_resource_ready()
         self.assertEqual(site.ready, True)
-        self.assertEqual(site.page.metadata.get('title'), '1984 Nineteen Eighty-Four')
-        self.assertEqual(site.page.metadata.get('isbn'), '9781847498571')
-        self.assertEqual(site.page.id_type, IdType.DoubanBook)
-        self.assertEqual(site.page.id_value, '35902899')
-        self.assertEqual(site.page.item.isbn, '9781847498571')
-        self.assertEqual(site.page.item.title, '1984 Nineteen Eighty-Four')
+        self.assertEqual(site.resource.metadata.get('title'), '1984 Nineteen Eighty-Four')
+        self.assertEqual(site.resource.metadata.get('isbn'), '9781847498571')
+        self.assertEqual(site.resource.id_type, IdType.DoubanBook)
+        self.assertEqual(site.resource.id_value, '35902899')
+        self.assertEqual(site.resource.item.isbn, '9781847498571')
+        self.assertEqual(site.resource.item.title, '1984 Nineteen Eighty-Four')

     @use_local_response
     def test_work(self):
         # url = 'https://www.goodreads.com/work/editions/153313'
         url1 = 'https://book.douban.com/subject/1089243/'
         url2 = 'https://book.douban.com/subject/2037260/'
-        p1 = SiteList.get_site_by_url(url1).get_page_ready()
-        p2 = SiteList.get_site_by_url(url2).get_page_ready()
+        p1 = SiteList.get_site_by_url(url1).get_resource_ready()
+        p2 = SiteList.get_site_by_url(url2).get_resource_ready()
         w1 = p1.item.works.all().first()
         w2 = p2.item.works.all().first()
         self.assertEqual(w1.title, '黄金时代')
@@ -169,8 +169,8 @@ class MultiBookSitesTestCase(TestCase):
         # isbn = '9781847498571'
         url1 = 'https://www.goodreads.com/book/show/56821625-1984'
         url2 = 'https://book.douban.com/subject/35902899/'
-        p1 = SiteList.get_site_by_url(url1).get_page_ready()
-        p2 = SiteList.get_site_by_url(url2).get_page_ready()
+        p1 = SiteList.get_site_by_url(url1).get_resource_ready()
+        p2 = SiteList.get_site_by_url(url2).get_resource_ready()
         self.assertEqual(p1.item.id, p2.item.id)

     @use_local_response
@@ -180,16 +180,16 @@ class MultiBookSitesTestCase(TestCase):
         url2 = 'https://book.douban.com/subject/2037260/'
         url3 = 'https://www.goodreads.com/book/show/59952545-golden-age'
         url4 = 'https://www.goodreads.com/book/show/11798823'
-        p1 = SiteList.get_site_by_url(url1).get_page_ready() # lxml bug may break this
+        p1 = SiteList.get_site_by_url(url1).get_resource_ready() # lxml bug may break this
         w1 = p1.item.works.all().first()
-        p2 = SiteList.get_site_by_url(url2).get_page_ready()
+        p2 = SiteList.get_site_by_url(url2).get_resource_ready()
         w2 = p2.item.works.all().first()
         self.assertEqual(w1, w2)
         self.assertEqual(p1.item.works.all().count(), 1)
-        p3 = SiteList.get_site_by_url(url3).get_page_ready()
+        p3 = SiteList.get_site_by_url(url3).get_resource_ready()
         w3 = p3.item.works.all().first()
         self.assertNotEqual(w3, w2)
-        p4 = SiteList.get_site_by_url(url4).get_page_ready()
+        p4 = SiteList.get_site_by_url(url4).get_resource_ready()
         self.assertEqual(p4.item.works.all().count(), 2)
         self.assertEqual(p1.item.works.all().count(), 2)
         w2e = w2.editions.all().order_by('title')


@@ -5,4 +5,4 @@ from .scrapers import *
 from . import jsondata

-__all__ = ('IdType', 'Item', 'ExternalPage', 'PageData', 'ParseError', 'ScraperMixin', 'AbstractSite', 'SiteList', 'jsondata', 'PrimaryLookupIdDescriptor', 'LookupIdDescriptor', 'setMockMode', 'getMockMode', 'use_local_response', 'RetryDownloader', 'BasicDownloader', 'ProxiedDownloader', 'BasicImageDownloader', 'RESPONSE_OK', 'RESPONSE_NETWORK_ERROR', 'RESPONSE_INVALID_CONTENT', 'RESPONSE_CENSORSHIP')
+__all__ = ('IdType', 'Item', 'ExternalResource', 'ResourceContent', 'ParseError', 'ScraperMixin', 'AbstractSite', 'SiteList', 'jsondata', 'PrimaryLookupIdDescriptor', 'LookupIdDescriptor', 'setMockMode', 'getMockMode', 'use_local_response', 'RetryDownloader', 'BasicDownloader', 'ProxiedDownloader', 'BasicImageDownloader', 'RESPONSE_OK', 'RESPONSE_NETWORK_ERROR', 'RESPONSE_INVALID_CONTENT', 'RESPONSE_CENSORSHIP')


@@ -179,16 +179,16 @@ class Item(PolymorphicModel):
             # print(ll)
             pass

-    METADATA_COPY_LIST = ['title', 'brief'] # list of metadata keys to copy from page to item
+    METADATA_COPY_LIST = ['title', 'brief'] # list of metadata keys to copy from resource to item

     @classmethod
     def copy_metadata(cls, metadata):
         return dict((k, v) for k, v in metadata.items() if k in cls.METADATA_COPY_LIST and v is not None)

-    def merge_data_from_extenal_pages(self):
+    def merge_data_from_external_resources(self):
         """Subclass may override this"""
         lookup_ids = []
-        for p in self.external_pages.all():
+        for p in self.external_resources.all():
             lookup_ids.append((p.id_type, p.id_value))
             lookup_ids += p.other_lookup_ids.items()
         for k in self.METADATA_COPY_LIST:
@@ -198,7 +198,7 @@ class Item(PolymorphicModel):
             self.cover = p.cover
         self.update_lookup_ids(lookup_ids)

-    def update_linked_items_from_extenal_page(self, page):
+    def update_linked_items_from_external_resource(self, resource):
         """Subclass should override this"""
         pass
@@ -213,19 +213,19 @@ class ItemLookupId(models.Model):
         unique_together = [['id_type', 'id_value']]


-class ExternalPage(models.Model):
-    item = models.ForeignKey(Item, null=True, on_delete=models.SET_NULL, related_name='external_pages')
+class ExternalResource(models.Model):
+    item = models.ForeignKey(Item, null=True, on_delete=models.SET_NULL, related_name='external_resources')
     id_type = models.CharField(_("IdType of the source site"), blank=False, choices=IdType.choices, max_length=50)
     id_value = models.CharField(_("Primary Id on the source site"), blank=False, max_length=1000)
-    url = models.CharField(_("url to the page"), blank=False, max_length=1000, unique=True)
+    url = models.CharField(_("url to the resource"), blank=False, max_length=1000, unique=True)
     cover = models.ImageField(upload_to=item_cover_path, default=DEFAULT_ITEM_COVER, blank=True)
     other_lookup_ids = models.JSONField(default=dict)
     metadata = models.JSONField(default=dict)
     scraped_time = models.DateTimeField(null=True)
     created_time = models.DateTimeField(auto_now_add=True)
     edited_time = models.DateTimeField(auto_now=True)
-    required_pages = jsondata.ArrayField(null=False, blank=False, default=list)
-    related_pages = jsondata.ArrayField(null=False, blank=False, default=list)
+    required_resources = jsondata.ArrayField(null=False, blank=False, default=list)
+    related_resources = jsondata.ArrayField(null=False, blank=False, default=list)

     class Meta:
         unique_together = [['id_type', 'id_value']]
@@ -233,11 +233,11 @@ class ExternalPage(models.Model):
     def __str__(self):
         return f"{self.id}{':' + self.id_type + ':' + self.id_value if self.id_value else ''} ({self.url})"

-    def update_content(self, page_data):
-        self.other_lookup_ids = page_data.lookup_ids
-        self.metadata = page_data.metadata
-        if page_data.cover_image and page_data.cover_image_extention:
-            self.cover = SimpleUploadedFile('temp.' + page_data.cover_image_extention, page_data.cover_image)
+    def update_content(self, resource_content):
+        self.other_lookup_ids = resource_content.lookup_ids
+        self.metadata = resource_content.metadata
+        if resource_content.cover_image and resource_content.cover_image_extention:
+            self.cover = SimpleUploadedFile('temp.' + resource_content.cover_image_extention, resource_content.cover_image)
         self.scraped_time = timezone.now()
         self.save()
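
To illustrate the renamed flow end to end: a scraper produces a ResourceContent, and update_content() copies it onto the ExternalResource row. A hedged sketch (site stands for any AbstractSite instance; the values are invented):

    content = ResourceContent(metadata={'title': 'Hyperion'})
    content.lookup_ids[IdType.ISBN] = '9780553283686'
    resource = site.get_resource()      # fetch or create the ExternalResource for site.url
    resource.update_content(content)    # stores lookup ids, metadata and cover, stamps scraped_time, saves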


@@ -1,6 +1,6 @@
 from typing import *
 import re
-from .models import ExternalPage
+from .models import ExternalResource
 from dataclasses import dataclass, field
 import logging
@@ -9,7 +9,7 @@ logger = logging.getLogger(__name__)

 @dataclass
-class PageData:
+class ResourceContent:
     lookup_ids: dict = field(default_factory=dict)
     metadata: dict = field(default_factory=dict)
     cover_image = None
@@ -45,28 +45,28 @@ class AbstractSite:
     def __init__(self, url=None):
         self.id_value = self.url_to_id(url) if url else None
         self.url = self.id_to_url(self.id_value) if url else None
-        self.page = None
+        self.resource = None

-    def get_page(self):
-        if not self.page:
-            self.page = ExternalPage.objects.filter(url=self.url).first()
-            if self.page is None:
-                self.page = ExternalPage(id_type=self.ID_TYPE, id_value=self.id_value, url=self.url)
-        return self.page
+    def get_resource(self):
+        if not self.resource:
+            self.resource = ExternalResource.objects.filter(url=self.url).first()
+            if self.resource is None:
+                self.resource = ExternalResource(id_type=self.ID_TYPE, id_value=self.id_value, url=self.url)
+        return self.resource

-    def bypass_scrape(self, data_from_link) -> PageData | None:
-        """subclass may implement this to use data from linked page and bypass actual scrape"""
+    def bypass_scrape(self, data_from_link) -> ResourceContent | None:
+        """subclass may implement this to use data from linked resource and bypass actual scrape"""
         return None

-    def scrape(self) -> PageData:
-        """subclass should implement this, return PageData object"""
-        data = PageData()
+    def scrape(self) -> ResourceContent:
+        """subclass should implement this, return ResourceContent object"""
+        data = ResourceContent()
         return data

     def get_item(self):
-        p = self.get_page()
+        p = self.get_resource()
         if not p:
-            raise ValueError(f'page not available for {self.url}')
+            raise ValueError(f'resource not available for {self.url}')
         model = p.get_preferred_model()
         if not model:
             model = self.DEFAULT_MODEL
@@ -82,41 +82,41 @@ class AbstractSite:
     @property
     def ready(self):
-        return bool(self.page and self.page.ready)
+        return bool(self.resource and self.resource.ready)

-    def get_page_ready(self, auto_save=True, auto_create=True, auto_link=True, data_from_link=None):
-        """return a page scraped, or scrape if not yet"""
+    def get_resource_ready(self, auto_save=True, auto_create=True, auto_link=True, data_from_link=None):
+        """return a resource scraped, or scrape if not yet"""
         if auto_link:
             auto_create = True
         if auto_create:
             auto_save = True
-        p = self.get_page()
-        pagedata = {}
-        if not self.page:
+        p = self.get_resource()
+        resource_content = {}
+        if not self.resource:
             return None
         if not p.ready:
-            pagedata = self.bypass_scrape(data_from_link)
-            if not pagedata:
-                pagedata = self.scrape()
-            p.update_content(pagedata)
+            resource_content = self.bypass_scrape(data_from_link)
+            if not resource_content:
+                resource_content = self.scrape()
+            p.update_content(resource_content)
         if not p.ready:
-            logger.error(f'unable to get page {self.url} ready')
+            logger.error(f'unable to get resource {self.url} ready')
             return None
         if auto_create and p.item is None:
             self.get_item()
         if auto_save:
             p.save()
             if p.item:
-                p.item.merge_data_from_extenal_pages()
+                p.item.merge_data_from_external_resources()
                 p.item.save()
                 if auto_link:
-                    for linked_pages in p.required_pages:
-                        linked_site = SiteList.get_site_by_url(linked_pages['url'])
+                    for linked_resources in p.required_resources:
+                        linked_site = SiteList.get_site_by_url(linked_resources['url'])
                         if linked_site:
-                            linked_site.get_page_ready(auto_link=False)
+                            linked_site.get_resource_ready(auto_link=False)
                         else:
-                            logger.error(f'unable to get site for {linked_pages["url"]}')
-                    p.item.update_linked_items_from_extenal_page(p)
+                            logger.error(f'unable to get site for {linked_resources["url"]}')
+                    p.item.update_linked_items_from_external_resource(p)
                     p.item.save()
         return p
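
Under the new names, a minimal concrete site only has to return a ResourceContent from scrape(). A sketch, with the class name and metadata hypothetical (ID_TYPE reuses an existing IdType member as a stand-in; a real site declares its own):

    class MySite(AbstractSite):
        ID_TYPE = IdType.Goodreads  # stand-in for illustration only

        def scrape(self) -> ResourceContent:
            pd = ResourceContent(metadata={'title': 'Some Title'})
            pd.lookup_ids[IdType.ISBN] = '9780000000000'  # optional extra lookup ids
            return pd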


@@ -14,9 +14,9 @@ logger = logging.getLogger(__name__)
 DEFAULT_ITEM_COVER = 'item/default.svg'


-def item_cover_path(page, filename):
+def item_cover_path(resource, filename):
     fn = timezone.now().strftime('%Y/%m/%d/') + str(uuid.uuid4()) + '.' + filename.split('.')[-1]
-    return 'items/' + page.id_type + '/' + fn
+    return 'items/' + resource.id_type + '/' + fn


 TestDataDir = str(Path(__file__).parent.parent.parent.absolute()) + '/test_data/'
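
As a worked example of item_cover_path(): a cover scraped on the commit date would land at roughly items/<id_type>/2022/12/08/<uuid>.<ext>, where <id_type> is the resource's IdType string (exact enum values are not shown in this diff) and <ext> is taken from the original filename.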


@@ -5,7 +5,7 @@ from catalog.sites import *

 class Command(BaseCommand):
-    help = 'Scrape a catalog item from external page (but not save it)'
+    help = 'Scrape a catalog item from external resource (but not save it)'

     def add_arguments(self, parser):
         parser.add_argument('url', type=str, help='URL to scrape')
@@ -17,6 +17,6 @@ class Command(BaseCommand):
             self.stdout.write(self.style.ERROR(f'Unknown site for {url}'))
             return
         self.stdout.write(f'Fetching from {site}')
-        page = site.get_page_ready(auto_link=False, auto_save=False)
+        resource = site.get_resource_ready(auto_link=False, auto_save=False)
         self.stdout.write(self.style.SUCCESS(f'Done.'))
-        pprint.pp(page.metadata)
+        pprint.pp(resource.metadata)


@@ -19,11 +19,11 @@ class DoubanMovieTestCase(TestCase):
         site = SiteList.get_site_by_url(t_url)
         self.assertEqual(site.ready, False)
         self.assertEqual(site.id_value, '3541415')
-        site.get_page_ready()
-        self.assertEqual(site.page.metadata['title'], '盗梦空间')
-        self.assertEqual(site.page.item.primary_lookup_id_type, IdType.IMDB)
-        self.assertEqual(site.page.item.__class__.__name__, 'Movie')
-        self.assertEqual(site.page.item.imdb, 'tt1375666')
+        site.get_resource_ready()
+        self.assertEqual(site.resource.metadata['title'], '盗梦空间')
+        self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
+        self.assertEqual(site.resource.item.__class__.__name__, 'Movie')
+        self.assertEqual(site.resource.item.imdb, 'tt1375666')


 class TMDBMovieTestCase(TestCase):
@@ -45,11 +45,11 @@ class TMDBMovieTestCase(TestCase):
         site = SiteList.get_site_by_url(t_url)
         self.assertEqual(site.ready, False)
         self.assertEqual(site.id_value, '293767')
-        site.get_page_ready()
-        self.assertEqual(site.page.metadata['title'], '比利·林恩的中场战事')
-        self.assertEqual(site.page.item.primary_lookup_id_type, IdType.IMDB)
-        self.assertEqual(site.page.item.__class__.__name__, 'Movie')
-        self.assertEqual(site.page.item.imdb, 'tt2513074')
+        site.get_resource_ready()
+        self.assertEqual(site.resource.metadata['title'], '比利·林恩的中场战事')
+        self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
+        self.assertEqual(site.resource.item.__class__.__name__, 'Movie')
+        self.assertEqual(site.resource.item.imdb, 'tt2513074')


 class IMDBMovieTestCase(TestCase):
@@ -71,10 +71,10 @@ class IMDBMovieTestCase(TestCase):
         site = SiteList.get_site_by_url(t_url)
         self.assertEqual(site.ready, False)
         self.assertEqual(site.id_value, 'tt1375666')
-        site.get_page_ready()
-        self.assertEqual(site.page.metadata['title'], '盗梦空间')
-        self.assertEqual(site.page.item.primary_lookup_id_type, IdType.IMDB)
-        self.assertEqual(site.page.item.imdb, 'tt1375666')
+        site.get_resource_ready()
+        self.assertEqual(site.resource.metadata['title'], '盗梦空间')
+        self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
+        self.assertEqual(site.resource.item.imdb, 'tt1375666')


 class MultiMovieSitesTestCase(TestCase):
@@ -83,8 +83,8 @@ class MultiMovieSitesTestCase(TestCase):
         url1 = 'https://www.themoviedb.org/movie/27205-inception'
         url2 = 'https://movie.douban.com/subject/3541415/'
         url3 = 'https://www.imdb.com/title/tt1375666/'
-        p1 = SiteList.get_site_by_url(url1).get_page_ready()
-        p2 = SiteList.get_site_by_url(url2).get_page_ready()
-        p3 = SiteList.get_site_by_url(url3).get_page_ready()
+        p1 = SiteList.get_site_by_url(url1).get_resource_ready()
+        p2 = SiteList.get_site_by_url(url2).get_resource_ready()
+        p3 = SiteList.get_site_by_url(url3).get_resource_ready()
         self.assertEqual(p1.item.id, p2.item.id)
         self.assertEqual(p2.item.id, p3.item.id)


@@ -20,8 +20,8 @@ class SpotifyTestCase(TestCase):
         t_url = 'https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP'
         site = SiteList.get_site_by_url(t_url)
         self.assertEqual(site.ready, False)
-        site.get_page_ready()
+        site.get_resource_ready()
         self.assertEqual(site.ready, True)
-        self.assertEqual(site.page.metadata['title'], 'The Race For Space')
-        self.assertIsInstance(site.page.item, Album)
-        self.assertEqual(site.page.item.barcode, '3610159662676')
+        self.assertEqual(site.resource.metadata['title'], 'The Race For Space')
+        self.assertIsInstance(site.resource.item, Album)
+        self.assertEqual(site.resource.item.barcode, '3610159662676')


@@ -22,9 +22,9 @@ class DoubanDramaTestCase(TestCase):
         t_url = 'https://www.douban.com/location/drama/24849279/'
         site = SiteList.get_site_by_url(t_url)
         self.assertEqual(site.ready, False)
-        page = site.get_page_ready()
+        resource = site.get_resource_ready()
         self.assertEqual(site.ready, True)
-        self.assertEqual(page.metadata['title'], '红花侠')
+        self.assertEqual(resource.metadata['title'], '红花侠')
         item = site.get_item()
         self.assertEqual(item.title, '红花侠')


@@ -24,7 +24,7 @@ class ApplePodcastTestCase(TestCase):
         site = SiteList.get_site_by_url(t_url)
         self.assertEqual(site.ready, False)
         self.assertEqual(site.id_value, '1050430296')
-        site.get_page_ready()
-        self.assertEqual(site.page.metadata['title'], 'The New Yorker Radio Hour')
-        # self.assertEqual(site.page.metadata['feed_url'], 'http://feeds.wnyc.org/newyorkerradiohour')
-        self.assertEqual(site.page.metadata['feed_url'], 'http://feeds.feedburner.com/newyorkerradiohour')
+        site.get_resource_ready()
+        self.assertEqual(site.resource.metadata['title'], 'The New Yorker Radio Hour')
+        # self.assertEqual(site.resource.metadata['feed_url'], 'http://feeds.wnyc.org/newyorkerradiohour')
+        self.assertEqual(site.resource.metadata['feed_url'], 'http://feeds.feedburner.com/newyorkerradiohour')


@@ -22,7 +22,7 @@ class ApplePodcast(AbstractSite):
         dl = BasicDownloader(api_url)
         resp = dl.download()
         r = resp.json()['results'][0]
-        pd = PageData(metadata={
+        pd = ResourceContent(metadata={
             'title': r['trackName'],
             'feed_url': r['feedUrl'],
             'hosts': [r['artistName']],


@@ -111,14 +111,14 @@ class DoubanBook(AbstractSite, ScraperMixin):
         work_link = self.parse_str('//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href')
         if work_link:
             r = re.match(r'\w+://book.douban.com/works/(\d+)', work_link)
-            self.data['required_pages'] = [{
+            self.data['required_resources'] = [{
                 'model': 'Work',
                 'id_type': IdType.DoubanBook_Work,
                 'id_value': r[1] if r else None,
                 'title': self.data['title'],
                 'url': work_link,
             }]
-        pd = PageData(metadata=self.data)
+        pd = ResourceContent(metadata=self.data)
         pd.lookup_ids[IdType.ISBN] = self.data.get('isbn')
         pd.lookup_ids[IdType.CUBN] = self.data.get('cubn')
         if self.data["cover_image_url"]:
@@ -145,7 +145,7 @@ class DoubanBook_Work(AbstractSite):
     def bypass_scrape(self, data_from_link):
         if not data_from_link:
             return None
-        pd = PageData(metadata={
+        pd = ResourceContent(metadata={
             'title': data_from_link['title'],
         })
         return pd
@@ -156,7 +156,7 @@ class DoubanBook_Work(AbstractSite):
         title = title_elem[0].split('全部版本(')[0].strip() if title_elem else None
         if not title:
             raise ParseError(self, 'title')
-        pd = PageData(metadata={
+        pd = ResourceContent(metadata={
             'title': title,
         })
         return pd
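
Note the hand-off here: DoubanBook_Work.bypass_scrape() accepts a dict shaped like the required_resources entry that DoubanBook builds above, so a linked Work can be materialized from the link's title without fetching the work page. A hedged sketch of that round trip (id_value and URL invented for illustration):

    link = {'model': 'Work', 'id_type': IdType.DoubanBook_Work,
            'id_value': '1010888', 'title': '黄金时代', 'url': 'https://book.douban.com/works/1010888'}
    pd = DoubanBook_Work(link['url']).bypass_scrape(link)
    # pd is a ResourceContent with metadata {'title': '黄金时代'}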


@@ -48,7 +48,7 @@ class DoubanDrama(AbstractSite):
         img_url_elem = h.xpath("//img[@itemprop='image']/@src")
         data['cover_image_url'] = img_url_elem[0].strip() if img_url_elem else None

-        pd = PageData(metadata=data)
+        pd = ResourceContent(metadata=data)
         if pd.metadata["cover_image_url"]:
             imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
             try:


@@ -215,7 +215,7 @@ class DoubanMovie(AbstractSite):
         img_url_elem = content.xpath("//img[@rel='v:image']/@src")
         img_url = img_url_elem[0].strip() if img_url_elem else None

-        pd = PageData(metadata={
+        pd = ResourceContent(metadata={
             'title': title,
             'orig_title': orig_title,
             'other_title': other_title,
@@ -257,7 +257,7 @@ class DoubanMovie(AbstractSite):
             # TODO correct the IMDB id
             pd.lookup_ids[IdType.IMDB] = imdb_code
         if tmdb_show_id:
-            pd.metadata['required_pages'] = [{
+            pd.metadata['required_resources'] = [{
                 'model': 'TVShow',
                 'id_type': IdType.TMDB_TV,
                 'id_value': tmdb_show_id,
@@ -265,7 +265,7 @@ class DoubanMovie(AbstractSite):
                 'url': TMDB_TV.id_to_url(tmdb_show_id),
             }]
         # TODO parse sister seasons
-        # pd.metadata['related_pages'] = []
+        # pd.metadata['related_resources'] = []
         if pd.metadata["cover_image_url"]:
             imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
             try:


@@ -66,14 +66,14 @@ class Goodreads(AbstractSite):
             data['cover_image_url'] = b['imageUrl']
         w = next(filter(lambda x: x.get('details'), o['Work']), None)
         if w:
-            data['required_pages'] = [{
+            data['required_resources'] = [{
                 'model': 'Work',
                 'id_type': IdType.Goodreads_Work,
                 'id_value': str(w['legacyId']),
                 'title': w['details']['originalTitle'],
                 'url': w['editions']['webUrl'],
             }]
-        pd = PageData(metadata=data)
+        pd = ResourceContent(metadata=data)
         pd.lookup_ids[IdType.ISBN] = data.get('isbn')
         pd.lookup_ids[IdType.ASIN] = data.get('asin')
         if data["cover_image_url"]:
@@ -107,7 +107,7 @@ class Goodreads_Work(AbstractSite):
         author = author_elem[0].strip() if author_elem else None
         first_published_elem = content.xpath("//h2/span/text()")
         first_published = first_published_elem[0].strip() if first_published_elem else None
-        pd = PageData(metadata={
+        pd = ResourceContent(metadata={
             'title': title,
             'author': author,
             'first_published': first_published


@@ -74,7 +74,7 @@ class Spotify(AbstractSite):
             # isrc = res_data['external_ids'].get('isrc')
             # _logger.error('isrc for album? this should not happen')

-        pd = PageData(metadata={
+        pd = ResourceContent(metadata={
             'title': title,
             'artist': artist,
             'genre': genre,


@@ -126,7 +126,7 @@ class TMDB_Movie(AbstractSite):
         # TODO: use GET /configuration to get base url
         img_url = ('https://image.tmdb.org/t/p/original/' + res_data['poster_path']) if res_data['poster_path'] is not None else None

-        pd = PageData(metadata={
+        pd = ResourceContent(metadata={
             'title': title,
             'orig_title': orig_title,
             'other_title': None,
@@ -233,7 +233,7 @@ class TMDB_TV(AbstractSite):
             'id_value': f'{self.id_value}-{s["season_number"]}',
             'title': s['name'],
             'url': f'{self.url}/season/{s["season_number"]}'}, res_data['seasons']))
-        pd = PageData(metadata={
+        pd = ResourceContent(metadata={
             'title': title,
             'orig_title': orig_title,
             'other_title': None,
@@ -253,7 +253,7 @@ class TMDB_TV(AbstractSite):
             'single_episode_length': None,
             'brief': brief,
             'cover_image_url': img_url,
-            'related_pages': season_links,
+            'related_resources': season_links,
         })
         if imdb_code:
             pd.lookup_ids[IdType.IMDB] = imdb_code
@@ -292,8 +292,8 @@ class TMDB_TVSeason(AbstractSite):
         d = BasicDownloader(api_url).download().json()
         if not d.get('id'):
             raise ParseError('id')
-        pd = PageData(metadata=_copy_dict(d, {'name': 'title', 'overview': 'brief', 'air_date': 'air_date', 'season_number': 0, 'external_ids': 0}))
-        pd.metadata['required_pages'] = [{
+        pd = ResourceContent(metadata=_copy_dict(d, {'name': 'title', 'overview': 'brief', 'air_date': 'air_date', 'season_number': 0, 'external_ids': 0}))
+        pd.metadata['required_resources'] = [{
             'model': 'TVShow',
             'id_type': IdType.TMDB_TV,
             'id_value': v[0],


@@ -44,12 +44,12 @@ class TVSeason(Item):
     episode_count = jsondata.IntegerField(blank=True, default=None)
     METADATA_COPY_LIST = ['title', 'brief', 'season_number', 'episode_count']

-    def update_linked_items_from_extenal_page(self, page):
-        """add Work from page.metadata['work'] if not yet"""
-        links = page.required_pages + page.related_pages
+    def update_linked_items_from_external_resource(self, resource):
+        """add Work from resource.metadata['work'] if not yet"""
+        links = resource.required_resources + resource.related_resources
         for w in links:
             if w['model'] == 'TVShow':
-                p = ExternalPage.objects.filter(id_type=w['id_type'], id_value=w['id_value']).first()
+                p = ExternalResource.objects.filter(id_type=w['id_type'], id_value=w['id_value']).first()
                 if p and p.item and self.show != p.item:
                     self.show = p.item


@@ -27,12 +27,12 @@ class TMDBTVTestCase(TestCase):
         site = SiteList.get_site_by_url(t_url)
         self.assertEqual(site.ready, False)
         self.assertEqual(site.id_value, '57243')
-        site.get_page_ready()
+        site.get_resource_ready()
         self.assertEqual(site.ready, True)
-        self.assertEqual(site.page.metadata['title'], '神秘博士')
-        self.assertEqual(site.page.item.primary_lookup_id_type, IdType.IMDB)
-        self.assertEqual(site.page.item.__class__.__name__, 'TVShow')
-        self.assertEqual(site.page.item.imdb, 'tt0436992')
+        self.assertEqual(site.resource.metadata['title'], '神秘博士')
+        self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
+        self.assertEqual(site.resource.item.__class__.__name__, 'TVShow')
+        self.assertEqual(site.resource.item.imdb, 'tt0436992')


 class TMDBTVSeasonTestCase(TestCase):
@@ -54,21 +54,21 @@ class TMDBTVSeasonTestCase(TestCase):
         site = SiteList.get_site_by_url(t_url)
         self.assertEqual(site.ready, False)
         self.assertEqual(site.id_value, '57243-4')
-        site.get_page_ready()
+        site.get_resource_ready()
         self.assertEqual(site.ready, True)
-        self.assertEqual(site.page.metadata['title'], '第 4 季')
-        self.assertEqual(site.page.item.primary_lookup_id_type, IdType.IMDB)
-        self.assertEqual(site.page.item.__class__.__name__, 'TVSeason')
-        self.assertEqual(site.page.item.imdb, 'tt1159991')
-        self.assertIsNotNone(site.page.item.show)
-        self.assertEqual(site.page.item.show.imdb, 'tt0436992')
+        self.assertEqual(site.resource.metadata['title'], '第 4 季')
+        self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
+        self.assertEqual(site.resource.item.__class__.__name__, 'TVSeason')
+        self.assertEqual(site.resource.item.imdb, 'tt1159991')
+        self.assertIsNotNone(site.resource.item.show)
+        self.assertEqual(site.resource.item.show.imdb, 'tt0436992')


 class DoubanMovieTVTestCase(TestCase):
     @use_local_response
     def test_scrape(self):
         url3 = 'https://movie.douban.com/subject/3627919/'
-        p3 = SiteList.get_site_by_url(url3).get_page_ready()
+        p3 = SiteList.get_site_by_url(url3).get_resource_ready()
         self.assertEqual(p3.item.__class__.__name__, 'TVSeason')
         self.assertIsNotNone(p3.item.show)
         self.assertEqual(p3.item.show.imdb, 'tt0436992')
@@ -76,7 +76,7 @@ class DoubanMovieTVTestCase(TestCase):
     @use_local_response
     def test_scrape_singleseason(self):
         url3 = 'https://movie.douban.com/subject/26895436/'
-        p3 = SiteList.get_site_by_url(url3).get_page_ready()
+        p3 = SiteList.get_site_by_url(url3).get_resource_ready()
         self.assertEqual(p3.item.__class__.__name__, 'TVShow')
@@ -86,9 +86,9 @@ class MultiTVSitesTestCase(TestCase):
         url1 = 'https://www.themoviedb.org/tv/57243-doctor-who'
         url2 = 'https://www.imdb.com/title/tt0436992/'
         # url3 = 'https://movie.douban.com/subject/3541415/'
-        p1 = SiteList.get_site_by_url(url1).get_page_ready()
-        p2 = SiteList.get_site_by_url(url2).get_page_ready()
-        # p3 = SiteList.get_site_by_url(url3).get_page_ready()
+        p1 = SiteList.get_site_by_url(url1).get_resource_ready()
+        p2 = SiteList.get_site_by_url(url2).get_resource_ready()
+        # p3 = SiteList.get_site_by_url(url3).get_resource_ready()
         self.assertEqual(p1.item.id, p2.item.id)
         # self.assertEqual(p2.item.id, p3.item.id)
@@ -97,9 +97,9 @@ class MultiTVSitesTestCase(TestCase):
         url1 = 'https://www.themoviedb.org/tv/57243-doctor-who/season/4'
         url2 = 'https://www.imdb.com/title/tt1159991/'
         url3 = 'https://movie.douban.com/subject/3627919/'
-        p1 = SiteList.get_site_by_url(url1).get_page_ready()
-        p2 = SiteList.get_site_by_url(url2).get_page_ready()
-        p3 = SiteList.get_site_by_url(url3).get_page_ready()
+        p1 = SiteList.get_site_by_url(url1).get_resource_ready()
+        p2 = SiteList.get_site_by_url(url2).get_resource_ready()
+        p3 = SiteList.get_site_by_url(url3).get_resource_ready()
         self.assertEqual(p1.item.imdb, p2.item.imdb)
         self.assertEqual(p2.item.imdb, p3.item.imdb)
         self.assertEqual(p1.item.id, p2.item.id)
@@ -109,8 +109,8 @@ class MultiTVSitesTestCase(TestCase):
     def test_miniseries(self):
         url1 = 'https://www.themoviedb.org/tv/86941-the-north-water'
         url3 = 'https://movie.douban.com/subject/26895436/'
-        p1 = SiteList.get_site_by_url(url1).get_page_ready()
-        p3 = SiteList.get_site_by_url(url3).get_page_ready()
+        p1 = SiteList.get_site_by_url(url1).get_resource_ready()
+        p3 = SiteList.get_site_by_url(url3).get_resource_ready()
         self.assertEqual(p3.item.__class__.__name__, 'TVShow')
         self.assertEqual(p1.item.id, p3.item.id)
@@ -119,9 +119,9 @@ class MultiTVSitesTestCase(TestCase):
         url1 = 'https://www.themoviedb.org/movie/282758-doctor-who-the-runaway-bride'
         url2 = 'hhttps://www.imdb.com/title/tt0827573/'
         url3 = 'https://movie.douban.com/subject/4296866/'
-        p1 = SiteList.get_site_by_url(url1).get_page_ready()
-        p2 = SiteList.get_site_by_url(url2).get_page_ready()
-        p3 = SiteList.get_site_by_url(url3).get_page_ready()
+        p1 = SiteList.get_site_by_url(url1).get_resource_ready()
+        p2 = SiteList.get_site_by_url(url2).get_resource_ready()
+        p3 = SiteList.get_site_by_url(url3).get_resource_ready()
         self.assertEqual(p1.item.imdb, p2.item.imdb)
         self.assertEqual(p2.item.imdb, p3.item.imdb)
         self.assertEqual(p1.item.id, p2.item.id)