new data model: rename some classes
commit 7d04d29613 (parent 997e5fad0d)
20 changed files with 164 additions and 164 deletions
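A minimal sketch of the renamed API surface (get_page_ready → get_resource_ready, ExternalPage → ExternalResource, PageData → ResourceContent), mirroring the Goodreads test in this commit; the import paths are assumptions about the project layout and Django must already be configured:

# Sketch only: exercises the renamed methods the same way the tests below do.
# Requires the project's Django settings and local test responses to be set up.
from catalog.common import SiteList, IdType   # assumed module path
from catalog.book.models import Edition       # assumed module path

site = SiteList.get_site_by_url('https://www.goodreads.com/book/show/77566-hyperion')
resource = site.get_resource_ready()           # previously: site.get_page_ready()
print(resource.metadata.get('title'))          # 'Hyperion'
edition = Edition.objects.get(primary_lookup_id_type=IdType.ISBN,
                              primary_lookup_id_value=resource.metadata.get('isbn'))
print(edition.external_resources.all().first().id_value)   # previously: external_pages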
@@ -13,7 +13,7 @@ only has Edition level ("volume") data
Douban:
old editions has only CUBN(Chinese Unified Book Number)
work data seems asymmetric (a book page links to a work page, but may not listed on that work page as one of the editions)
work data seems asymmetric (a book links to a work, but may not listed in that work as one of its editions)

"""

@@ -45,9 +45,9 @@ class Edition(Item):
def isbn10(self, value):
self.isbn = isbn_10_to_13(value)

def update_linked_items_from_extenal_page(self, page):
"""add Work from page.metadata['work'] if not yet"""
links = page.required_pages + page.related_pages
def update_linked_items_from_external_resource(self, resource):
"""add Work from resource.metadata['work'] if not yet"""
links = resource.required_resources + resource.related_resources
for w in links:
if w['model'] == 'Work':
work = Work.objects.filter(primary_lookup_id_type=w['id_type'], primary_lookup_id_value=w['id_value']).first()

@@ -71,19 +71,19 @@ class GoodreadsTestCase(TestCase):
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.url, t_url2)
site.get_page()
site.get_resource()
self.assertEqual(site.ready, False)
self.assertIsNotNone(site.page)
site.get_page_ready()
self.assertIsNotNone(site.resource)
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.page.metadata.get('title'), 'Hyperion')
self.assertEqual(site.page.metadata.get('isbn'), isbn)
self.assertEqual(site.page.required_pages[0]['id_value'], '1383900')
self.assertEqual(site.resource.metadata.get('title'), 'Hyperion')
self.assertEqual(site.resource.metadata.get('isbn'), isbn)
self.assertEqual(site.resource.required_resources[0]['id_value'], '1383900')
edition = Edition.objects.get(primary_lookup_id_type=IdType.ISBN, primary_lookup_id_value=isbn)
page = edition.external_pages.all().first()
self.assertEqual(page.id_type, IdType.Goodreads)
self.assertEqual(page.id_value, '77566')
self.assertNotEqual(page.cover, '/media/item/default.svg')
resource = edition.external_resources.all().first()
self.assertEqual(resource.id_type, IdType.Goodreads)
self.assertEqual(resource.id_value, '77566')
self.assertNotEqual(resource.cover, '/media/item/default.svg')
self.assertEqual(edition.isbn, '9780553283686')
self.assertEqual(edition.title, 'Hyperion')

@@ -91,26 +91,26 @@ class GoodreadsTestCase(TestCase):
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.url, t_url2)
site.get_page()
self.assertEqual(site.ready, True, 'previous page should still exist with data')
site.get_resource()
self.assertEqual(site.ready, True, 'previous resource should still exist with data')

@use_local_response
def test_asin(self):
t_url = 'https://www.goodreads.com/book/show/45064996-hyperion'
site = SiteList.get_site_by_url(t_url)
site.get_page_ready()
self.assertEqual(site.page.item.title, 'Hyperion')
self.assertEqual(site.page.item.asin, 'B004G60EHS')
site.get_resource_ready()
self.assertEqual(site.resource.item.title, 'Hyperion')
self.assertEqual(site.resource.item.asin, 'B004G60EHS')

@use_local_response
def test_work(self):
url = 'https://www.goodreads.com/work/editions/153313'
p = SiteList.get_site_by_url(url).get_page_ready()
p = SiteList.get_site_by_url(url).get_resource_ready()
self.assertEqual(p.item.title, '1984')
url1 = 'https://www.goodreads.com/book/show/3597767-rok-1984'
url2 = 'https://www.goodreads.com/book/show/40961427-1984'
p1 = SiteList.get_site_by_url(url1).get_page_ready()
p2 = SiteList.get_site_by_url(url2).get_page_ready()
p1 = SiteList.get_site_by_url(url1).get_resource_ready()
p2 = SiteList.get_site_by_url(url2).get_resource_ready()
w1 = p1.item.works.all().first()
w2 = p2.item.works.all().first()
self.assertEqual(w1, w2)

@@ -137,22 +137,22 @@ class DoubanBookTestCase(TestCase):
t_url = 'https://book.douban.com/subject/35902899/'
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
site.get_page_ready()
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.page.metadata.get('title'), '1984 Nineteen Eighty-Four')
self.assertEqual(site.page.metadata.get('isbn'), '9781847498571')
self.assertEqual(site.page.id_type, IdType.DoubanBook)
self.assertEqual(site.page.id_value, '35902899')
self.assertEqual(site.page.item.isbn, '9781847498571')
self.assertEqual(site.page.item.title, '1984 Nineteen Eighty-Four')
self.assertEqual(site.resource.metadata.get('title'), '1984 Nineteen Eighty-Four')
self.assertEqual(site.resource.metadata.get('isbn'), '9781847498571')
self.assertEqual(site.resource.id_type, IdType.DoubanBook)
self.assertEqual(site.resource.id_value, '35902899')
self.assertEqual(site.resource.item.isbn, '9781847498571')
self.assertEqual(site.resource.item.title, '1984 Nineteen Eighty-Four')

@use_local_response
def test_work(self):
# url = 'https://www.goodreads.com/work/editions/153313'
url1 = 'https://book.douban.com/subject/1089243/'
url2 = 'https://book.douban.com/subject/2037260/'
p1 = SiteList.get_site_by_url(url1).get_page_ready()
p2 = SiteList.get_site_by_url(url2).get_page_ready()
p1 = SiteList.get_site_by_url(url1).get_resource_ready()
p2 = SiteList.get_site_by_url(url2).get_resource_ready()
w1 = p1.item.works.all().first()
w2 = p2.item.works.all().first()
self.assertEqual(w1.title, '黄金时代')

@@ -169,8 +169,8 @@ class MultiBookSitesTestCase(TestCase):
# isbn = '9781847498571'
url1 = 'https://www.goodreads.com/book/show/56821625-1984'
url2 = 'https://book.douban.com/subject/35902899/'
p1 = SiteList.get_site_by_url(url1).get_page_ready()
p2 = SiteList.get_site_by_url(url2).get_page_ready()
p1 = SiteList.get_site_by_url(url1).get_resource_ready()
p2 = SiteList.get_site_by_url(url2).get_resource_ready()
self.assertEqual(p1.item.id, p2.item.id)

@use_local_response

@@ -180,16 +180,16 @@ class MultiBookSitesTestCase(TestCase):
url2 = 'https://book.douban.com/subject/2037260/'
url3 = 'https://www.goodreads.com/book/show/59952545-golden-age'
url4 = 'https://www.goodreads.com/book/show/11798823'
p1 = SiteList.get_site_by_url(url1).get_page_ready() # lxml bug may break this
p1 = SiteList.get_site_by_url(url1).get_resource_ready() # lxml bug may break this
w1 = p1.item.works.all().first()
p2 = SiteList.get_site_by_url(url2).get_page_ready()
p2 = SiteList.get_site_by_url(url2).get_resource_ready()
w2 = p2.item.works.all().first()
self.assertEqual(w1, w2)
self.assertEqual(p1.item.works.all().count(), 1)
p3 = SiteList.get_site_by_url(url3).get_page_ready()
p3 = SiteList.get_site_by_url(url3).get_resource_ready()
w3 = p3.item.works.all().first()
self.assertNotEqual(w3, w2)
p4 = SiteList.get_site_by_url(url4).get_page_ready()
p4 = SiteList.get_site_by_url(url4).get_resource_ready()
self.assertEqual(p4.item.works.all().count(), 2)
self.assertEqual(p1.item.works.all().count(), 2)
w2e = w2.editions.all().order_by('title')

@@ -5,4 +5,4 @@ from .scrapers import *
from . import jsondata

__all__ = ('IdType', 'Item', 'ExternalPage', 'PageData', 'ParseError', 'ScraperMixin', 'AbstractSite', 'SiteList', 'jsondata', 'PrimaryLookupIdDescriptor', 'LookupIdDescriptor', 'setMockMode', 'getMockMode', 'use_local_response', 'RetryDownloader', 'BasicDownloader', 'ProxiedDownloader', 'BasicImageDownloader', 'RESPONSE_OK', 'RESPONSE_NETWORK_ERROR', 'RESPONSE_INVALID_CONTENT', 'RESPONSE_CENSORSHIP')
__all__ = ('IdType', 'Item', 'ExternalResource', 'ResourceContent', 'ParseError', 'ScraperMixin', 'AbstractSite', 'SiteList', 'jsondata', 'PrimaryLookupIdDescriptor', 'LookupIdDescriptor', 'setMockMode', 'getMockMode', 'use_local_response', 'RetryDownloader', 'BasicDownloader', 'ProxiedDownloader', 'BasicImageDownloader', 'RESPONSE_OK', 'RESPONSE_NETWORK_ERROR', 'RESPONSE_INVALID_CONTENT', 'RESPONSE_CENSORSHIP')

@@ -179,16 +179,16 @@ class Item(PolymorphicModel):
# print(ll)
pass

METADATA_COPY_LIST = ['title', 'brief'] # list of metadata keys to copy from page to item
METADATA_COPY_LIST = ['title', 'brief'] # list of metadata keys to copy from resource to item

@classmethod
def copy_metadata(cls, metadata):
return dict((k, v) for k, v in metadata.items() if k in cls.METADATA_COPY_LIST and v is not None)

def merge_data_from_extenal_pages(self):
def merge_data_from_external_resources(self):
"""Subclass may override this"""
lookup_ids = []
for p in self.external_pages.all():
for p in self.external_resources.all():
lookup_ids.append((p.id_type, p.id_value))
lookup_ids += p.other_lookup_ids.items()
for k in self.METADATA_COPY_LIST:

@@ -198,7 +198,7 @@ class Item(PolymorphicModel):
self.cover = p.cover
self.update_lookup_ids(lookup_ids)

def update_linked_items_from_extenal_page(self, page):
def update_linked_items_from_external_resource(self, resource):
"""Subclass should override this"""
pass

@@ -213,19 +213,19 @@ class ItemLookupId(models.Model):
unique_together = [['id_type', 'id_value']]

class ExternalPage(models.Model):
item = models.ForeignKey(Item, null=True, on_delete=models.SET_NULL, related_name='external_pages')
class ExternalResource(models.Model):
item = models.ForeignKey(Item, null=True, on_delete=models.SET_NULL, related_name='external_resources')
id_type = models.CharField(_("IdType of the source site"), blank=False, choices=IdType.choices, max_length=50)
id_value = models.CharField(_("Primary Id on the source site"), blank=False, max_length=1000)
url = models.CharField(_("url to the page"), blank=False, max_length=1000, unique=True)
url = models.CharField(_("url to the resource"), blank=False, max_length=1000, unique=True)
cover = models.ImageField(upload_to=item_cover_path, default=DEFAULT_ITEM_COVER, blank=True)
other_lookup_ids = models.JSONField(default=dict)
metadata = models.JSONField(default=dict)
scraped_time = models.DateTimeField(null=True)
created_time = models.DateTimeField(auto_now_add=True)
edited_time = models.DateTimeField(auto_now=True)
required_pages = jsondata.ArrayField(null=False, blank=False, default=list)
related_pages = jsondata.ArrayField(null=False, blank=False, default=list)
required_resources = jsondata.ArrayField(null=False, blank=False, default=list)
related_resources = jsondata.ArrayField(null=False, blank=False, default=list)

class Meta:
unique_together = [['id_type', 'id_value']]

@@ -233,11 +233,11 @@ class ExternalPage(models.Model):
def __str__(self):
return f"{self.id}{':' + self.id_type + ':' + self.id_value if self.id_value else ''} ({self.url})"

def update_content(self, page_data):
self.other_lookup_ids = page_data.lookup_ids
self.metadata = page_data.metadata
if page_data.cover_image and page_data.cover_image_extention:
self.cover = SimpleUploadedFile('temp.' + page_data.cover_image_extention, page_data.cover_image)
def update_content(self, resource_content):
self.other_lookup_ids = resource_content.lookup_ids
self.metadata = resource_content.metadata
if resource_content.cover_image and resource_content.cover_image_extention:
self.cover = SimpleUploadedFile('temp.' + resource_content.cover_image_extention, resource_content.cover_image)
self.scraped_time = timezone.now()
self.save()

@@ -1,6 +1,6 @@
from typing import *
import re
from .models import ExternalPage
from .models import ExternalResource
from dataclasses import dataclass, field
import logging

@@ -9,7 +9,7 @@ logger = logging.getLogger(__name__)

@dataclass
class PageData:
class ResourceContent:
lookup_ids: dict = field(default_factory=dict)
metadata: dict = field(default_factory=dict)
cover_image = None

@@ -45,28 +45,28 @@ class AbstractSite:
def __init__(self, url=None):
self.id_value = self.url_to_id(url) if url else None
self.url = self.id_to_url(self.id_value) if url else None
self.page = None
self.resource = None

def get_page(self):
if not self.page:
self.page = ExternalPage.objects.filter(url=self.url).first()
if self.page is None:
self.page = ExternalPage(id_type=self.ID_TYPE, id_value=self.id_value, url=self.url)
return self.page
def get_resource(self):
if not self.resource:
self.resource = ExternalResource.objects.filter(url=self.url).first()
if self.resource is None:
self.resource = ExternalResource(id_type=self.ID_TYPE, id_value=self.id_value, url=self.url)
return self.resource

def bypass_scrape(self, data_from_link) -> PageData | None:
"""subclass may implement this to use data from linked page and bypass actual scrape"""
def bypass_scrape(self, data_from_link) -> ResourceContent | None:
"""subclass may implement this to use data from linked resource and bypass actual scrape"""
return None

def scrape(self) -> PageData:
"""subclass should implement this, return PageData object"""
data = PageData()
def scrape(self) -> ResourceContent:
"""subclass should implement this, return ResourceContent object"""
data = ResourceContent()
return data

def get_item(self):
p = self.get_page()
p = self.get_resource()
if not p:
raise ValueError(f'page not available for {self.url}')
raise ValueError(f'resource not available for {self.url}')
model = p.get_preferred_model()
if not model:
model = self.DEFAULT_MODEL

@@ -82,41 +82,41 @@ class AbstractSite:

@property
def ready(self):
return bool(self.page and self.page.ready)
return bool(self.resource and self.resource.ready)

def get_page_ready(self, auto_save=True, auto_create=True, auto_link=True, data_from_link=None):
"""return a page scraped, or scrape if not yet"""
def get_resource_ready(self, auto_save=True, auto_create=True, auto_link=True, data_from_link=None):
"""return a resource scraped, or scrape if not yet"""
if auto_link:
auto_create = True
if auto_create:
auto_save = True
p = self.get_page()
pagedata = {}
if not self.page:
p = self.get_resource()
resource_content = {}
if not self.resource:
return None
if not p.ready:
pagedata = self.bypass_scrape(data_from_link)
if not pagedata:
pagedata = self.scrape()
p.update_content(pagedata)
resource_content = self.bypass_scrape(data_from_link)
if not resource_content:
resource_content = self.scrape()
p.update_content(resource_content)
if not p.ready:
logger.error(f'unable to get page {self.url} ready')
logger.error(f'unable to get resource {self.url} ready')
return None
if auto_create and p.item is None:
self.get_item()
if auto_save:
p.save()
if p.item:
p.item.merge_data_from_extenal_pages()
p.item.merge_data_from_external_resources()
p.item.save()
if auto_link:
for linked_pages in p.required_pages:
linked_site = SiteList.get_site_by_url(linked_pages['url'])
for linked_resources in p.required_resources:
linked_site = SiteList.get_site_by_url(linked_resources['url'])
if linked_site:
linked_site.get_page_ready(auto_link=False)
linked_site.get_resource_ready(auto_link=False)
else:
logger.error(f'unable to get site for {linked_pages["url"]}')
p.item.update_linked_items_from_extenal_page(p)
logger.error(f'unable to get site for {linked_resources["url"]}')
p.item.update_linked_items_from_external_resource(p)
p.item.save()
return p

@@ -14,9 +14,9 @@ logger = logging.getLogger(__name__)
DEFAULT_ITEM_COVER = 'item/default.svg'

def item_cover_path(page, filename):
def item_cover_path(resource, filename):
fn = timezone.now().strftime('%Y/%m/%d/') + str(uuid.uuid4()) + '.' + filename.split('.')[-1]
return 'items/' + page.id_type + '/' + fn
return 'items/' + resource.id_type + '/' + fn

TestDataDir = str(Path(__file__).parent.parent.parent.absolute()) + '/test_data/'

@@ -5,7 +5,7 @@ from catalog.sites import *

class Command(BaseCommand):
help = 'Scrape a catalog item from external page (but not save it)'
help = 'Scrape a catalog item from external resource (but not save it)'

def add_arguments(self, parser):
parser.add_argument('url', type=str, help='URL to scrape')

@@ -17,6 +17,6 @@ class Command(BaseCommand):
self.stdout.write(self.style.ERROR(f'Unknown site for {url}'))
return
self.stdout.write(f'Fetching from {site}')
page = site.get_page_ready(auto_link=False, auto_save=False)
resource = site.get_resource_ready(auto_link=False, auto_save=False)
self.stdout.write(self.style.SUCCESS(f'Done.'))
pprint.pp(page.metadata)
pprint.pp(resource.metadata)

@@ -19,11 +19,11 @@ class DoubanMovieTestCase(TestCase):
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.id_value, '3541415')
site.get_page_ready()
self.assertEqual(site.page.metadata['title'], '盗梦空间')
self.assertEqual(site.page.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.page.item.__class__.__name__, 'Movie')
self.assertEqual(site.page.item.imdb, 'tt1375666')
site.get_resource_ready()
self.assertEqual(site.resource.metadata['title'], '盗梦空间')
self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.resource.item.__class__.__name__, 'Movie')
self.assertEqual(site.resource.item.imdb, 'tt1375666')

class TMDBMovieTestCase(TestCase):

@@ -45,11 +45,11 @@ class TMDBMovieTestCase(TestCase):
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.id_value, '293767')
site.get_page_ready()
self.assertEqual(site.page.metadata['title'], '比利·林恩的中场战事')
self.assertEqual(site.page.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.page.item.__class__.__name__, 'Movie')
self.assertEqual(site.page.item.imdb, 'tt2513074')
site.get_resource_ready()
self.assertEqual(site.resource.metadata['title'], '比利·林恩的中场战事')
self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.resource.item.__class__.__name__, 'Movie')
self.assertEqual(site.resource.item.imdb, 'tt2513074')

class IMDBMovieTestCase(TestCase):

@@ -71,10 +71,10 @@ class IMDBMovieTestCase(TestCase):
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.id_value, 'tt1375666')
site.get_page_ready()
self.assertEqual(site.page.metadata['title'], '盗梦空间')
self.assertEqual(site.page.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.page.item.imdb, 'tt1375666')
site.get_resource_ready()
self.assertEqual(site.resource.metadata['title'], '盗梦空间')
self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.resource.item.imdb, 'tt1375666')

class MultiMovieSitesTestCase(TestCase):

@@ -83,8 +83,8 @@ class MultiMovieSitesTestCase(TestCase):
url1 = 'https://www.themoviedb.org/movie/27205-inception'
url2 = 'https://movie.douban.com/subject/3541415/'
url3 = 'https://www.imdb.com/title/tt1375666/'
p1 = SiteList.get_site_by_url(url1).get_page_ready()
p2 = SiteList.get_site_by_url(url2).get_page_ready()
p3 = SiteList.get_site_by_url(url3).get_page_ready()
p1 = SiteList.get_site_by_url(url1).get_resource_ready()
p2 = SiteList.get_site_by_url(url2).get_resource_ready()
p3 = SiteList.get_site_by_url(url3).get_resource_ready()
self.assertEqual(p1.item.id, p2.item.id)
self.assertEqual(p2.item.id, p3.item.id)

@@ -20,8 +20,8 @@ class SpotifyTestCase(TestCase):
t_url = 'https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP'
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
site.get_page_ready()
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.page.metadata['title'], 'The Race For Space')
self.assertIsInstance(site.page.item, Album)
self.assertEqual(site.page.item.barcode, '3610159662676')
self.assertEqual(site.resource.metadata['title'], 'The Race For Space')
self.assertIsInstance(site.resource.item, Album)
self.assertEqual(site.resource.item.barcode, '3610159662676')

@@ -22,9 +22,9 @@ class DoubanDramaTestCase(TestCase):
t_url = 'https://www.douban.com/location/drama/24849279/'
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
page = site.get_page_ready()
resource = site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(page.metadata['title'], '红花侠')
self.assertEqual(resource.metadata['title'], '红花侠')
item = site.get_item()
self.assertEqual(item.title, '红花侠')

@@ -24,7 +24,7 @@ class ApplePodcastTestCase(TestCase):
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.id_value, '1050430296')
site.get_page_ready()
self.assertEqual(site.page.metadata['title'], 'The New Yorker Radio Hour')
# self.assertEqual(site.page.metadata['feed_url'], 'http://feeds.wnyc.org/newyorkerradiohour')
self.assertEqual(site.page.metadata['feed_url'], 'http://feeds.feedburner.com/newyorkerradiohour')
site.get_resource_ready()
self.assertEqual(site.resource.metadata['title'], 'The New Yorker Radio Hour')
# self.assertEqual(site.resource.metadata['feed_url'], 'http://feeds.wnyc.org/newyorkerradiohour')
self.assertEqual(site.resource.metadata['feed_url'], 'http://feeds.feedburner.com/newyorkerradiohour')

@@ -22,7 +22,7 @@ class ApplePodcast(AbstractSite):
dl = BasicDownloader(api_url)
resp = dl.download()
r = resp.json()['results'][0]
pd = PageData(metadata={
pd = ResourceContent(metadata={
'title': r['trackName'],
'feed_url': r['feedUrl'],
'hosts': [r['artistName']],

@@ -111,14 +111,14 @@ class DoubanBook(AbstractSite, ScraperMixin):
work_link = self.parse_str('//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href')
if work_link:
r = re.match(r'\w+://book.douban.com/works/(\d+)', work_link)
self.data['required_pages'] = [{
self.data['required_resources'] = [{
'model': 'Work',
'id_type': IdType.DoubanBook_Work,
'id_value': r[1] if r else None,
'title': self.data['title'],
'url': work_link,
}]
pd = PageData(metadata=self.data)
pd = ResourceContent(metadata=self.data)
pd.lookup_ids[IdType.ISBN] = self.data.get('isbn')
pd.lookup_ids[IdType.CUBN] = self.data.get('cubn')
if self.data["cover_image_url"]:

@@ -145,7 +145,7 @@ class DoubanBook_Work(AbstractSite):
def bypass_scrape(self, data_from_link):
if not data_from_link:
return None
pd = PageData(metadata={
pd = ResourceContent(metadata={
'title': data_from_link['title'],
})
return pd

@@ -156,7 +156,7 @@ class DoubanBook_Work(AbstractSite):
title = title_elem[0].split('全部版本(')[0].strip() if title_elem else None
if not title:
raise ParseError(self, 'title')
pd = PageData(metadata={
pd = ResourceContent(metadata={
'title': title,
})
return pd

@@ -48,7 +48,7 @@ class DoubanDrama(AbstractSite):
img_url_elem = h.xpath("//img[@itemprop='image']/@src")
data['cover_image_url'] = img_url_elem[0].strip() if img_url_elem else None

pd = PageData(metadata=data)
pd = ResourceContent(metadata=data)
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:

@@ -215,7 +215,7 @@ class DoubanMovie(AbstractSite):
img_url_elem = content.xpath("//img[@rel='v:image']/@src")
img_url = img_url_elem[0].strip() if img_url_elem else None

pd = PageData(metadata={
pd = ResourceContent(metadata={
'title': title,
'orig_title': orig_title,
'other_title': other_title,

@@ -257,7 +257,7 @@ class DoubanMovie(AbstractSite):
# TODO correct the IMDB id
pd.lookup_ids[IdType.IMDB] = imdb_code
if tmdb_show_id:
pd.metadata['required_pages'] = [{
pd.metadata['required_resources'] = [{
'model': 'TVShow',
'id_type': IdType.TMDB_TV,
'id_value': tmdb_show_id,

@@ -265,7 +265,7 @@ class DoubanMovie(AbstractSite):
'url': TMDB_TV.id_to_url(tmdb_show_id),
}]
# TODO parse sister seasons
# pd.metadata['related_pages'] = []
# pd.metadata['related_resources'] = []
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:

@@ -66,14 +66,14 @@ class Goodreads(AbstractSite):
data['cover_image_url'] = b['imageUrl']
w = next(filter(lambda x: x.get('details'), o['Work']), None)
if w:
data['required_pages'] = [{
data['required_resources'] = [{
'model': 'Work',
'id_type': IdType.Goodreads_Work,
'id_value': str(w['legacyId']),
'title': w['details']['originalTitle'],
'url': w['editions']['webUrl'],
}]
pd = PageData(metadata=data)
pd = ResourceContent(metadata=data)
pd.lookup_ids[IdType.ISBN] = data.get('isbn')
pd.lookup_ids[IdType.ASIN] = data.get('asin')
if data["cover_image_url"]:

@@ -107,7 +107,7 @@ class Goodreads_Work(AbstractSite):
author = author_elem[0].strip() if author_elem else None
first_published_elem = content.xpath("//h2/span/text()")
first_published = first_published_elem[0].strip() if first_published_elem else None
pd = PageData(metadata={
pd = ResourceContent(metadata={
'title': title,
'author': author,
'first_published': first_published

@@ -74,7 +74,7 @@ class Spotify(AbstractSite):
# isrc = res_data['external_ids'].get('isrc')
# _logger.error('isrc for album? this should not happen')

pd = PageData(metadata={
pd = ResourceContent(metadata={
'title': title,
'artist': artist,
'genre': genre,

@@ -126,7 +126,7 @@ class TMDB_Movie(AbstractSite):
# TODO: use GET /configuration to get base url
img_url = ('https://image.tmdb.org/t/p/original/' + res_data['poster_path']) if res_data['poster_path'] is not None else None

pd = PageData(metadata={
pd = ResourceContent(metadata={
'title': title,
'orig_title': orig_title,
'other_title': None,

@@ -233,7 +233,7 @@ class TMDB_TV(AbstractSite):
'id_value': f'{self.id_value}-{s["season_number"]}',
'title': s['name'],
'url': f'{self.url}/season/{s["season_number"]}'}, res_data['seasons']))
pd = PageData(metadata={
pd = ResourceContent(metadata={
'title': title,
'orig_title': orig_title,
'other_title': None,

@@ -253,7 +253,7 @@ class TMDB_TV(AbstractSite):
'single_episode_length': None,
'brief': brief,
'cover_image_url': img_url,
'related_pages': season_links,
'related_resources': season_links,
})
if imdb_code:
pd.lookup_ids[IdType.IMDB] = imdb_code

@@ -292,8 +292,8 @@ class TMDB_TVSeason(AbstractSite):
d = BasicDownloader(api_url).download().json()
if not d.get('id'):
raise ParseError('id')
pd = PageData(metadata=_copy_dict(d, {'name': 'title', 'overview': 'brief', 'air_date': 'air_date', 'season_number': 0, 'external_ids': 0}))
pd.metadata['required_pages'] = [{
pd = ResourceContent(metadata=_copy_dict(d, {'name': 'title', 'overview': 'brief', 'air_date': 'air_date', 'season_number': 0, 'external_ids': 0}))
pd.metadata['required_resources'] = [{
'model': 'TVShow',
'id_type': IdType.TMDB_TV,
'id_value': v[0],

@@ -44,12 +44,12 @@ class TVSeason(Item):
episode_count = jsondata.IntegerField(blank=True, default=None)
METADATA_COPY_LIST = ['title', 'brief', 'season_number', 'episode_count']

def update_linked_items_from_extenal_page(self, page):
"""add Work from page.metadata['work'] if not yet"""
links = page.required_pages + page.related_pages
def update_linked_items_from_external_resource(self, resource):
"""add Work from resource.metadata['work'] if not yet"""
links = resource.required_resources + resource.related_resources
for w in links:
if w['model'] == 'TVShow':
p = ExternalPage.objects.filter(id_type=w['id_type'], id_value=w['id_value']).first()
p = ExternalResource.objects.filter(id_type=w['id_type'], id_value=w['id_value']).first()
if p and p.item and self.show != p.item:
self.show = p.item

@@ -27,12 +27,12 @@ class TMDBTVTestCase(TestCase):
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.id_value, '57243')
site.get_page_ready()
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.page.metadata['title'], '神秘博士')
self.assertEqual(site.page.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.page.item.__class__.__name__, 'TVShow')
self.assertEqual(site.page.item.imdb, 'tt0436992')
self.assertEqual(site.resource.metadata['title'], '神秘博士')
self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.resource.item.__class__.__name__, 'TVShow')
self.assertEqual(site.resource.item.imdb, 'tt0436992')

class TMDBTVSeasonTestCase(TestCase):

@@ -54,21 +54,21 @@ class TMDBTVSeasonTestCase(TestCase):
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.id_value, '57243-4')
site.get_page_ready()
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.page.metadata['title'], '第 4 季')
self.assertEqual(site.page.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.page.item.__class__.__name__, 'TVSeason')
self.assertEqual(site.page.item.imdb, 'tt1159991')
self.assertIsNotNone(site.page.item.show)
self.assertEqual(site.page.item.show.imdb, 'tt0436992')
self.assertEqual(site.resource.metadata['title'], '第 4 季')
self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.resource.item.__class__.__name__, 'TVSeason')
self.assertEqual(site.resource.item.imdb, 'tt1159991')
self.assertIsNotNone(site.resource.item.show)
self.assertEqual(site.resource.item.show.imdb, 'tt0436992')

class DoubanMovieTVTestCase(TestCase):
@use_local_response
def test_scrape(self):
url3 = 'https://movie.douban.com/subject/3627919/'
p3 = SiteList.get_site_by_url(url3).get_page_ready()
p3 = SiteList.get_site_by_url(url3).get_resource_ready()
self.assertEqual(p3.item.__class__.__name__, 'TVSeason')
self.assertIsNotNone(p3.item.show)
self.assertEqual(p3.item.show.imdb, 'tt0436992')

@@ -76,7 +76,7 @@ class DoubanMovieTVTestCase(TestCase):
@use_local_response
def test_scrape_singleseason(self):
url3 = 'https://movie.douban.com/subject/26895436/'
p3 = SiteList.get_site_by_url(url3).get_page_ready()
p3 = SiteList.get_site_by_url(url3).get_resource_ready()
self.assertEqual(p3.item.__class__.__name__, 'TVShow')

@@ -86,9 +86,9 @@ class MultiTVSitesTestCase(TestCase):
url1 = 'https://www.themoviedb.org/tv/57243-doctor-who'
url2 = 'https://www.imdb.com/title/tt0436992/'
# url3 = 'https://movie.douban.com/subject/3541415/'
p1 = SiteList.get_site_by_url(url1).get_page_ready()
p2 = SiteList.get_site_by_url(url2).get_page_ready()
# p3 = SiteList.get_site_by_url(url3).get_page_ready()
p1 = SiteList.get_site_by_url(url1).get_resource_ready()
p2 = SiteList.get_site_by_url(url2).get_resource_ready()
# p3 = SiteList.get_site_by_url(url3).get_resource_ready()
self.assertEqual(p1.item.id, p2.item.id)
# self.assertEqual(p2.item.id, p3.item.id)

@@ -97,9 +97,9 @@ class MultiTVSitesTestCase(TestCase):
url1 = 'https://www.themoviedb.org/tv/57243-doctor-who/season/4'
url2 = 'https://www.imdb.com/title/tt1159991/'
url3 = 'https://movie.douban.com/subject/3627919/'
p1 = SiteList.get_site_by_url(url1).get_page_ready()
p2 = SiteList.get_site_by_url(url2).get_page_ready()
p3 = SiteList.get_site_by_url(url3).get_page_ready()
p1 = SiteList.get_site_by_url(url1).get_resource_ready()
p2 = SiteList.get_site_by_url(url2).get_resource_ready()
p3 = SiteList.get_site_by_url(url3).get_resource_ready()
self.assertEqual(p1.item.imdb, p2.item.imdb)
self.assertEqual(p2.item.imdb, p3.item.imdb)
self.assertEqual(p1.item.id, p2.item.id)

@@ -109,8 +109,8 @@ class MultiTVSitesTestCase(TestCase):
def test_miniseries(self):
url1 = 'https://www.themoviedb.org/tv/86941-the-north-water'
url3 = 'https://movie.douban.com/subject/26895436/'
p1 = SiteList.get_site_by_url(url1).get_page_ready()
p3 = SiteList.get_site_by_url(url3).get_page_ready()
p1 = SiteList.get_site_by_url(url1).get_resource_ready()
p3 = SiteList.get_site_by_url(url3).get_resource_ready()
self.assertEqual(p3.item.__class__.__name__, 'TVShow')
self.assertEqual(p1.item.id, p3.item.id)

@@ -119,9 +119,9 @@ class MultiTVSitesTestCase(TestCase):
url1 = 'https://www.themoviedb.org/movie/282758-doctor-who-the-runaway-bride'
url2 = 'hhttps://www.imdb.com/title/tt0827573/'
url3 = 'https://movie.douban.com/subject/4296866/'
p1 = SiteList.get_site_by_url(url1).get_page_ready()
p2 = SiteList.get_site_by_url(url2).get_page_ready()
p3 = SiteList.get_site_by_url(url3).get_page_ready()
p1 = SiteList.get_site_by_url(url1).get_resource_ready()
p2 = SiteList.get_site_by_url(url2).get_resource_ready()
p3 = SiteList.get_site_by_url(url3).get_resource_ready()
self.assertEqual(p1.item.imdb, p2.item.imdb)
self.assertEqual(p2.item.imdb, p3.item.imdb)
self.assertEqual(p1.item.id, p2.item.id)