diff --git a/catalog/book/models.py b/catalog/book/models.py index a96da033..6e62c47e 100644 --- a/catalog/book/models.py +++ b/catalog/book/models.py @@ -13,7 +13,7 @@ only has Edition level ("volume") data Douban: old editions has only CUBN(Chinese Unified Book Number) -work data seems asymmetric (a book page links to a work page, but may not listed on that work page as one of the editions) +work data seems asymmetric (a book links to a work, but may not listed in that work as one of its editions) """ @@ -45,9 +45,9 @@ class Edition(Item): def isbn10(self, value): self.isbn = isbn_10_to_13(value) - def update_linked_items_from_extenal_page(self, page): - """add Work from page.metadata['work'] if not yet""" - links = page.required_pages + page.related_pages + def update_linked_items_from_external_resource(self, resource): + """add Work from resource.metadata['work'] if not yet""" + links = resource.required_resources + resource.related_resources for w in links: if w['model'] == 'Work': work = Work.objects.filter(primary_lookup_id_type=w['id_type'], primary_lookup_id_value=w['id_value']).first() diff --git a/catalog/book/tests.py b/catalog/book/tests.py index 511bd705..ccdfc3d5 100644 --- a/catalog/book/tests.py +++ b/catalog/book/tests.py @@ -71,19 +71,19 @@ class GoodreadsTestCase(TestCase): site = SiteList.get_site_by_url(t_url) self.assertEqual(site.ready, False) self.assertEqual(site.url, t_url2) - site.get_page() + site.get_resource() self.assertEqual(site.ready, False) - self.assertIsNotNone(site.page) - site.get_page_ready() + self.assertIsNotNone(site.resource) + site.get_resource_ready() self.assertEqual(site.ready, True) - self.assertEqual(site.page.metadata.get('title'), 'Hyperion') - self.assertEqual(site.page.metadata.get('isbn'), isbn) - self.assertEqual(site.page.required_pages[0]['id_value'], '1383900') + self.assertEqual(site.resource.metadata.get('title'), 'Hyperion') + self.assertEqual(site.resource.metadata.get('isbn'), isbn) + self.assertEqual(site.resource.required_resources[0]['id_value'], '1383900') edition = Edition.objects.get(primary_lookup_id_type=IdType.ISBN, primary_lookup_id_value=isbn) - page = edition.external_pages.all().first() - self.assertEqual(page.id_type, IdType.Goodreads) - self.assertEqual(page.id_value, '77566') - self.assertNotEqual(page.cover, '/media/item/default.svg') + resource = edition.external_resources.all().first() + self.assertEqual(resource.id_type, IdType.Goodreads) + self.assertEqual(resource.id_value, '77566') + self.assertNotEqual(resource.cover, '/media/item/default.svg') self.assertEqual(edition.isbn, '9780553283686') self.assertEqual(edition.title, 'Hyperion') @@ -91,26 +91,26 @@ class GoodreadsTestCase(TestCase): site = SiteList.get_site_by_url(t_url) self.assertEqual(site.ready, False) self.assertEqual(site.url, t_url2) - site.get_page() - self.assertEqual(site.ready, True, 'previous page should still exist with data') + site.get_resource() + self.assertEqual(site.ready, True, 'previous resource should still exist with data') @use_local_response def test_asin(self): t_url = 'https://www.goodreads.com/book/show/45064996-hyperion' site = SiteList.get_site_by_url(t_url) - site.get_page_ready() - self.assertEqual(site.page.item.title, 'Hyperion') - self.assertEqual(site.page.item.asin, 'B004G60EHS') + site.get_resource_ready() + self.assertEqual(site.resource.item.title, 'Hyperion') + self.assertEqual(site.resource.item.asin, 'B004G60EHS') @use_local_response def test_work(self): url = 'https://www.goodreads.com/work/editions/153313' - p = SiteList.get_site_by_url(url).get_page_ready() + p = SiteList.get_site_by_url(url).get_resource_ready() self.assertEqual(p.item.title, '1984') url1 = 'https://www.goodreads.com/book/show/3597767-rok-1984' url2 = 'https://www.goodreads.com/book/show/40961427-1984' - p1 = SiteList.get_site_by_url(url1).get_page_ready() - p2 = SiteList.get_site_by_url(url2).get_page_ready() + p1 = SiteList.get_site_by_url(url1).get_resource_ready() + p2 = SiteList.get_site_by_url(url2).get_resource_ready() w1 = p1.item.works.all().first() w2 = p2.item.works.all().first() self.assertEqual(w1, w2) @@ -137,22 +137,22 @@ class DoubanBookTestCase(TestCase): t_url = 'https://book.douban.com/subject/35902899/' site = SiteList.get_site_by_url(t_url) self.assertEqual(site.ready, False) - site.get_page_ready() + site.get_resource_ready() self.assertEqual(site.ready, True) - self.assertEqual(site.page.metadata.get('title'), '1984 Nineteen Eighty-Four') - self.assertEqual(site.page.metadata.get('isbn'), '9781847498571') - self.assertEqual(site.page.id_type, IdType.DoubanBook) - self.assertEqual(site.page.id_value, '35902899') - self.assertEqual(site.page.item.isbn, '9781847498571') - self.assertEqual(site.page.item.title, '1984 Nineteen Eighty-Four') + self.assertEqual(site.resource.metadata.get('title'), '1984 Nineteen Eighty-Four') + self.assertEqual(site.resource.metadata.get('isbn'), '9781847498571') + self.assertEqual(site.resource.id_type, IdType.DoubanBook) + self.assertEqual(site.resource.id_value, '35902899') + self.assertEqual(site.resource.item.isbn, '9781847498571') + self.assertEqual(site.resource.item.title, '1984 Nineteen Eighty-Four') @use_local_response def test_work(self): # url = 'https://www.goodreads.com/work/editions/153313' url1 = 'https://book.douban.com/subject/1089243/' url2 = 'https://book.douban.com/subject/2037260/' - p1 = SiteList.get_site_by_url(url1).get_page_ready() - p2 = SiteList.get_site_by_url(url2).get_page_ready() + p1 = SiteList.get_site_by_url(url1).get_resource_ready() + p2 = SiteList.get_site_by_url(url2).get_resource_ready() w1 = p1.item.works.all().first() w2 = p2.item.works.all().first() self.assertEqual(w1.title, '黄金时代') @@ -169,8 +169,8 @@ class MultiBookSitesTestCase(TestCase): # isbn = '9781847498571' url1 = 'https://www.goodreads.com/book/show/56821625-1984' url2 = 'https://book.douban.com/subject/35902899/' - p1 = SiteList.get_site_by_url(url1).get_page_ready() - p2 = SiteList.get_site_by_url(url2).get_page_ready() + p1 = SiteList.get_site_by_url(url1).get_resource_ready() + p2 = SiteList.get_site_by_url(url2).get_resource_ready() self.assertEqual(p1.item.id, p2.item.id) @use_local_response @@ -180,16 +180,16 @@ class MultiBookSitesTestCase(TestCase): url2 = 'https://book.douban.com/subject/2037260/' url3 = 'https://www.goodreads.com/book/show/59952545-golden-age' url4 = 'https://www.goodreads.com/book/show/11798823' - p1 = SiteList.get_site_by_url(url1).get_page_ready() # lxml bug may break this + p1 = SiteList.get_site_by_url(url1).get_resource_ready() # lxml bug may break this w1 = p1.item.works.all().first() - p2 = SiteList.get_site_by_url(url2).get_page_ready() + p2 = SiteList.get_site_by_url(url2).get_resource_ready() w2 = p2.item.works.all().first() self.assertEqual(w1, w2) self.assertEqual(p1.item.works.all().count(), 1) - p3 = SiteList.get_site_by_url(url3).get_page_ready() + p3 = SiteList.get_site_by_url(url3).get_resource_ready() w3 = p3.item.works.all().first() self.assertNotEqual(w3, w2) - p4 = SiteList.get_site_by_url(url4).get_page_ready() + p4 = SiteList.get_site_by_url(url4).get_resource_ready() self.assertEqual(p4.item.works.all().count(), 2) self.assertEqual(p1.item.works.all().count(), 2) w2e = w2.editions.all().order_by('title') diff --git a/catalog/common/__init__.py b/catalog/common/__init__.py index b51c2629..1ea3b6ae 100644 --- a/catalog/common/__init__.py +++ b/catalog/common/__init__.py @@ -5,4 +5,4 @@ from .scrapers import * from . import jsondata -__all__ = ('IdType', 'Item', 'ExternalPage', 'PageData', 'ParseError', 'ScraperMixin', 'AbstractSite', 'SiteList', 'jsondata', 'PrimaryLookupIdDescriptor', 'LookupIdDescriptor', 'setMockMode', 'getMockMode', 'use_local_response', 'RetryDownloader', 'BasicDownloader', 'ProxiedDownloader', 'BasicImageDownloader', 'RESPONSE_OK', 'RESPONSE_NETWORK_ERROR', 'RESPONSE_INVALID_CONTENT', 'RESPONSE_CENSORSHIP') +__all__ = ('IdType', 'Item', 'ExternalResource', 'ResourceContent', 'ParseError', 'ScraperMixin', 'AbstractSite', 'SiteList', 'jsondata', 'PrimaryLookupIdDescriptor', 'LookupIdDescriptor', 'setMockMode', 'getMockMode', 'use_local_response', 'RetryDownloader', 'BasicDownloader', 'ProxiedDownloader', 'BasicImageDownloader', 'RESPONSE_OK', 'RESPONSE_NETWORK_ERROR', 'RESPONSE_INVALID_CONTENT', 'RESPONSE_CENSORSHIP') diff --git a/catalog/common/models.py b/catalog/common/models.py index e5b19bfb..511c3c65 100644 --- a/catalog/common/models.py +++ b/catalog/common/models.py @@ -179,16 +179,16 @@ class Item(PolymorphicModel): # print(ll) pass - METADATA_COPY_LIST = ['title', 'brief'] # list of metadata keys to copy from page to item + METADATA_COPY_LIST = ['title', 'brief'] # list of metadata keys to copy from resource to item @classmethod def copy_metadata(cls, metadata): return dict((k, v) for k, v in metadata.items() if k in cls.METADATA_COPY_LIST and v is not None) - def merge_data_from_extenal_pages(self): + def merge_data_from_external_resources(self): """Subclass may override this""" lookup_ids = [] - for p in self.external_pages.all(): + for p in self.external_resources.all(): lookup_ids.append((p.id_type, p.id_value)) lookup_ids += p.other_lookup_ids.items() for k in self.METADATA_COPY_LIST: @@ -198,7 +198,7 @@ class Item(PolymorphicModel): self.cover = p.cover self.update_lookup_ids(lookup_ids) - def update_linked_items_from_extenal_page(self, page): + def update_linked_items_from_external_resource(self, resource): """Subclass should override this""" pass @@ -213,19 +213,19 @@ class ItemLookupId(models.Model): unique_together = [['id_type', 'id_value']] -class ExternalPage(models.Model): - item = models.ForeignKey(Item, null=True, on_delete=models.SET_NULL, related_name='external_pages') +class ExternalResource(models.Model): + item = models.ForeignKey(Item, null=True, on_delete=models.SET_NULL, related_name='external_resources') id_type = models.CharField(_("IdType of the source site"), blank=False, choices=IdType.choices, max_length=50) id_value = models.CharField(_("Primary Id on the source site"), blank=False, max_length=1000) - url = models.CharField(_("url to the page"), blank=False, max_length=1000, unique=True) + url = models.CharField(_("url to the resource"), blank=False, max_length=1000, unique=True) cover = models.ImageField(upload_to=item_cover_path, default=DEFAULT_ITEM_COVER, blank=True) other_lookup_ids = models.JSONField(default=dict) metadata = models.JSONField(default=dict) scraped_time = models.DateTimeField(null=True) created_time = models.DateTimeField(auto_now_add=True) edited_time = models.DateTimeField(auto_now=True) - required_pages = jsondata.ArrayField(null=False, blank=False, default=list) - related_pages = jsondata.ArrayField(null=False, blank=False, default=list) + required_resources = jsondata.ArrayField(null=False, blank=False, default=list) + related_resources = jsondata.ArrayField(null=False, blank=False, default=list) class Meta: unique_together = [['id_type', 'id_value']] @@ -233,11 +233,11 @@ class ExternalPage(models.Model): def __str__(self): return f"{self.id}{':' + self.id_type + ':' + self.id_value if self.id_value else ''} ({self.url})" - def update_content(self, page_data): - self.other_lookup_ids = page_data.lookup_ids - self.metadata = page_data.metadata - if page_data.cover_image and page_data.cover_image_extention: - self.cover = SimpleUploadedFile('temp.' + page_data.cover_image_extention, page_data.cover_image) + def update_content(self, resource_content): + self.other_lookup_ids = resource_content.lookup_ids + self.metadata = resource_content.metadata + if resource_content.cover_image and resource_content.cover_image_extention: + self.cover = SimpleUploadedFile('temp.' + resource_content.cover_image_extention, resource_content.cover_image) self.scraped_time = timezone.now() self.save() diff --git a/catalog/common/sites.py b/catalog/common/sites.py index d2da664e..e89893e9 100644 --- a/catalog/common/sites.py +++ b/catalog/common/sites.py @@ -1,6 +1,6 @@ from typing import * import re -from .models import ExternalPage +from .models import ExternalResource from dataclasses import dataclass, field import logging @@ -9,7 +9,7 @@ logger = logging.getLogger(__name__) @dataclass -class PageData: +class ResourceContent: lookup_ids: dict = field(default_factory=dict) metadata: dict = field(default_factory=dict) cover_image = None @@ -45,28 +45,28 @@ class AbstractSite: def __init__(self, url=None): self.id_value = self.url_to_id(url) if url else None self.url = self.id_to_url(self.id_value) if url else None - self.page = None + self.resource = None - def get_page(self): - if not self.page: - self.page = ExternalPage.objects.filter(url=self.url).first() - if self.page is None: - self.page = ExternalPage(id_type=self.ID_TYPE, id_value=self.id_value, url=self.url) - return self.page + def get_resource(self): + if not self.resource: + self.resource = ExternalResource.objects.filter(url=self.url).first() + if self.resource is None: + self.resource = ExternalResource(id_type=self.ID_TYPE, id_value=self.id_value, url=self.url) + return self.resource - def bypass_scrape(self, data_from_link) -> PageData | None: - """subclass may implement this to use data from linked page and bypass actual scrape""" + def bypass_scrape(self, data_from_link) -> ResourceContent | None: + """subclass may implement this to use data from linked resource and bypass actual scrape""" return None - def scrape(self) -> PageData: - """subclass should implement this, return PageData object""" - data = PageData() + def scrape(self) -> ResourceContent: + """subclass should implement this, return ResourceContent object""" + data = ResourceContent() return data def get_item(self): - p = self.get_page() + p = self.get_resource() if not p: - raise ValueError(f'page not available for {self.url}') + raise ValueError(f'resource not available for {self.url}') model = p.get_preferred_model() if not model: model = self.DEFAULT_MODEL @@ -82,41 +82,41 @@ class AbstractSite: @property def ready(self): - return bool(self.page and self.page.ready) + return bool(self.resource and self.resource.ready) - def get_page_ready(self, auto_save=True, auto_create=True, auto_link=True, data_from_link=None): - """return a page scraped, or scrape if not yet""" + def get_resource_ready(self, auto_save=True, auto_create=True, auto_link=True, data_from_link=None): + """return a resource scraped, or scrape if not yet""" if auto_link: auto_create = True if auto_create: auto_save = True - p = self.get_page() - pagedata = {} - if not self.page: + p = self.get_resource() + resource_content = {} + if not self.resource: return None if not p.ready: - pagedata = self.bypass_scrape(data_from_link) - if not pagedata: - pagedata = self.scrape() - p.update_content(pagedata) + resource_content = self.bypass_scrape(data_from_link) + if not resource_content: + resource_content = self.scrape() + p.update_content(resource_content) if not p.ready: - logger.error(f'unable to get page {self.url} ready') + logger.error(f'unable to get resource {self.url} ready') return None if auto_create and p.item is None: self.get_item() if auto_save: p.save() if p.item: - p.item.merge_data_from_extenal_pages() + p.item.merge_data_from_external_resources() p.item.save() if auto_link: - for linked_pages in p.required_pages: - linked_site = SiteList.get_site_by_url(linked_pages['url']) + for linked_resources in p.required_resources: + linked_site = SiteList.get_site_by_url(linked_resources['url']) if linked_site: - linked_site.get_page_ready(auto_link=False) + linked_site.get_resource_ready(auto_link=False) else: - logger.error(f'unable to get site for {linked_pages["url"]}') - p.item.update_linked_items_from_extenal_page(p) + logger.error(f'unable to get site for {linked_resources["url"]}') + p.item.update_linked_items_from_external_resource(p) p.item.save() return p diff --git a/catalog/common/utils.py b/catalog/common/utils.py index 650539f6..289ea855 100644 --- a/catalog/common/utils.py +++ b/catalog/common/utils.py @@ -14,9 +14,9 @@ logger = logging.getLogger(__name__) DEFAULT_ITEM_COVER = 'item/default.svg' -def item_cover_path(page, filename): +def item_cover_path(resource, filename): fn = timezone.now().strftime('%Y/%m/%d/') + str(uuid.uuid4()) + '.' + filename.split('.')[-1] - return 'items/' + page.id_type + '/' + fn + return 'items/' + resource.id_type + '/' + fn TestDataDir = str(Path(__file__).parent.parent.parent.absolute()) + '/test_data/' diff --git a/catalog/management/commands/cat.py b/catalog/management/commands/cat.py index c98aaac5..1f3ed236 100644 --- a/catalog/management/commands/cat.py +++ b/catalog/management/commands/cat.py @@ -5,7 +5,7 @@ from catalog.sites import * class Command(BaseCommand): - help = 'Scrape a catalog item from external page (but not save it)' + help = 'Scrape a catalog item from external resource (but not save it)' def add_arguments(self, parser): parser.add_argument('url', type=str, help='URL to scrape') @@ -17,6 +17,6 @@ class Command(BaseCommand): self.stdout.write(self.style.ERROR(f'Unknown site for {url}')) return self.stdout.write(f'Fetching from {site}') - page = site.get_page_ready(auto_link=False, auto_save=False) + resource = site.get_resource_ready(auto_link=False, auto_save=False) self.stdout.write(self.style.SUCCESS(f'Done.')) - pprint.pp(page.metadata) + pprint.pp(resource.metadata) diff --git a/catalog/movie/tests.py b/catalog/movie/tests.py index ec9c9178..b3deacce 100644 --- a/catalog/movie/tests.py +++ b/catalog/movie/tests.py @@ -19,11 +19,11 @@ class DoubanMovieTestCase(TestCase): site = SiteList.get_site_by_url(t_url) self.assertEqual(site.ready, False) self.assertEqual(site.id_value, '3541415') - site.get_page_ready() - self.assertEqual(site.page.metadata['title'], '盗梦空间') - self.assertEqual(site.page.item.primary_lookup_id_type, IdType.IMDB) - self.assertEqual(site.page.item.__class__.__name__, 'Movie') - self.assertEqual(site.page.item.imdb, 'tt1375666') + site.get_resource_ready() + self.assertEqual(site.resource.metadata['title'], '盗梦空间') + self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB) + self.assertEqual(site.resource.item.__class__.__name__, 'Movie') + self.assertEqual(site.resource.item.imdb, 'tt1375666') class TMDBMovieTestCase(TestCase): @@ -45,11 +45,11 @@ class TMDBMovieTestCase(TestCase): site = SiteList.get_site_by_url(t_url) self.assertEqual(site.ready, False) self.assertEqual(site.id_value, '293767') - site.get_page_ready() - self.assertEqual(site.page.metadata['title'], '比利·林恩的中场战事') - self.assertEqual(site.page.item.primary_lookup_id_type, IdType.IMDB) - self.assertEqual(site.page.item.__class__.__name__, 'Movie') - self.assertEqual(site.page.item.imdb, 'tt2513074') + site.get_resource_ready() + self.assertEqual(site.resource.metadata['title'], '比利·林恩的中场战事') + self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB) + self.assertEqual(site.resource.item.__class__.__name__, 'Movie') + self.assertEqual(site.resource.item.imdb, 'tt2513074') class IMDBMovieTestCase(TestCase): @@ -71,10 +71,10 @@ class IMDBMovieTestCase(TestCase): site = SiteList.get_site_by_url(t_url) self.assertEqual(site.ready, False) self.assertEqual(site.id_value, 'tt1375666') - site.get_page_ready() - self.assertEqual(site.page.metadata['title'], '盗梦空间') - self.assertEqual(site.page.item.primary_lookup_id_type, IdType.IMDB) - self.assertEqual(site.page.item.imdb, 'tt1375666') + site.get_resource_ready() + self.assertEqual(site.resource.metadata['title'], '盗梦空间') + self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB) + self.assertEqual(site.resource.item.imdb, 'tt1375666') class MultiMovieSitesTestCase(TestCase): @@ -83,8 +83,8 @@ class MultiMovieSitesTestCase(TestCase): url1 = 'https://www.themoviedb.org/movie/27205-inception' url2 = 'https://movie.douban.com/subject/3541415/' url3 = 'https://www.imdb.com/title/tt1375666/' - p1 = SiteList.get_site_by_url(url1).get_page_ready() - p2 = SiteList.get_site_by_url(url2).get_page_ready() - p3 = SiteList.get_site_by_url(url3).get_page_ready() + p1 = SiteList.get_site_by_url(url1).get_resource_ready() + p2 = SiteList.get_site_by_url(url2).get_resource_ready() + p3 = SiteList.get_site_by_url(url3).get_resource_ready() self.assertEqual(p1.item.id, p2.item.id) self.assertEqual(p2.item.id, p3.item.id) diff --git a/catalog/music/tests.py b/catalog/music/tests.py index 1c6eeb31..3354b8d9 100644 --- a/catalog/music/tests.py +++ b/catalog/music/tests.py @@ -20,8 +20,8 @@ class SpotifyTestCase(TestCase): t_url = 'https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP' site = SiteList.get_site_by_url(t_url) self.assertEqual(site.ready, False) - site.get_page_ready() + site.get_resource_ready() self.assertEqual(site.ready, True) - self.assertEqual(site.page.metadata['title'], 'The Race For Space') - self.assertIsInstance(site.page.item, Album) - self.assertEqual(site.page.item.barcode, '3610159662676') + self.assertEqual(site.resource.metadata['title'], 'The Race For Space') + self.assertIsInstance(site.resource.item, Album) + self.assertEqual(site.resource.item.barcode, '3610159662676') diff --git a/catalog/performance/tests.py b/catalog/performance/tests.py index db46a9f1..9d3302ea 100644 --- a/catalog/performance/tests.py +++ b/catalog/performance/tests.py @@ -22,9 +22,9 @@ class DoubanDramaTestCase(TestCase): t_url = 'https://www.douban.com/location/drama/24849279/' site = SiteList.get_site_by_url(t_url) self.assertEqual(site.ready, False) - page = site.get_page_ready() + resource = site.get_resource_ready() self.assertEqual(site.ready, True) - self.assertEqual(page.metadata['title'], '红花侠') + self.assertEqual(resource.metadata['title'], '红花侠') item = site.get_item() self.assertEqual(item.title, '红花侠') diff --git a/catalog/podcast/tests.py b/catalog/podcast/tests.py index 9332f4e4..0e70b3e1 100644 --- a/catalog/podcast/tests.py +++ b/catalog/podcast/tests.py @@ -24,7 +24,7 @@ class ApplePodcastTestCase(TestCase): site = SiteList.get_site_by_url(t_url) self.assertEqual(site.ready, False) self.assertEqual(site.id_value, '1050430296') - site.get_page_ready() - self.assertEqual(site.page.metadata['title'], 'The New Yorker Radio Hour') - # self.assertEqual(site.page.metadata['feed_url'], 'http://feeds.wnyc.org/newyorkerradiohour') - self.assertEqual(site.page.metadata['feed_url'], 'http://feeds.feedburner.com/newyorkerradiohour') + site.get_resource_ready() + self.assertEqual(site.resource.metadata['title'], 'The New Yorker Radio Hour') + # self.assertEqual(site.resource.metadata['feed_url'], 'http://feeds.wnyc.org/newyorkerradiohour') + self.assertEqual(site.resource.metadata['feed_url'], 'http://feeds.feedburner.com/newyorkerradiohour') diff --git a/catalog/sites/apple_podcast.py b/catalog/sites/apple_podcast.py index c4a601d1..8f06cd5f 100644 --- a/catalog/sites/apple_podcast.py +++ b/catalog/sites/apple_podcast.py @@ -22,7 +22,7 @@ class ApplePodcast(AbstractSite): dl = BasicDownloader(api_url) resp = dl.download() r = resp.json()['results'][0] - pd = PageData(metadata={ + pd = ResourceContent(metadata={ 'title': r['trackName'], 'feed_url': r['feedUrl'], 'hosts': [r['artistName']], diff --git a/catalog/sites/douban_book.py b/catalog/sites/douban_book.py index 5c3b7c28..57132f0f 100644 --- a/catalog/sites/douban_book.py +++ b/catalog/sites/douban_book.py @@ -111,14 +111,14 @@ class DoubanBook(AbstractSite, ScraperMixin): work_link = self.parse_str('//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href') if work_link: r = re.match(r'\w+://book.douban.com/works/(\d+)', work_link) - self.data['required_pages'] = [{ + self.data['required_resources'] = [{ 'model': 'Work', 'id_type': IdType.DoubanBook_Work, 'id_value': r[1] if r else None, 'title': self.data['title'], 'url': work_link, }] - pd = PageData(metadata=self.data) + pd = ResourceContent(metadata=self.data) pd.lookup_ids[IdType.ISBN] = self.data.get('isbn') pd.lookup_ids[IdType.CUBN] = self.data.get('cubn') if self.data["cover_image_url"]: @@ -145,7 +145,7 @@ class DoubanBook_Work(AbstractSite): def bypass_scrape(self, data_from_link): if not data_from_link: return None - pd = PageData(metadata={ + pd = ResourceContent(metadata={ 'title': data_from_link['title'], }) return pd @@ -156,7 +156,7 @@ class DoubanBook_Work(AbstractSite): title = title_elem[0].split('全部版本(')[0].strip() if title_elem else None if not title: raise ParseError(self, 'title') - pd = PageData(metadata={ + pd = ResourceContent(metadata={ 'title': title, }) return pd diff --git a/catalog/sites/douban_drama.py b/catalog/sites/douban_drama.py index dfc37009..86157d05 100644 --- a/catalog/sites/douban_drama.py +++ b/catalog/sites/douban_drama.py @@ -48,7 +48,7 @@ class DoubanDrama(AbstractSite): img_url_elem = h.xpath("//img[@itemprop='image']/@src") data['cover_image_url'] = img_url_elem[0].strip() if img_url_elem else None - pd = PageData(metadata=data) + pd = ResourceContent(metadata=data) if pd.metadata["cover_image_url"]: imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url) try: diff --git a/catalog/sites/douban_movie.py b/catalog/sites/douban_movie.py index 0400cba1..a6ca1869 100644 --- a/catalog/sites/douban_movie.py +++ b/catalog/sites/douban_movie.py @@ -215,7 +215,7 @@ class DoubanMovie(AbstractSite): img_url_elem = content.xpath("//img[@rel='v:image']/@src") img_url = img_url_elem[0].strip() if img_url_elem else None - pd = PageData(metadata={ + pd = ResourceContent(metadata={ 'title': title, 'orig_title': orig_title, 'other_title': other_title, @@ -257,7 +257,7 @@ class DoubanMovie(AbstractSite): # TODO correct the IMDB id pd.lookup_ids[IdType.IMDB] = imdb_code if tmdb_show_id: - pd.metadata['required_pages'] = [{ + pd.metadata['required_resources'] = [{ 'model': 'TVShow', 'id_type': IdType.TMDB_TV, 'id_value': tmdb_show_id, @@ -265,7 +265,7 @@ class DoubanMovie(AbstractSite): 'url': TMDB_TV.id_to_url(tmdb_show_id), }] # TODO parse sister seasons - # pd.metadata['related_pages'] = [] + # pd.metadata['related_resources'] = [] if pd.metadata["cover_image_url"]: imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url) try: diff --git a/catalog/sites/goodreads.py b/catalog/sites/goodreads.py index c6a87a4c..f6618d94 100644 --- a/catalog/sites/goodreads.py +++ b/catalog/sites/goodreads.py @@ -66,14 +66,14 @@ class Goodreads(AbstractSite): data['cover_image_url'] = b['imageUrl'] w = next(filter(lambda x: x.get('details'), o['Work']), None) if w: - data['required_pages'] = [{ + data['required_resources'] = [{ 'model': 'Work', 'id_type': IdType.Goodreads_Work, 'id_value': str(w['legacyId']), 'title': w['details']['originalTitle'], 'url': w['editions']['webUrl'], }] - pd = PageData(metadata=data) + pd = ResourceContent(metadata=data) pd.lookup_ids[IdType.ISBN] = data.get('isbn') pd.lookup_ids[IdType.ASIN] = data.get('asin') if data["cover_image_url"]: @@ -107,7 +107,7 @@ class Goodreads_Work(AbstractSite): author = author_elem[0].strip() if author_elem else None first_published_elem = content.xpath("//h2/span/text()") first_published = first_published_elem[0].strip() if first_published_elem else None - pd = PageData(metadata={ + pd = ResourceContent(metadata={ 'title': title, 'author': author, 'first_published': first_published diff --git a/catalog/sites/spotify.py b/catalog/sites/spotify.py index 52a90d92..18066854 100644 --- a/catalog/sites/spotify.py +++ b/catalog/sites/spotify.py @@ -74,7 +74,7 @@ class Spotify(AbstractSite): # isrc = res_data['external_ids'].get('isrc') # _logger.error('isrc for album? this should not happen') - pd = PageData(metadata={ + pd = ResourceContent(metadata={ 'title': title, 'artist': artist, 'genre': genre, diff --git a/catalog/sites/tmdb.py b/catalog/sites/tmdb.py index 8c03d70e..63b839d1 100644 --- a/catalog/sites/tmdb.py +++ b/catalog/sites/tmdb.py @@ -126,7 +126,7 @@ class TMDB_Movie(AbstractSite): # TODO: use GET /configuration to get base url img_url = ('https://image.tmdb.org/t/p/original/' + res_data['poster_path']) if res_data['poster_path'] is not None else None - pd = PageData(metadata={ + pd = ResourceContent(metadata={ 'title': title, 'orig_title': orig_title, 'other_title': None, @@ -233,7 +233,7 @@ class TMDB_TV(AbstractSite): 'id_value': f'{self.id_value}-{s["season_number"]}', 'title': s['name'], 'url': f'{self.url}/season/{s["season_number"]}'}, res_data['seasons'])) - pd = PageData(metadata={ + pd = ResourceContent(metadata={ 'title': title, 'orig_title': orig_title, 'other_title': None, @@ -253,7 +253,7 @@ class TMDB_TV(AbstractSite): 'single_episode_length': None, 'brief': brief, 'cover_image_url': img_url, - 'related_pages': season_links, + 'related_resources': season_links, }) if imdb_code: pd.lookup_ids[IdType.IMDB] = imdb_code @@ -292,8 +292,8 @@ class TMDB_TVSeason(AbstractSite): d = BasicDownloader(api_url).download().json() if not d.get('id'): raise ParseError('id') - pd = PageData(metadata=_copy_dict(d, {'name': 'title', 'overview': 'brief', 'air_date': 'air_date', 'season_number': 0, 'external_ids': 0})) - pd.metadata['required_pages'] = [{ + pd = ResourceContent(metadata=_copy_dict(d, {'name': 'title', 'overview': 'brief', 'air_date': 'air_date', 'season_number': 0, 'external_ids': 0})) + pd.metadata['required_resources'] = [{ 'model': 'TVShow', 'id_type': IdType.TMDB_TV, 'id_value': v[0], diff --git a/catalog/tv/models.py b/catalog/tv/models.py index 105f1cc0..ea1044f1 100644 --- a/catalog/tv/models.py +++ b/catalog/tv/models.py @@ -44,12 +44,12 @@ class TVSeason(Item): episode_count = jsondata.IntegerField(blank=True, default=None) METADATA_COPY_LIST = ['title', 'brief', 'season_number', 'episode_count'] - def update_linked_items_from_extenal_page(self, page): - """add Work from page.metadata['work'] if not yet""" - links = page.required_pages + page.related_pages + def update_linked_items_from_external_resource(self, resource): + """add Work from resource.metadata['work'] if not yet""" + links = resource.required_resources + resource.related_resources for w in links: if w['model'] == 'TVShow': - p = ExternalPage.objects.filter(id_type=w['id_type'], id_value=w['id_value']).first() + p = ExternalResource.objects.filter(id_type=w['id_type'], id_value=w['id_value']).first() if p and p.item and self.show != p.item: self.show = p.item diff --git a/catalog/tv/tests.py b/catalog/tv/tests.py index 3f7af40a..a25c45aa 100644 --- a/catalog/tv/tests.py +++ b/catalog/tv/tests.py @@ -27,12 +27,12 @@ class TMDBTVTestCase(TestCase): site = SiteList.get_site_by_url(t_url) self.assertEqual(site.ready, False) self.assertEqual(site.id_value, '57243') - site.get_page_ready() + site.get_resource_ready() self.assertEqual(site.ready, True) - self.assertEqual(site.page.metadata['title'], '神秘博士') - self.assertEqual(site.page.item.primary_lookup_id_type, IdType.IMDB) - self.assertEqual(site.page.item.__class__.__name__, 'TVShow') - self.assertEqual(site.page.item.imdb, 'tt0436992') + self.assertEqual(site.resource.metadata['title'], '神秘博士') + self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB) + self.assertEqual(site.resource.item.__class__.__name__, 'TVShow') + self.assertEqual(site.resource.item.imdb, 'tt0436992') class TMDBTVSeasonTestCase(TestCase): @@ -54,21 +54,21 @@ class TMDBTVSeasonTestCase(TestCase): site = SiteList.get_site_by_url(t_url) self.assertEqual(site.ready, False) self.assertEqual(site.id_value, '57243-4') - site.get_page_ready() + site.get_resource_ready() self.assertEqual(site.ready, True) - self.assertEqual(site.page.metadata['title'], '第 4 季') - self.assertEqual(site.page.item.primary_lookup_id_type, IdType.IMDB) - self.assertEqual(site.page.item.__class__.__name__, 'TVSeason') - self.assertEqual(site.page.item.imdb, 'tt1159991') - self.assertIsNotNone(site.page.item.show) - self.assertEqual(site.page.item.show.imdb, 'tt0436992') + self.assertEqual(site.resource.metadata['title'], '第 4 季') + self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB) + self.assertEqual(site.resource.item.__class__.__name__, 'TVSeason') + self.assertEqual(site.resource.item.imdb, 'tt1159991') + self.assertIsNotNone(site.resource.item.show) + self.assertEqual(site.resource.item.show.imdb, 'tt0436992') class DoubanMovieTVTestCase(TestCase): @use_local_response def test_scrape(self): url3 = 'https://movie.douban.com/subject/3627919/' - p3 = SiteList.get_site_by_url(url3).get_page_ready() + p3 = SiteList.get_site_by_url(url3).get_resource_ready() self.assertEqual(p3.item.__class__.__name__, 'TVSeason') self.assertIsNotNone(p3.item.show) self.assertEqual(p3.item.show.imdb, 'tt0436992') @@ -76,7 +76,7 @@ class DoubanMovieTVTestCase(TestCase): @use_local_response def test_scrape_singleseason(self): url3 = 'https://movie.douban.com/subject/26895436/' - p3 = SiteList.get_site_by_url(url3).get_page_ready() + p3 = SiteList.get_site_by_url(url3).get_resource_ready() self.assertEqual(p3.item.__class__.__name__, 'TVShow') @@ -86,9 +86,9 @@ class MultiTVSitesTestCase(TestCase): url1 = 'https://www.themoviedb.org/tv/57243-doctor-who' url2 = 'https://www.imdb.com/title/tt0436992/' # url3 = 'https://movie.douban.com/subject/3541415/' - p1 = SiteList.get_site_by_url(url1).get_page_ready() - p2 = SiteList.get_site_by_url(url2).get_page_ready() - # p3 = SiteList.get_site_by_url(url3).get_page_ready() + p1 = SiteList.get_site_by_url(url1).get_resource_ready() + p2 = SiteList.get_site_by_url(url2).get_resource_ready() + # p3 = SiteList.get_site_by_url(url3).get_resource_ready() self.assertEqual(p1.item.id, p2.item.id) # self.assertEqual(p2.item.id, p3.item.id) @@ -97,9 +97,9 @@ class MultiTVSitesTestCase(TestCase): url1 = 'https://www.themoviedb.org/tv/57243-doctor-who/season/4' url2 = 'https://www.imdb.com/title/tt1159991/' url3 = 'https://movie.douban.com/subject/3627919/' - p1 = SiteList.get_site_by_url(url1).get_page_ready() - p2 = SiteList.get_site_by_url(url2).get_page_ready() - p3 = SiteList.get_site_by_url(url3).get_page_ready() + p1 = SiteList.get_site_by_url(url1).get_resource_ready() + p2 = SiteList.get_site_by_url(url2).get_resource_ready() + p3 = SiteList.get_site_by_url(url3).get_resource_ready() self.assertEqual(p1.item.imdb, p2.item.imdb) self.assertEqual(p2.item.imdb, p3.item.imdb) self.assertEqual(p1.item.id, p2.item.id) @@ -109,8 +109,8 @@ class MultiTVSitesTestCase(TestCase): def test_miniseries(self): url1 = 'https://www.themoviedb.org/tv/86941-the-north-water' url3 = 'https://movie.douban.com/subject/26895436/' - p1 = SiteList.get_site_by_url(url1).get_page_ready() - p3 = SiteList.get_site_by_url(url3).get_page_ready() + p1 = SiteList.get_site_by_url(url1).get_resource_ready() + p3 = SiteList.get_site_by_url(url3).get_resource_ready() self.assertEqual(p3.item.__class__.__name__, 'TVShow') self.assertEqual(p1.item.id, p3.item.id) @@ -119,9 +119,9 @@ class MultiTVSitesTestCase(TestCase): url1 = 'https://www.themoviedb.org/movie/282758-doctor-who-the-runaway-bride' url2 = 'hhttps://www.imdb.com/title/tt0827573/' url3 = 'https://movie.douban.com/subject/4296866/' - p1 = SiteList.get_site_by_url(url1).get_page_ready() - p2 = SiteList.get_site_by_url(url2).get_page_ready() - p3 = SiteList.get_site_by_url(url3).get_page_ready() + p1 = SiteList.get_site_by_url(url1).get_resource_ready() + p2 = SiteList.get_site_by_url(url2).get_resource_ready() + p3 = SiteList.get_site_by_url(url3).get_resource_ready() self.assertEqual(p1.item.imdb, p2.item.imdb) self.assertEqual(p2.item.imdb, p3.item.imdb) self.assertEqual(p1.item.id, p2.item.id)