From bfd68e6a3fc1527589bcb33159daf14a9440c21b Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 9 Dec 2022 02:35:21 +0000 Subject: [PATCH] new data model: googlebooks --- catalog/book/tests.py | 32 ++++++++ catalog/common/models.py | 2 +- catalog/common/sites.py | 2 +- catalog/sites/__init__.py | 1 + catalog/sites/google_books.py | 79 +++++++++++++++++++ ...ogleapis_com_books_v1_volumes_hV__zQEACAAJ | 75 ++++++++++++++++++ 6 files changed, 189 insertions(+), 2 deletions(-) create mode 100644 catalog/sites/google_books.py create mode 100644 test_data/https___www_googleapis_com_books_v1_volumes_hV__zQEACAAJ diff --git a/catalog/book/tests.py b/catalog/book/tests.py index 279956e6..51e55353 100644 --- a/catalog/book/tests.py +++ b/catalog/book/tests.py @@ -116,6 +116,35 @@ class GoodreadsTestCase(TestCase): self.assertEqual(w1, w2) +class GoogleBooksTestCase(TestCase): + def test_parse(self): + t_type = IdType.GoogleBooks + t_id = 'hV--zQEACAAJ' + t_url = 'https://books.google.com.bn/books?id=hV--zQEACAAJ&hl=ms' + t_url2 = 'https://books.google.com/books?id=hV--zQEACAAJ' + p1 = SiteList.get_site_by_url(t_url) + p2 = SiteList.get_site_by_url(t_url2) + self.assertIsNotNone(p1) + self.assertEqual(p1.url, t_url2) + self.assertEqual(p1.ID_TYPE, t_type) + self.assertEqual(p1.id_value, t_id) + self.assertEqual(p2.url, t_url2) + + @use_local_response + def test_scrape(self): + t_url = 'https://books.google.com.bn/books?id=hV--zQEACAAJ' + site = SiteList.get_site_by_url(t_url) + self.assertEqual(site.ready, False) + site.get_resource_ready() + self.assertEqual(site.ready, True) + self.assertEqual(site.resource.metadata.get('title'), '1984 Nineteen Eighty-Four') + self.assertEqual(site.resource.metadata.get('isbn'), '9781847498571') + self.assertEqual(site.resource.id_type, IdType.GoogleBooks) + self.assertEqual(site.resource.id_value, 'hV--zQEACAAJ') + self.assertEqual(site.resource.item.isbn, '9781847498571') + self.assertEqual(site.resource.item.title, '1984 Nineteen Eighty-Four') + + class DoubanBookTestCase(TestCase): def setUp(self): pass @@ -170,9 +199,12 @@ class MultiBookSitesTestCase(TestCase): # isbn = '9781847498571' url1 = 'https://www.goodreads.com/book/show/56821625-1984' url2 = 'https://book.douban.com/subject/35902899/' + url3 = 'https://books.google.com/books?id=hV--zQEACAAJ' p1 = SiteList.get_site_by_url(url1).get_resource_ready() p2 = SiteList.get_site_by_url(url2).get_resource_ready() + p3 = SiteList.get_site_by_url(url3).get_resource_ready() self.assertEqual(p1.item.id, p2.item.id) + self.assertEqual(p2.item.id, p3.item.id) @use_local_response def test_works(self): diff --git a/catalog/common/models.py b/catalog/common/models.py index 691d317a..7537ee41 100644 --- a/catalog/common/models.py +++ b/catalog/common/models.py @@ -27,7 +27,7 @@ class IdType(models.TextChoices): TMDB_Movie = 'tmdb_movie', _('TMDB电影') Goodreads = 'goodreads', _('Goodreads') Goodreads_Work = 'goodreads_work', _('Goodreads著作') - GoogleBook = 'googlebook', _('谷歌图书') + GoogleBooks = 'googlebooks', _('谷歌图书') DoubanBook = 'doubanbook', _('豆瓣读书') DoubanBook_Work = 'doubanbook_work', _('豆瓣读书著作') DoubanMovie = 'doubanmovie', _('豆瓣电影') diff --git a/catalog/common/sites.py b/catalog/common/sites.py index 8c959158..d23db01e 100644 --- a/catalog/common/sites.py +++ b/catalog/common/sites.py @@ -20,7 +20,7 @@ _logger = logging.getLogger(__name__) class ResourceContent: lookup_ids: dict = field(default_factory=dict) metadata: dict = field(default_factory=dict) - cover_image = None + cover_image: bytes = None cover_image_extention: str = None diff --git a/catalog/sites/__init__.py b/catalog/sites/__init__.py index 01717bde..6f6dc2f7 100644 --- a/catalog/sites/__init__.py +++ b/catalog/sites/__init__.py @@ -6,6 +6,7 @@ from .douban_music import DoubanMusic from .douban_game import DoubanGame from .douban_drama import DoubanDrama from .goodreads import Goodreads +from .google_books import GoogleBooks from .tmdb import TMDB_Movie from .imdb import IMDB from .spotify import Spotify diff --git a/catalog/sites/google_books.py b/catalog/sites/google_books.py new file mode 100644 index 00000000..0554d3d8 --- /dev/null +++ b/catalog/sites/google_books.py @@ -0,0 +1,79 @@ +from catalog.common import * +from catalog.models import * +import re +import logging + + +_logger = logging.getLogger(__name__) + + +@SiteList.register +class GoogleBooks(AbstractSite): + ID_TYPE = IdType.GoogleBooks + URL_PATTERNS = [ + r"https://books\.google\.co[^/]+/books\?id=([^&#]+)", + r"https://www\.google\.co[^/]+/books/edition/[^/]+/([^&#?]+)", + r"https://books\.google\.co[^/]+/books/about/[^?]+?id=([^&#?]+)", + ] + WIKI_PROPERTY_ID = '' + DEFAULT_MODEL = Edition + + @classmethod + def id_to_url(self, id_value): + return "https://books.google.com/books?id=" + id_value + + def scrape(self): + api_url = f'https://www.googleapis.com/books/v1/volumes/{self.id_value}' + b = BasicDownloader(api_url).download().json() + other = {} + title = b['volumeInfo']['title'] + subtitle = b['volumeInfo']['subtitle'] if 'subtitle' in b['volumeInfo'] else None + pub_year = None + pub_month = None + if 'publishedDate' in b['volumeInfo']: + pub_date = b['volumeInfo']['publishedDate'].split('-') + pub_year = pub_date[0] + pub_month = pub_date[1] if len(pub_date) > 1 else None + pub_house = b['volumeInfo']['publisher'] if 'publisher' in b['volumeInfo'] else None + language = b['volumeInfo']['language'] if 'language' in b['volumeInfo'] else None + pages = b['volumeInfo']['pageCount'] if 'pageCount' in b['volumeInfo'] else None + if 'mainCategory' in b['volumeInfo']: + other['分类'] = b['volumeInfo']['mainCategory'] + authors = b['volumeInfo']['authors'] if 'authors' in b['volumeInfo'] else None + if 'description' in b['volumeInfo']: + brief = b['volumeInfo']['description'] + elif 'textSnippet' in b['volumeInfo']: + brief = b["volumeInfo"]["textSnippet"]["searchInfo"] + else: + brief = '' + brief = re.sub(r'<.*?>', '', brief.replace('