from catalog.common import * from catalog.models import * import re import logging _logger = logging.getLogger(__name__) @SiteList.register class GoogleBooks(AbstractSite): ID_TYPE = IdType.GoogleBooks URL_PATTERNS = [ r"https://books\.google\.co[^/]+/books\?id=([^&#]+)", r"https://www\.google\.co[^/]+/books/edition/[^/]+/([^&#?]+)", r"https://books\.google\.co[^/]+/books/about/[^?]+?id=([^&#?]+)", ] WIKI_PROPERTY_ID = '' DEFAULT_MODEL = Edition @classmethod def id_to_url(self, id_value): return "https://books.google.com/books?id=" + id_value def scrape(self): api_url = f'https://www.googleapis.com/books/v1/volumes/{self.id_value}' b = BasicDownloader(api_url).download().json() other = {} title = b['volumeInfo']['title'] subtitle = b['volumeInfo']['subtitle'] if 'subtitle' in b['volumeInfo'] else None pub_year = None pub_month = None if 'publishedDate' in b['volumeInfo']: pub_date = b['volumeInfo']['publishedDate'].split('-') pub_year = pub_date[0] pub_month = pub_date[1] if len(pub_date) > 1 else None pub_house = b['volumeInfo']['publisher'] if 'publisher' in b['volumeInfo'] else None language = b['volumeInfo']['language'] if 'language' in b['volumeInfo'] else None pages = b['volumeInfo']['pageCount'] if 'pageCount' in b['volumeInfo'] else None if 'mainCategory' in b['volumeInfo']: other['分类'] = b['volumeInfo']['mainCategory'] authors = b['volumeInfo']['authors'] if 'authors' in b['volumeInfo'] else None if 'description' in b['volumeInfo']: brief = b['volumeInfo']['description'] elif 'textSnippet' in b['volumeInfo']: brief = b["volumeInfo"]["textSnippet"]["searchInfo"] else: brief = '' brief = re.sub(r'<.*?>', '', brief.replace('<br', '\n<br')) img_url = b['volumeInfo']['imageLinks']['thumbnail'] if 'imageLinks' in b['volumeInfo'] else None isbn10 = None isbn13 = None for iid in b['volumeInfo']['industryIdentifiers'] if 'industryIdentifiers' in b['volumeInfo'] else []: if iid['type'] == 'ISBN_10': isbn10 = iid['identifier'] if iid['type'] == 'ISBN_13': isbn13 = iid['identifier'] isbn = isbn13 # if isbn13 is not None else isbn10 raw_img, ext = BasicImageDownloader.download_image(img_url, self.url) data = { 'title': title, 'subtitle': subtitle, 'orig_title': None, 'author': authors, 'translator': None, 'language': language, 'pub_house': pub_house, 'pub_year': pub_year, 'pub_month': pub_month, 'binding': None, 'pages': pages, 'isbn': isbn, 'brief': brief, 'contents': None, 'other_info': other, 'cover_image_url': img_url, } return ResourceContent(metadata=data, cover_image=raw_img, cover_image_extention=ext, lookup_ids={IdType.ISBN: isbn13})