lib.itmens/catalog/sites/google_books.py

81 lines
3.2 KiB
Python
Raw Normal View History

2022-12-09 02:35:21 +00:00
from catalog.common import *
from catalog.models import *
import re
import logging
_logger = logging.getLogger(__name__)
2022-12-15 17:29:35 -05:00
@SiteManager.register
class GoogleBooks(AbstractSite):
    """Site adapter that builds Edition metadata from the Google Books volumes API."""

    SITE_NAME = SiteName.GoogleBooks
    ID_TYPE = IdType.GoogleBooks
    # Each pattern captures the volume id in group 1.
    # The host part is `google\.[^/]+` so that country domains without a
    # "co" prefix (e.g. books.google.de) are recognised as well as
    # google.com / google.co.uk.
    URL_PATTERNS = [
        r"https://books\.google\.[^/]+/books\?id=([^&#]+)",
        r"https://www\.google\.[^/]+/books/edition/[^/]+/([^&#?]+)",
        # The `?` that starts the query string must be escaped; unescaped it
        # is a lazy quantifier and the pattern never matches ".../about/X?id=".
        r"https://books\.google\.[^/]+/books/about/[^?]+\?id=([^&#?]+)",
    ]
    WIKI_PROPERTY_ID = ''
    DEFAULT_MODEL = Edition

    @classmethod
    def id_to_url(cls, id_value):
        """Return the canonical Google Books page URL for *id_value*."""
        return "https://books.google.com/books?id=" + id_value

    def scrape(self):
        """Fetch the volume from the Google Books API and map it to metadata.

        Returns:
            A ResourceContent holding the metadata dict, the downloaded cover
            image with its extension, and the ISBN-13 (or None) as lookup id.

        Raises:
            KeyError: if the response has no ``volumeInfo`` or no ``title``.
        """
        api_url = f'https://www.googleapis.com/books/v1/volumes/{self.id_value}'
        b = BasicDownloader(api_url).download().json()
        info = b['volumeInfo']

        other = {}
        if 'mainCategory' in info:
            other['分类'] = info['mainCategory']

        # publishedDate is split on "-"; the first part is taken as the year
        # and the second, when present, as the month. Both stay strings.
        pub_year = None
        pub_month = None
        if 'publishedDate' in info:
            pub_date = info['publishedDate'].split('-')
            pub_year = pub_date[0]
            pub_month = pub_date[1] if len(pub_date) > 1 else None

        # Prefer the full description; otherwise fall back to the search
        # snippet. The volume resource documents the snippet at top-level
        # `searchInfo.textSnippet`; a plain string under
        # `volumeInfo.textSnippet` is accepted too in case a response has it.
        brief = info.get('description')
        if not brief:
            snippet = info.get('textSnippet')
            if not isinstance(snippet, str):
                snippet = (b.get('searchInfo') or {}).get('textSnippet')
            brief = snippet or ''
        # Turn <br> into newlines, then strip all remaining HTML tags.
        brief = re.sub(r'<.*?>', '', brief.replace('<br', '\n<br'))

        # imageLinks may be absent, or present without a thumbnail entry.
        img_url = (info.get('imageLinks') or {}).get('thumbnail')

        # Only ISBN-13 is used; ISBN-10 identifiers are intentionally ignored.
        # If several ISBN-13 entries exist, the last one wins.
        isbn13 = None
        for iid in info.get('industryIdentifiers') or []:
            if iid.get('type') == 'ISBN_13':
                isbn13 = iid.get('identifier')

        raw_img, ext = BasicImageDownloader.download_image(img_url, self.url)
        data = {
            'title': info['title'],
            'subtitle': info.get('subtitle'),
            'orig_title': None,
            'author': info.get('authors'),
            'translator': None,
            'language': info.get('language'),
            'pub_house': info.get('publisher'),
            'pub_year': pub_year,
            'pub_month': pub_month,
            'binding': None,
            'pages': info.get('pageCount'),
            'isbn': isbn13,
            'brief': brief,
            'contents': None,
            'other_info': other,
            'cover_image_url': img_url,
        }
        return ResourceContent(
            metadata=data,
            cover_image=raw_img,
            cover_image_extention=ext,
            lookup_ids={IdType.ISBN: isbn13},
        )