lib.itmens/catalog/sites/google_books.py

114 lines
4.2 KiB
Python
Raw Normal View History

2022-12-09 02:35:21 +00:00
import logging
import re
2022-12-09 02:35:21 +00:00
from catalog.common import *
from catalog.models import *
2022-12-09 02:35:21 +00:00
# Module-level logger, named after this module via ``__name__``.
_logger = logging.getLogger(__name__)
2022-12-15 17:29:35 -05:00
@SiteManager.register
class GoogleBooks(AbstractSite):
    """Site adapter that turns a Google Books volume into edition metadata.

    ``URL_PATTERNS`` lists the supported Google Books URL shapes; each pattern
    captures the volume id. ``scrape`` reads the volume from the Google Books
    volumes API and maps it onto the metadata dict returned in a
    ``ResourceContent``.
    """

    SITE_NAME = SiteName.GoogleBooks
    ID_TYPE = IdType.GoogleBooks
    # Group 1 of every pattern is the Google Books volume id.
    URL_PATTERNS = [
        r"https://books\.google\.co[^/]+/books\?id=([^&#]+)",
        r"https://www\.google\.co[^/]+/books/edition/[^/]+/([^&#?]+)",
        r"https://books\.google\.co[^/]+/books/about/[^?]+\?id=([^&#?]+)",
    ]
    WIKI_PROPERTY_ID = ""
    DEFAULT_MODEL = Edition

    @classmethod
    def id_to_url(cls, id_value):
        """Return the canonical books.google.com URL for volume *id_value*."""
        return "https://books.google.com/books?id=" + id_value

    @staticmethod
    def _split_published_date(published_date):
        """Split a ``YYYY[-MM[-DD]]`` string into ``(year, month)`` strings.

        Either element is ``None`` when the corresponding part is absent.
        """
        if not published_date:
            return None, None
        parts = published_date.split("-")
        return parts[0], (parts[1] if len(parts) > 1 else None)

    @staticmethod
    def _extract_brief(volume):
        """Return the volume description as plain text, or ``""`` if none."""
        text = volume["volumeInfo"].get("description")
        if not text:
            # The API exposes the snippet at the top level of the volume
            # (searchInfo.textSnippet), not inside volumeInfo.
            text = (volume.get("searchInfo") or {}).get("textSnippet") or ""
        # Turn <br> tags into line breaks, then strip every remaining tag.
        return re.sub(r"<.*?>", "", text.replace("<br", "\n<br"))

    @staticmethod
    def _pick_cover_url(image_links):
        """Return the largest cover URL available in *image_links*, else ``None``."""
        size_preference = (
            "extraLarge",
            "large",
            "medium",
            "small",
            "thumbnail",
            "smallThumbnail",
        )
        for size in size_preference:
            url = image_links.get(size)
            if url:
                return url
        return None

    @staticmethod
    def _find_isbn13(identifiers):
        """Return the ISBN-13 found in *identifiers*, else ``None``.

        When several ISBN-13 entries are present the last one wins.
        """
        isbn13 = None
        for entry in identifiers:
            if entry.get("type") == "ISBN_13":
                isbn13 = entry.get("identifier")
        return isbn13

    def scrape(self):
        """Fetch this volume from the Google Books API and build its metadata.

        Returns:
            A ``ResourceContent`` carrying the metadata dict, the downloaded
            cover image with its extension (both ``None`` when the volume has
            no image link), and the ISBN-13 as a lookup id.

        Raises:
            KeyError: If the API response lacks ``volumeInfo`` or a title.
        """
        api_url = f"https://www.googleapis.com/books/v1/volumes/{self.id_value}"
        volume = BasicDownloader(api_url).download().json()
        info = volume["volumeInfo"]

        title = info["title"]
        subtitle = info.get("subtitle")
        pub_year, pub_month = self._split_published_date(info.get("publishedDate"))
        # NOTE(review): a present language is a lowercased str while a missing
        # one is []; kept unchanged because the consumers of this metadata are
        # not visible here -- confirm which shape they expect.
        language = info["language"].lower() if "language" in info else []

        other = {}
        if "mainCategory" in info:
            other["分类"] = info["mainCategory"]

        brief = self._extract_brief(volume)
        isbn13 = self._find_isbn13(info.get("industryIdentifiers") or [])

        img_url = self._pick_cover_url(info.get("imageLinks") or {})
        # Only hit the network when there is actually a cover to download.
        if img_url:
            raw_img, ext = BasicImageDownloader.download_image(
                img_url, None, headers={}
            )
        else:
            raw_img, ext = None, None

        data = {
            "title": title,
            "localized_title": [{"lang": language, "text": title}],
            "subtitle": subtitle,
            "localized_subtitle": (
                [{"lang": language, "text": subtitle}] if subtitle else []
            ),
            "orig_title": None,
            "author": info.get("authors"),
            "translator": None,
            "language": language,
            "pub_house": info.get("publisher"),
            "pub_year": pub_year,
            "pub_month": pub_month,
            "binding": None,
            "pages": info.get("pageCount"),
            # Only ISBN-13 is stored; ISBN-10 is intentionally not used.
            "isbn": isbn13,
            # The description is exposed through localized_description only.
            "localized_description": (
                [{"lang": language, "text": brief}] if brief else []
            ),
            "contents": None,
            "other_info": other,
            "cover_image_url": img_url,
        }
        return ResourceContent(
            metadata=data,
            cover_image=raw_img,
            cover_image_extention=ext,
            lookup_ids={IdType.ISBN: isbn13},
        )