2023-08-10 11:27:31 -04:00
|
|
|
import re
|
2025-01-18 15:53:06 -05:00
|
|
|
from urllib.parse import quote_plus
|
2022-12-09 02:35:21 +00:00
|
|
|
|
2025-01-18 15:53:06 -05:00
|
|
|
import httpx
|
2024-10-15 00:44:44 -04:00
|
|
|
from django.conf import settings
|
2025-01-18 15:53:06 -05:00
|
|
|
from loguru import logger
|
2024-10-15 00:44:44 -04:00
|
|
|
|
|
|
|
from catalog.book.utils import isbn_10_to_13
|
2023-08-10 11:27:31 -04:00
|
|
|
from catalog.common import *
|
|
|
|
from catalog.models import *
|
2022-12-09 02:35:21 +00:00
|
|
|
|
|
|
|
|
2022-12-15 17:29:35 -05:00
|
|
|
@SiteManager.register
class GoogleBooks(AbstractSite):
    """Site adapter for Google Books volumes.

    ``scrape`` reads a single volume from the Google Books v1 REST API and
    maps it onto edition metadata; ``search_task`` runs a keyword search
    against the same API and returns lightweight result items.
    """

    SITE_NAME = SiteName.GoogleBooks
    ID_TYPE = IdType.GoogleBooks
    # Every pattern captures the volume id as group 1.
    URL_PATTERNS = [
        r"https://books\.google\.[^/]+/books\?id=([^&#]+)",
        r"https://www\.google\.[^/]+/books/edition/[^/]+/([^&#?]+)",
        r"https://books\.google\.[^/]+/books/about/[^?]+\?id=([^&#?]+)",
    ]
    WIKI_PROPERTY_ID = ""
    DEFAULT_MODEL = Edition

    @classmethod
    def id_to_url(cls, id_value):
        """Return the canonical books.google.com URL for a volume id."""
        return "https://books.google.com/books?id=" + id_value

    @staticmethod
    def _volume_brief(volume):
        """Return the description of a volume resource, else its search snippet, else "".

        The snippet is read from the volume's top-level ``searchInfo`` object
        (``searchInfo.textSnippet``), not from ``volumeInfo``.
        """
        description = volume.get("volumeInfo", {}).get("description")
        if description:
            return description
        return volume.get("searchInfo", {}).get("textSnippet") or ""

    def scrape(self):
        """Fetch this volume from the Google Books API and build its content.

        Returns:
            ResourceContent carrying the metadata dict, the downloaded cover
            image (if any) and the ISBN lookup id.

        Raises:
            KeyError: if the response has no ``volumeInfo`` or no title.
        """
        api_url = f"https://www.googleapis.com/books/v1/volumes/{self.id_value}"
        if settings.GOOGLE_API_KEY:
            api_url += f"?key={settings.GOOGLE_API_KEY}"
        b = BasicDownloader(api_url).download().json()
        info = b["volumeInfo"]

        title = info["title"]
        subtitle = info.get("subtitle")

        # publishedDate is split on "-"; the first part is taken as the year
        # and the second, when present, as the month.
        pub_year = None
        pub_month = None
        if "publishedDate" in info:
            pub_date = info["publishedDate"].split("-")
            pub_year = pub_date[0]
            pub_month = pub_date[1] if len(pub_date) > 1 else None

        pub_house = info.get("publisher")
        # NOTE(review): a missing language yields [] while a present one
        # yields a str; kept as-is - confirm what downstream expects.
        language = info["language"].lower() if "language" in info else []
        pages = info.get("pageCount")
        authors = info.get("authors")

        other = {}
        if "mainCategory" in info:
            other["分类"] = info["mainCategory"]

        # Put a newline in front of every <br> before stripping all tags so
        # line breaks survive as plain text.
        brief = self._volume_brief(b)
        brief = re.sub(r"<.*?>", "", brief.replace("<br", "\n<br"))

        # Prefer the largest cover rendition that is offered.
        image_links = info.get("imageLinks", {})
        img_url = next(
            (
                image_links[size]
                for size in ("extraLarge", "large", "thumbnail")
                if size in image_links
            ),
            None,
        )

        isbn10 = None
        isbn13 = None
        for iid in info.get("industryIdentifiers", []):
            if iid["type"] == "ISBN_10":
                isbn10 = iid["identifier"]
            if iid["type"] == "ISBN_13":
                isbn13 = iid["identifier"]
        isbn = isbn13 if isbn13 is not None else isbn_10_to_13(isbn10)

        # Nothing to download when the volume exposes no cover link.
        if img_url:
            raw_img, ext = BasicImageDownloader.download_image(
                img_url, None, headers={}
            )
        else:
            raw_img, ext = None, None

        data = {
            "title": title,
            "localized_title": [{"lang": language, "text": title}],
            "subtitle": subtitle,
            "localized_subtitle": (
                [{"lang": language, "text": subtitle}] if subtitle else []
            ),
            "orig_title": None,
            "author": authors,
            "translator": None,
            "language": language,
            "pub_house": pub_house,
            "pub_year": pub_year,
            "pub_month": pub_month,
            "binding": None,
            "pages": pages,
            "isbn": isbn,
            "localized_description": (
                [{"lang": language, "text": brief}] if brief else []
            ),
            "contents": None,
            "other_info": other,
            "cover_image_url": img_url,
        }
        # NOTE(review): only a native ISBN-13 is used for lookup, not one
        # converted from an ISBN-10 - confirm this is intended.
        return ResourceContent(
            metadata=data,
            cover_image=raw_img,
            cover_image_extention=ext,
            lookup_ids={IdType.ISBN: isbn13},
        )

    @classmethod
    async def search_task(
        cls, q: str, page: int, category: str, page_size: int
    ) -> list[ExternalSearchResultItem]:
        """Search Google Books for ``q`` and return one page of results.

        Args:
            q: free-text query; URL-encoded before being sent.
            page: 1-based page number (start index is ``page_size * (page - 1)``).
            category: only ``"all"`` and ``"book"`` are served; anything else
                returns an empty list without a request.
            page_size: number of results requested per page.

        Returns:
            The items collected so far. Timeouts and other errors are logged
            rather than raised, so the list may be empty or partial.
        """
        if category not in ["all", "book"]:
            return []
        results = []
        api_url = (
            "https://www.googleapis.com/books/v1/volumes?country=us"
            f"&q={quote_plus(q)}"
            f"&startIndex={page_size * (page - 1)}"
            f"&maxResults={page_size}"
            "&maxAllowedMaturityRating=MATURE"
        )
        async with httpx.AsyncClient() as client:
            try:
                response = await client.get(api_url, timeout=2)
                j = response.json()
                for b in j.get("items", []):
                    info = b.get("volumeInfo", {})
                    if "title" not in info:
                        continue
                    title = info["title"]
                    subtitle = ""
                    if "publishedDate" in info:
                        subtitle += info["publishedDate"] + " "
                    if "authors" in info:
                        subtitle += ", ".join(info["authors"])
                    brief = cls._volume_brief(b)
                    url = "https://books.google.com/books?id=" + b["id"]
                    # A missing thumbnail must not abort the remaining items.
                    cover = info.get("imageLinks", {}).get("thumbnail", "")
                    results.append(
                        ExternalSearchResultItem(
                            ItemCategory.Book,
                            SiteName.GoogleBooks,
                            url,
                            title,
                            subtitle,
                            brief,
                            cover,
                        )
                    )
            except httpx.ReadTimeout:
                logger.warning("GoogleBooks search timeout", extra={"query": q})
            except Exception as e:
                logger.error(
                    "GoogleBooks search error", extra={"query": q, "exception": e}
                )
        return results
|