# lib.itmens/catalog/sites/google_books.py
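"""Site adapter for Google Books.

Resolves volume ids from books.google.com URLs and scrapes volume metadata
(title, authors, ISBN, cover, description) from the Google Books API.
"""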


import re
from urllib.parse import quote_plus

import httpx
from django.conf import settings
from loguru import logger

from catalog.book.utils import isbn_10_to_13
from catalog.common import *
from catalog.models import *


@SiteManager.register
class GoogleBooks(AbstractSite):
    SITE_NAME = SiteName.GoogleBooks
    ID_TYPE = IdType.GoogleBooks
    URL_PATTERNS = [
        r"https://books\.google\.[^/]+/books\?id=([^&#]+)",
        r"https://www\.google\.[^/]+/books/edition/[^/]+/([^&#?]+)",
        r"https://books\.google\.[^/]+/books/about/[^?]+\?id=([^&#?]+)",
    ]
    WIKI_PROPERTY_ID = ""
    DEFAULT_MODEL = Edition

    @classmethod
    def id_to_url(cls, id_value):
        return "https://books.google.com/books?id=" + id_value

    def scrape(self):
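        """Fetch volume metadata from the Google Books API and map it to Edition fields."""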
        api_url = f"https://www.googleapis.com/books/v1/volumes/{self.id_value}"
        if settings.GOOGLE_API_KEY:
            api_url += f"?key={settings.GOOGLE_API_KEY}"
        b = BasicDownloader(api_url).download().json()
        other = {}
        title = b["volumeInfo"]["title"]
        subtitle = b["volumeInfo"].get("subtitle")
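        # publishedDate may be "YYYY", "YYYY-MM" or "YYYY-MM-DD"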
        pub_year = None
        pub_month = None
        if "publishedDate" in b["volumeInfo"]:
            pub_date = b["volumeInfo"]["publishedDate"].split("-")
            pub_year = pub_date[0]
            pub_month = pub_date[1] if len(pub_date) > 1 else None
        pub_house = b["volumeInfo"].get("publisher")
        # language is an ISO 639-1 code; default to "" so the value is always a string
        language = (
            b["volumeInfo"]["language"].lower()
            if "language" in b["volumeInfo"]
            else ""
        )
        pages = b["volumeInfo"].get("pageCount")
        if "mainCategory" in b["volumeInfo"]:
            other["分类"] = b["volumeInfo"]["mainCategory"]  # "分类" = "category"
        authors = b["volumeInfo"].get("authors")
        if "description" in b["volumeInfo"]:
            brief = b["volumeInfo"]["description"]
        elif "textSnippet" in b.get("searchInfo", {}):
            # the API returns the snippet under searchInfo, not volumeInfo
            brief = b["searchInfo"]["textSnippet"]
        else:
            brief = ""
        # turn <br> into newlines, then strip the remaining HTML tags
        brief = re.sub(r"<.*?>", "", brief.replace("<br", "\n<br"))
        # prefer the largest available cover image
        img_url = None
        if "imageLinks" in b["volumeInfo"]:
            if "extraLarge" in b["volumeInfo"]["imageLinks"]:
                img_url = b["volumeInfo"]["imageLinks"]["extraLarge"]
            elif "large" in b["volumeInfo"]["imageLinks"]:
                img_url = b["volumeInfo"]["imageLinks"]["large"]
            elif "thumbnail" in b["volumeInfo"]["imageLinks"]:
                img_url = b["volumeInfo"]["imageLinks"]["thumbnail"]
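        # industryIdentifiers may carry ISBN_10 and/or ISBN_13 entries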
        isbn10 = None
        isbn13 = None
        for iid in b["volumeInfo"].get("industryIdentifiers", []):
            if iid["type"] == "ISBN_10":
                isbn10 = iid["identifier"]
            if iid["type"] == "ISBN_13":
                isbn13 = iid["identifier"]
        # prefer the ISBN-13 from the API; otherwise derive one from the ISBN-10
        isbn = isbn13 if isbn13 is not None else isbn_10_to_13(isbn10)
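        # fetch the cover image bytes so they can be attached to the ResourceContent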
        raw_img, ext = BasicImageDownloader.download_image(img_url, None, headers={})
        data = {
            "title": title,
            "localized_title": [{"lang": language, "text": title}],
            "subtitle": subtitle,
            "localized_subtitle": (
                [{"lang": language, "text": subtitle}] if subtitle else []
            ),
            "orig_title": None,
            "author": authors,
            "translator": None,
            "language": language,
            "pub_house": pub_house,
            "pub_year": pub_year,
            "pub_month": pub_month,
            "binding": None,
            "pages": pages,
            "isbn": isbn,
            "localized_description": (
                [{"lang": language, "text": brief}] if brief else []
            ),
            "contents": None,
            "other_info": other,
            "cover_image_url": img_url,
        }
        return ResourceContent(
            metadata=data,
            cover_image=raw_img,
            cover_image_extention=ext,
            lookup_ids={IdType.ISBN: isbn},
        )

    @classmethod
    async def search_task(
        cls, q: str, page: int, category: str, page_size: int
    ) -> list[ExternalSearchResultItem]:
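        """Query the Google Books volumes API and adapt matches for external search."""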
        if category not in ["all", "book"]:
            return []
        results = []
        api_url = (
            f"https://www.googleapis.com/books/v1/volumes?country=us"
            f"&q={quote_plus(q)}&startIndex={page_size * (page - 1)}"
            f"&maxResults={page_size}&maxAllowedMaturityRating=MATURE"
        )
        async with httpx.AsyncClient() as client:
            try:
                response = await client.get(api_url, timeout=2)
                j = response.json()
                for b in j.get("items", []):
                    if "title" not in b["volumeInfo"]:
                        continue
                    title = b["volumeInfo"]["title"]
                    subtitle = ""
                    if "publishedDate" in b["volumeInfo"]:
                        subtitle += b["volumeInfo"]["publishedDate"] + " "
                    if "authors" in b["volumeInfo"]:
                        subtitle += ", ".join(b["volumeInfo"]["authors"])
                    if "description" in b["volumeInfo"]:
                        brief = b["volumeInfo"]["description"]
                    elif "textSnippet" in b.get("searchInfo", {}):
                        # the snippet lives under searchInfo in the API response
                        brief = b["searchInfo"]["textSnippet"]
                    else:
                        brief = ""
                    url = "https://books.google.com/books?id=" + b["id"]
                    cover = b["volumeInfo"].get("imageLinks", {}).get("thumbnail", "")
                    results.append(
                        ExternalSearchResultItem(
                            ItemCategory.Book,
                            SiteName.GoogleBooks,
                            url,
                            title,
                            subtitle,
                            brief,
                            cover,
                        )
                    )
            except httpx.ReadTimeout:
                logger.warning("GoogleBooks search timeout", extra={"query": q})
            except Exception as e:
                logger.error(
                    "GoogleBooks search error", extra={"query": q, "exception": e}
                )
        return results
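
# A minimal usage sketch (assumes the catalog app is configured; the volume id
# below is the sample id used in the Google Books API docs):
#
#     site = SiteManager.get_site_by_url("https://books.google.com/books?id=zyTCAlFPjgYC")
#     resource = site.get_resource_ready()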