import requests import re import filetype from lxml import html from common.models import SourceSiteEnum from movies.models import Movie, MovieGenreEnum from movies.forms import MovieForm from books.models import Book from books.forms import BookForm from music.models import Album, Song from music.forms import AlbumForm, SongForm from games.models import Game from games.forms import GameForm from django.conf import settings from PIL import Image from io import BytesIO from common.scraper import * # https://developers.google.com/youtube/v3/docs/?apix=true # https://developers.google.com/books/docs/v1/using class GoogleBooksScraper(AbstractScraper): site_name = SourceSiteEnum.GOOGLEBOOKS.value host = ["books.google.com", "www.google.com/books"] data_class = Book form_class = BookForm regex = re.compile(r"https://books\.google\.com/books\?id=([^&#]+)") @classmethod def get_effective_url(cls, raw_url): # https://books.google.com/books?id=wUHxzgEACAAJ # https://books.google.com/books/about/%E7%8F%BE%E5%A0%B4%E6%AD%B7%E5%8F%B2.html?id=nvNoAAAAIAAJ # https://www.google.com/books/edition/_/nvNoAAAAIAAJ?hl=en&gbpv=1 u = re.match(r"https://books\.google\.com/books.*id=([^&#]+)", raw_url) if not u: u = re.match(r"https://www\.google\.com/books/edition/[^/]+/([^&#?]+)", raw_url) return 'https://books.google.com/books?id=' + u[1] if u else None def scrape(self, url, response=None): url = self.get_effective_url(url) m = self.regex.match(url) if m: api_url = f'https://www.googleapis.com/books/v1/volumes/{m[1]}' else: raise ValueError("not valid url") b = requests.get(api_url).json() other = {} title = b['volumeInfo']['title'] subtitle = b['volumeInfo']['subtitle'] if 'subtitle' in b['volumeInfo'] else None pub_year = None pub_month = None if 'publishedDate' in b['volumeInfo']: pub_date = b['volumeInfo']['publishedDate'].split('-') pub_year = pub_date[0] pub_month = pub_date[1] if len(pub_date) > 1 else None pub_house = b['volumeInfo']['publisher'] if 'publisher' in b['volumeInfo'] else None language = b['volumeInfo']['language'] if 'language' in b['volumeInfo'] else None pages = b['volumeInfo']['pageCount'] if 'pageCount' in b['volumeInfo'] else None if 'mainCategory' in b['volumeInfo']: other['分类'] = b['volumeInfo']['mainCategory'] authors = b['volumeInfo']['authors'] if 'authors' in b['volumeInfo'] else None if 'description' in b['volumeInfo']: brief = b['volumeInfo']['description'] elif 'textSnippet' in b['volumeInfo']: brief = b["volumeInfo"]["textSnippet"]["searchInfo"] else: brief = '' brief = re.sub(r'<.*?>', '', brief.replace('