scrape GoogleBooks

This commit is contained in:
Your Name 2021-10-19 00:52:10 -04:00
parent 3398e050e1
commit 7d548cdead
4 changed files with 88 additions and 3 deletions

View file

@ -1750,3 +1750,78 @@ class TmdbMovieScraper(AbstractScraper):
# https://developers.google.com/youtube/v3/docs/?apix=true
# https://developers.google.com/books/docs/v1/using
class GoogleBooksScraper(AbstractScraper):
site_name = SourceSiteEnum.GOOGLEBOOKS.value
host = "books.google.com"
data_class = Book
form_class = BookForm
regex = re.compile(r"https://books\.google\.com/books\?id=(\w+)")
@classmethod
def get_effective_url(cls, raw_url):
u = re.match(r"https://books\.google\.com/books\?id=\w+", raw_url)
return u[0] if u else None
def scrape(self, url, response=None):
m = self.regex.match(url)
if m:
api_url = f'https://www.googleapis.com/books/v1/volumes/{m[1]}'
else:
raise ValueError("not valid url")
b = requests.get(api_url).json()
other = {}
title = b['volumeInfo']['title']
subtitle = b['volumeInfo']['subtitle'] if 'subtitle' in b['volumeInfo'] else None
pub_year = None
pub_month = None
if 'publishedDate' in b['volumeInfo']:
pub_date = b['volumeInfo']['publishedDate']
pub_year = re.sub(r'(\d\d\d\d).+', r'\1', pub_date)
pub_month = re.sub(r'(\d\d\d\d)-(\d+).+', r'\2', pub_date) if len(pub_date) > 5 else None
pub_house = b['volumeInfo']['publisher'] if 'publisher' in b['volumeInfo'] else None
language = b['volumeInfo']['language'] if 'language' in b['volumeInfo'] else None
pages = b['volumeInfo']['pageCount'] if 'pageCount' in b['volumeInfo'] else None
if 'mainCategory' in b['volumeInfo']:
other['分类'] = b['volumeInfo']['mainCategory']
authors = b['volumeInfo']['authors']
if 'description' in b['volumeInfo']:
brief = b['volumeInfo']['description']
elif 'textSnippet' in b['volumeInfo']:
brief = b["volumeInfo"]["textSnippet"]["searchInfo"]
else:
brief = ''
brief = re.sub(r'<.*?>', '', brief.replace('<br', '\n<br'))
img_url = b['volumeInfo']['imageLinks']['thumbnail'] if 'imageLinks' in b['volumeInfo'] else None
isbn10 = None
isbn13 = None
for iid in b['volumeInfo']['industryIdentifiers'] if 'industryIdentifiers' in b['volumeInfo'] else []:
if iid['type'] == 'ISBN_10':
isbn10 = iid['identifier']
if iid['type'] == 'ISBN_13':
isbn13 = iid['identifier']
isbn = isbn13 if isbn13 is not None else isbn10
data = {
'title': title,
'subtitle': subtitle,
'orig_title': None,
'author': authors,
'translator': None,
'language': language,
'pub_house': pub_house,
'pub_year': pub_year,
'pub_month': pub_month,
'binding': None,
'pages': pages,
'isbn': isbn,
'brief': brief,
'contents': None,
'other_info': other,
'cover_url': img_url,
'source_site': self.site_name,
'source_url': self.get_effective_url(url),
}
raw_img, ext = self.download_image(img_url, url)
self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
return data, raw_img

View file

@ -1130,13 +1130,19 @@ select::placeholder {
}
.source-label.source-label__tmdb {
background: linear-gradient(30deg, #91CCA3, #1FB4E2);
background: linear-gradient(90deg, #91CCA3, #1FB4E2);
color: white;
border: none;
font-weight: 600;
font-weight: lighter;
padding-top: 2px;
}
.source-label.source-label__googlebooks {
color: white;
background-color: #4285F4;
border-color: #4285F4;
}
.main-section-wrapper {
padding: 32px 48px 32px 36px;
background-color: #f7f7f7;

File diff suppressed because one or more lines are too long

View file

@ -75,3 +75,7 @@ $tmdb-color-secondary: #1FB4E2
border: none
font-weight: lighter
padding-top: 2px
&.source-label__googlebooks
color: white
background-color: #4285F4
border-color: #4285F4