lib.itmens/common/scrapers/goodreads.py
2022-11-20 05:32:43 +00:00

157 lines
6.1 KiB
Python

import requests
import re
import filetype
from lxml import html
from common.models import SourceSiteEnum
from movies.models import Movie, MovieGenreEnum
from movies.forms import MovieForm
from books.models import Book
from books.forms import BookForm
from music.models import Album, Song
from music.forms import AlbumForm, SongForm
from games.models import Game
from games.forms import GameForm
from django.conf import settings
from PIL import Image
from io import BytesIO
from common.scraper import *
class GoodreadsScraper(AbstractScraper):
site_name = SourceSiteEnum.GOODREADS.value
host = "www.goodreads.com"
data_class = Book
form_class = BookForm
regex = re.compile(r"https://www\.goodreads\.com/book/show/\d+")
@classmethod
def get_effective_url(cls, raw_url):
u = re.match(r".+/book/show/(\d+)", raw_url)
if not u:
u = re.match(r".+book/(\d+)", raw_url)
return "https://www.goodreads.com/book/show/" + u[1] if u else None
def scrape(self, url, response=None):
"""
This is the scraping portal
"""
if response is not None:
content = html.fromstring(response.content.decode('utf-8'))
else:
headers = None # DEFAULT_REQUEST_HEADERS.copy()
content = self.download_page(url, headers)
try:
title = content.xpath("//h1/text()")[0].strip()
except IndexError:
raise ValueError("given url contains no book info")
subtitle = None
orig_title_elem = content.xpath("//div[@id='bookDataBox']//div[text()='Original Title']/following-sibling::div/text()")
orig_title = orig_title_elem[0].strip() if orig_title_elem else None
language_elem = content.xpath('//div[@itemprop="inLanguage"]/text()')
language = language_elem[0].strip() if language_elem else None
pub_house_elem = content.xpath("//div[contains(text(), 'Published') and @class='row']/text()")
try:
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
r = re.compile('.*Published.*(' + '|'.join(months) + ').*(\\d\\d\\d\\d).+by\\s*(.+)\\s*', re.DOTALL)
pub = r.match(pub_house_elem[0])
pub_year = pub[2]
pub_month = months.index(pub[1]) + 1
pub_house = pub[3].strip()
except Exception:
pub_year = None
pub_month = None
pub_house = None
pub_house_elem = content.xpath("//nobr[contains(text(), 'first published')]/text()")
try:
pub = re.match(r'.*first published\s+(.+\d\d\d\d).*', pub_house_elem[0], re.DOTALL)
first_pub = pub[1]
except Exception:
first_pub = None
binding_elem = content.xpath('//span[@itemprop="bookFormat"]/text()')
binding = binding_elem[0].strip() if binding_elem else None
pages_elem = content.xpath('//span[@itemprop="numberOfPages"]/text()')
pages = pages_elem[0].strip() if pages_elem else None
if pages is not None:
pages = int(RE_NUMBERS.findall(pages)[
0]) if RE_NUMBERS.findall(pages) else None
isbn_elem = content.xpath('//span[@itemprop="isbn"]/text()')
if not isbn_elem:
isbn_elem = content.xpath('//div[@itemprop="isbn"]/text()') # this is likely ASIN
isbn = isbn_elem[0].strip() if isbn_elem else None
brief_elem = content.xpath('//div[@id="description"]/span[@style="display:none"]/text()')
if brief_elem:
brief = '\n'.join(p.strip() for p in brief_elem)
else:
brief_elem = content.xpath('//div[@id="description"]/span/text()')
brief = '\n'.join(p.strip() for p in brief_elem) if brief_elem else None
genre = content.xpath('//div[@class="bigBoxBody"]/div/div/div/a/text()')
genre = genre[0] if genre else None
book_title = re.sub('\n', '', content.xpath('//h1[@id="bookTitle"]/text()')[0]).strip()
author = content.xpath('//a[@class="authorName"]/span/text()')[0]
contents = None
img_url_elem = content.xpath("//img[@id='coverImage']/@src")
img_url = img_url_elem[0].strip() if img_url_elem else None
raw_img, ext = self.download_image(img_url, url)
authors_elem = content.xpath("//a[@class='authorName'][not(../span[@class='authorName greyText smallText role'])]/span/text()")
if authors_elem:
authors = []
for author in authors_elem:
authors.append(RE_WHITESPACES.sub(' ', author.strip()))
else:
authors = None
translators = None
authors_elem = content.xpath("//a[@class='authorName'][../span/text()='(Translator)']/span/text()")
if authors_elem:
translators = []
for translator in authors_elem:
translators.append(RE_WHITESPACES.sub(' ', translator.strip()))
else:
translators = None
other = {}
if first_pub:
other['首版时间'] = first_pub
if genre:
other['分类'] = genre
series_elem = content.xpath("//h2[@id='bookSeries']/a/text()")
if series_elem:
other['丛书'] = re.sub(r'\(\s*(.+[^\s])\s*#.*\)', '\\1', series_elem[0].strip())
data = {
'title': title,
'subtitle': subtitle,
'orig_title': orig_title,
'author': authors,
'translator': translators,
'language': language,
'pub_house': pub_house,
'pub_year': pub_year,
'pub_month': pub_month,
'binding': binding,
'pages': pages,
'isbn': isbn,
'brief': brief,
'contents': contents,
'other_info': other,
'cover_url': img_url,
'source_site': self.site_name,
'source_url': self.get_effective_url(url),
}
data['source_url'] = self.get_effective_url(url)
self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
return data, raw_img