goodreads

This commit is contained in:
Your Name 2021-09-21 23:09:09 -04:00
parent 81e94a43ff
commit 1b02d73a52
5 changed files with 142 additions and 3 deletions

View file

@ -19,11 +19,12 @@ RE_HTML_TAG = re.compile(r"<[^>]*>")
###################################
class SourceSiteEnum(models.TextChoices):
    """Sites an entity can originate from.

    Each member is declared as ``value, label``: the first element is the
    stored string, the second is the display label (wrapped in ``_`` for
    every site except the local one, whose label is ``settings.CLIENT_NAME``).
    """
    # Every member name may be defined only once; an Enum body rejects a
    # repeated name with TypeError when the class is created.
    IN_SITE = "in-site", settings.CLIENT_NAME
    DOUBAN = "douban", _("豆瓣")
    SPOTIFY = "spotify", _("Spotify")
    IMDB = "imdb", _("IMDb")
    STEAM = "steam", _("STEAM")
    BANGUMI = 'bangumi', _("bangumi")
    GOODREADS = "goodreads", _("goodreads")
class Entity(models.Model):

View file

@ -1373,6 +1373,137 @@ class BangumiScraper(AbstractScraper):
raise NotImplementedError
class GoodreadsScraper(AbstractScraper):
    """Scrape book metadata from a Goodreads book page.

    ``scrape`` downloads the page, extracts fields with XPath queries and
    returns them as a dict together with the raw cover image. Only the title
    is mandatory; other extracted fields are ``None`` when the page does not
    provide them.
    """

    site_name = SourceSiteEnum.GOODREADS.value
    host = "www.goodreads.com"
    data_class = Book
    form_class = BookForm
    # Book pages live under /book/show/<numeric id>; this is the pattern
    # get_effective_url() uses to canonicalize a URL.
    regex = re.compile(r"https://www\.goodreads\.com/book/show/\d+")

    # English month names in calendar order; list position + 1 is the month number.
    _MONTHS = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    # Matches text such as "Published May 5th 2006 by Tor Books":
    # group 1 = month name, group 2 = four-digit year, group 3 = publisher.
    _RE_PUBLISHED = re.compile('.*Published.*(' + '|'.join(_MONTHS) + ').*(\\d\\d\\d\\d).+by\\s*(.+)\\s*', re.DOTALL)
    # Group 1 = everything after "first published" up to and including the year.
    _RE_FIRST_PUBLISHED = re.compile(r'.*first published\s+(.+\d\d\d\d).*', re.DOTALL)

    @classmethod
    def get_effective_url(cls, raw_url):
        """Return the canonical book URL that ``raw_url`` starts with.

        Anything after the numeric book id (slug, query string, ...) is
        dropped. Returns ``None`` when ``raw_url`` does not start with a
        Goodreads book URL.
        """
        matched = cls.regex.match(raw_url)
        return matched[0] if matched else None

    def scrape(self, url):
        """Download ``url`` and extract the book's metadata.

        Args:
            url: address of a Goodreads book page.

        Returns:
            A ``(data, raw_img)`` tuple: ``data`` is the dict of extracted
            fields built at the end of this method, ``raw_img`` is whatever
            ``download_image`` returned for the cover. Both values, plus the
            image extension, are also stored on ``self.raw_data``,
            ``self.raw_img`` and ``self.img_ext``.

        Raises:
            ValueError: the page has no ``h1#bookTitle`` text, i.e. it is not
                a book page.

        Errors raised by ``download_page`` / ``download_image`` are not
        handled here.
        """
        headers = DEFAULT_REQUEST_HEADERS.copy()
        headers['Host'] = self.host
        content = self.download_page(url, headers)

        # The title is the only mandatory field.
        title_elem = content.xpath("//h1[@id='bookTitle']/text()")
        if not title_elem:
            raise ValueError("given url contains no book info")
        title = title_elem[0].strip()

        orig_title_elem = content.xpath("//div[@id='bookDataBox']//div[text()='Original Title']/following-sibling::div/text()")
        orig_title = orig_title_elem[0].strip() if orig_title_elem else None

        language_elem = content.xpath('//div[@itemprop="inLanguage"]/text()')
        language = language_elem[0].strip() if language_elem else None

        # Publication line: best effort; all three values stay None when the
        # line is missing or does not have the expected shape.
        pub_house_elem = content.xpath("//div[contains(text(), 'Published') and @class='row']/text()")
        pub = self._RE_PUBLISHED.match(pub_house_elem[0]) if pub_house_elem else None
        if pub:
            pub_year = pub[2]
            pub_month = self._MONTHS.index(pub[1]) + 1
            pub_house = pub[3].strip()
        else:
            pub_year = None
            pub_month = None
            pub_house = None

        # "(first published ...)" note, also best effort.
        first_pub_elem = content.xpath("//nobr[contains(text(), 'first published')]/text()")
        first_pub_match = self._RE_FIRST_PUBLISHED.match(first_pub_elem[0]) if first_pub_elem else None
        first_pub = first_pub_match[1] if first_pub_match else None

        binding_elem = content.xpath('//span[@itemprop="bookFormat"]/text()')
        binding = binding_elem[0].strip() if binding_elem else None

        # Page count: first number found in the "numberOfPages" text.
        pages_elem = content.xpath('//span[@itemprop="numberOfPages"]/text()')
        pages = None
        if pages_elem:
            numbers = RE_NUMBERS.findall(pages_elem[0].strip())
            pages = int(numbers[0]) if numbers else None

        isbn_elem = content.xpath('//span[@itemprop="isbn"]/text()')
        if not isbn_elem:
            isbn_elem = content.xpath('//div[@itemprop="isbn"]/text()')  # this is likely ASIN
        isbn = isbn_elem[0].strip() if isbn_elem else None

        # The hidden span holds the description text; join its text nodes.
        brief_elem = content.xpath('//div[@id="description"]/span[@style="display:none"]/text()')
        brief = '\n'.join(p.strip() for p in brief_elem) if brief_elem else None

        genre_elem = content.xpath('//div[@class="bigBoxBody"]/div/div/div/a/text()')
        genre = genre_elem[0] if genre_elem else None

        img_url_elem = content.xpath("//img[@id='coverImage']/@src")
        img_url = img_url_elem[0].strip() if img_url_elem else None
        raw_img, ext = self.download_image(img_url)

        # Authors: name links that have no sibling "role" span.
        authors_elem = content.xpath("//a[@class='authorName'][not(../span[@class='authorName greyText smallText role'])]/span/text()")
        authors = [RE_WHITESPACES.sub(' ', name.strip()) for name in authors_elem] if authors_elem else None

        # Translators: name links whose sibling span reads "(Translator)".
        translators_elem = content.xpath("//a[@class='authorName'][../span/text()='(Translator)']/span/text()")
        translators = [RE_WHITESPACES.sub(' ', name.strip()) for name in translators_elem] if translators_elem else None

        # Extra info without a dedicated field; the keys are user-facing labels.
        other = {}
        if first_pub:
            other['首版时间'] = first_pub  # first publication date
        if genre:
            other['分类'] = genre  # genre / category
        series_elem = content.xpath("//h2[@id='bookSeries']/a/text()")
        if series_elem:
            # "(Series Name #3)" -> "Series Name"
            other['丛书'] = re.sub(r'\(\s*(.+[^\s])\s*#.*\)', '\\1', series_elem[0].strip())

        data = {
            'title': title,
            'subtitle': None,  # not extracted from the page
            'orig_title': orig_title,
            'author': authors,
            'translator': translators,
            'language': language,
            'pub_house': pub_house,
            'pub_year': pub_year,
            'pub_month': pub_month,
            'binding': binding,
            'pages': pages,
            'isbn': isbn,
            'brief': brief,
            'contents': None,  # not extracted from the page
            'other_info': other,
            'source_site': self.site_name,
            'source_url': self.get_effective_url(url),
        }
        self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
        return data, raw_img
# https://developers.google.com/youtube/v3/docs/?apix=true
# https://developers.google.com/books/docs/v1/using

File diff suppressed because one or more lines are too long

View file

@ -11,6 +11,8 @@ $steam-color-primary: #1387b8
$steam-color-secondary: #111d2e
$bangumi-color-primary: #F09199
$bangumi-color-secondary: #FCFCFC
// Goodreads label colors, consumed by the &.source-label__goodreads rule: primary is the text color, secondary the background.
$goodreads-color-primary: #372213
$goodreads-color-secondary: #F4F1EA
.source-label
display: inline
@ -60,4 +62,8 @@ $bangumi-color-secondary: #FCFCFC
background: $bangumi-color-secondary
color: $bangumi-color-primary
font-style: italic
font-weight: 600
font-weight: 600
// Goodreads source label: secondary color as background, primary color as text, light font weight.
&.source-label__goodreads
background: $goodreads-color-secondary
color: $goodreads-color-primary
font-weight: lighter

View file

@ -2,6 +2,7 @@ dateparser
django
django-hstore
django-markdownx
django-sass
easy-thumbnails
lxml
openpyxl