goodreads
This commit is contained in:
parent
81e94a43ff
commit
1b02d73a52
5 changed files with 142 additions and 3 deletions
|
@ -19,11 +19,12 @@ RE_HTML_TAG = re.compile(r"<[^>]*>")
|
|||
###################################
|
||||
class SourceSiteEnum(models.TextChoices):
    """Sites an entity's data can originate from.

    Each member is declared as a ``(value, label)`` pair: the first item is
    the short identifier string, the second is the display label.
    """

    # Label for locally created entries comes from the project settings.
    IN_SITE = "in-site", settings.CLIENT_NAME
    # NOTE: every member name must be declared exactly once; re-declaring a
    # name inside an enum class body raises TypeError at import time.
    DOUBAN = "douban", _("豆瓣")
    SPOTIFY = "spotify", _("Spotify")
    IMDB = "imdb", _("IMDb")
    STEAM = "steam", _("STEAM")
    BANGUMI = "bangumi", _("bangumi")
    GOODREADS = "goodreads", _("goodreads")
|
||||
|
||||
|
||||
class Entity(models.Model):
|
||||
|
|
|
@ -1373,6 +1373,137 @@ class BangumiScraper(AbstractScraper):
|
|||
raise NotImplementedError
|
||||
|
||||
|
||||
class GoodreadsScraper(AbstractScraper):
    """Scraper that extracts book metadata from a Goodreads book page."""

    site_name = SourceSiteEnum.GOODREADS.value
    host = "www.goodreads.com"
    data_class = Book
    form_class = BookForm
    # Book pages live under /book/show/<id>; this pattern must stay in step
    # with the one applied by get_effective_url().
    regex = re.compile(r"https://www\.goodreads\.com/book/show/\d+")

    # Month names as they appear in the "Published ..." row; the position in
    # this tuple (plus one) is the month number.
    _MONTHS = (
        'January', 'February', 'March', 'April', 'May', 'June', 'July',
        'August', 'September', 'October', 'November', 'December',
    )
    # Groups: 1 = month name, 2 = four-digit year, 3 = publisher.
    _RE_PUBLISHED = re.compile(
        r'.*Published.*(' + '|'.join(_MONTHS) + r').*(\d\d\d\d).+by\s*(.+)\s*',
        re.DOTALL,
    )
    # Group 1 = free-form first-publication date ending in a four-digit year.
    _RE_FIRST_PUBLISHED = re.compile(
        r'.*first published\s+(.+\d\d\d\d).*', re.DOTALL
    )

    @classmethod
    def get_effective_url(cls, raw_url):
        """Return the canonical ``.../book/show/<id>`` prefix of *raw_url*.

        Returns ``None`` when *raw_url* does not start with a Goodreads
        book URL.
        """
        matched = cls.regex.match(raw_url)
        return matched[0] if matched else None

    @staticmethod
    def _first_stripped(nodes):
        """Return the first item of *nodes* stripped, or ``None`` if empty."""
        return nodes[0].strip() if nodes else None

    @staticmethod
    def _clean_names(names):
        """Strip and whitespace-normalise *names*; ``None`` for an empty list."""
        if not names:
            return None
        return [RE_WHITESPACES.sub(' ', name.strip()) for name in names]

    def scrape(self, url):
        """Download the page at *url* and extract the book's form data.

        Stores the results on ``self.raw_data``, ``self.raw_img`` and
        ``self.img_ext`` and returns ``(data, raw_img)``.

        Raises:
            ValueError: if the page has no ``h1#bookTitle`` text node.
        """
        headers = DEFAULT_REQUEST_HEADERS.copy()
        headers['Host'] = self.host
        content = self.download_page(url, headers)

        try:
            title = content.xpath("//h1[@id='bookTitle']/text()")[0].strip()
        except IndexError:
            raise ValueError("given url contains no book info")

        subtitle = None

        orig_title = self._first_stripped(content.xpath(
            "//div[@id='bookDataBox']//div[text()='Original Title']/following-sibling::div/text()"))

        language = self._first_stripped(
            content.xpath('//div[@itemprop="inLanguage"]/text()'))

        # Publication month / year / publisher are parsed from one free-text
        # row. This is best effort: a missing row (IndexError) or a row that
        # does not fit the pattern (match is None -> TypeError on indexing)
        # leaves all three fields unset.
        published_text = content.xpath(
            "//div[contains(text(), 'Published') and @class='row']/text()")
        try:
            pub = self._RE_PUBLISHED.match(published_text[0])
            pub_year = pub[2]
            pub_month = self._MONTHS.index(pub[1]) + 1
            pub_house = pub[3].strip()
        except (IndexError, TypeError, ValueError):
            pub_year = None
            pub_month = None
            pub_house = None

        # Same best-effort approach for the "first published" note.
        first_published_text = content.xpath(
            "//nobr[contains(text(), 'first published')]/text()")
        try:
            first_pub = self._RE_FIRST_PUBLISHED.match(first_published_text[0])[1]
        except (IndexError, TypeError):
            first_pub = None

        binding = self._first_stripped(
            content.xpath('//span[@itemprop="bookFormat"]/text()'))

        pages = self._first_stripped(
            content.xpath('//span[@itemprop="numberOfPages"]/text()'))
        if pages is not None:
            # Keep only the first number found in the text, if any.
            page_numbers = RE_NUMBERS.findall(pages)
            pages = int(page_numbers[0]) if page_numbers else None

        isbn_elem = content.xpath('//span[@itemprop="isbn"]/text()')
        if not isbn_elem:
            isbn_elem = content.xpath('//div[@itemprop="isbn"]/text()')  # this is likely ASIN
        isbn = self._first_stripped(isbn_elem)

        brief_elem = content.xpath(
            '//div[@id="description"]/span[@style="display:none"]/text()')
        brief = '\n'.join(p.strip() for p in brief_elem) if brief_elem else None

        genre_elem = content.xpath('//div[@class="bigBoxBody"]/div/div/div/a/text()')
        genre = genre_elem[0] if genre_elem else None

        contents = None

        img_url = self._first_stripped(
            content.xpath("//img[@id='coverImage']/@src"))
        raw_img, ext = self.download_image(img_url)

        # Author links that carry no role annotation next to them.
        authors = self._clean_names(content.xpath(
            "//a[@class='authorName'][not(../span[@class='authorName greyText smallText role'])]/span/text()"))

        # Author links explicitly annotated with the "(Translator)" role.
        translators = self._clean_names(content.xpath(
            "//a[@class='authorName'][../span/text()='(Translator)']/span/text()"))

        other = {}
        if first_pub:
            other['首版时间'] = first_pub
        if genre:
            other['分类'] = genre
        series_elem = content.xpath("//h2[@id='bookSeries']/a/text()")
        if series_elem:
            # Reduce "(Series Name #3)" to "Series Name".
            other['丛书'] = re.sub(
                r'\(\s*(.+[^\s])\s*#.*\)', '\\1', series_elem[0].strip())

        data = {
            'title': title,
            'subtitle': subtitle,
            'orig_title': orig_title,
            'author': authors,
            'translator': translators,
            'language': language,
            'pub_house': pub_house,
            'pub_year': pub_year,
            'pub_month': pub_month,
            'binding': binding,
            'pages': pages,
            'isbn': isbn,
            'brief': brief,
            'contents': contents,
            'other_info': other,
            'source_site': self.site_name,
            'source_url': self.get_effective_url(url),
        }

        self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
        return data, raw_img
|
||||
|
||||
|
||||
# https://developers.google.com/youtube/v3/docs/?apix=true
|
||||
# https://developers.google.com/books/docs/v1/using
|
||||
|
||||
|
|
2
common/static/css/boofilsic.min.css
vendored
2
common/static/css/boofilsic.min.css
vendored
File diff suppressed because one or more lines are too long
|
@ -11,6 +11,8 @@ $steam-color-primary: #1387b8
|
|||
$steam-color-secondary: #111d2e
|
||||
$bangumi-color-primary: #F09199
|
||||
$bangumi-color-secondary: #FCFCFC
|
||||
$goodreads-color-primary: #372213
|
||||
$goodreads-color-secondary: #F4F1EA
|
||||
|
||||
.source-label
|
||||
display: inline
|
||||
|
@ -60,4 +62,8 @@ $bangumi-color-secondary: #FCFCFC
|
|||
background: $bangumi-color-secondary
|
||||
color: $bangumi-color-primary
|
||||
font-style: italic
|
||||
font-weight: 600
|
||||
font-weight: 600
|
||||
&.source-label__goodreads
|
||||
background: $goodreads-color-secondary
|
||||
color: $goodreads-color-primary
|
||||
font-weight: lighter
|
|
@ -2,6 +2,7 @@ dateparser
|
|||
django
|
||||
django-hstore
|
||||
django-markdownx
|
||||
django-sass
|
||||
easy-thumbnails
|
||||
lxml
|
||||
openpyxl
|
||||
|
|
Loading…
Add table
Reference in a new issue