goodreads

2021-09-21 23:09:09 -04:00 · 2021-09-21 23:09:09 -04:00 · 1b02d73a52
commit 1b02d73a52
parent 81e94a43ff
5 changed files with 142 additions and 3 deletions
--- a/common/models.py
+++ b/common/models.py
@ -19,11 +19,12 @@ RE_HTML_TAG = re.compile(r"<[^>]*>")
 ###################################
 class SourceSiteEnum(models.TextChoices):
    IN_SITE = "in-site", settings.CLIENT_NAME
-    DOUBAN = "douban",  _("豆瓣")
+    DOUBAN = "douban", _("豆瓣")
    SPOTIFY = "spotify", _("Spotify")
    IMDB = "imdb", _("IMDb")
    STEAM = "steam", _("STEAM")
    BANGUMI = 'bangumi', _("bangumi")
+    GOODREADS = "goodreads", _("goodreads")  # .value is used as GoodreadsScraper.site_name


 class Entity(models.Model):
--- a/common/scraper.py
+++ b/common/scraper.py
@ -1373,6 +1373,137 @@ class BangumiScraper(AbstractScraper):
        raise NotImplementedError


+class GoodreadsScraper(AbstractScraper):
+    """Scrape book metadata from a Goodreads book page (/book/show/<id>)."""
+    site_name = SourceSiteEnum.GOODREADS.value
+    host = "www.goodreads.com"
+    data_class = Book
+    form_class = BookForm
+    # Book pages live under /book/show/<id>; get_effective_url() reuses this pattern.
+    regex = re.compile(r"https://www\.goodreads\.com/book/show/\d+")
+
+    @classmethod
+    def get_effective_url(cls, raw_url):
+        """Return the canonical book URL at the start of raw_url, or None if there is none."""
+        u = cls.regex.match(raw_url)
+        return u[0] if u else None
+
+    def scrape(self, url):
+        """
+        Download the Goodreads page at url and parse it into book data.
+
+        Returns a (data, raw_img) tuple and stores raw_data, raw_img and
+        img_ext on the instance. Raises ValueError when the page has no
+        bookTitle heading.
+        """
+        headers = DEFAULT_REQUEST_HEADERS.copy()
+        headers['Host'] = self.host
+        content = self.download_page(url, headers)
+
+        # The title is the only mandatory field: a page without it is rejected.
+        try:
+            title = content.xpath("//h1[@id='bookTitle']/text()")[0].strip()
+        except IndexError:
+            raise ValueError("given url contains no book info")
+
+        subtitle = None
+
+        # Every field below is optional and falls back to None when missing.
+        orig_title_elem = content.xpath("//div[@id='bookDataBox']//div[text()='Original Title']/following-sibling::div/text()")
+        orig_title = orig_title_elem[0].strip() if orig_title_elem else None
+
+        language_elem = content.xpath('//div[@itemprop="inLanguage"]/text()')
+        language = language_elem[0].strip() if language_elem else None
+
+        # Publication row reads "Published <Month> ... <yyyy> by <publisher>".
+        # All three values stay None when the row is absent or does not match.
+        pub_year = pub_month = pub_house = None
+        pub_house_elem = content.xpath("//div[contains(text(), 'Published') and @class='row']/text()")
+        months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
+        r = re.compile('.*Published.*(' + '|'.join(months) + ').*(\\d\\d\\d\\d).+by\\s*(.+)\\s*', re.DOTALL)
+        pub = r.match(pub_house_elem[0]) if pub_house_elem else None
+        if pub:
+            pub_year = pub[2]
+            pub_month = months.index(pub[1]) + 1
+            pub_house = pub[3].strip()
+
+        # Text after "first published" up to a four-digit year, kept as a raw string.
+        first_pub_elem = content.xpath("//nobr[contains(text(), 'first published')]/text()")
+        pub = re.match(r'.*first published\s+(.+\d\d\d\d).*', first_pub_elem[0], re.DOTALL) if first_pub_elem else None
+        first_pub = pub[1] if pub else None
+
+        binding_elem = content.xpath('//span[@itemprop="bookFormat"]/text()')
+        binding = binding_elem[0].strip() if binding_elem else None
+
+        # First RE_NUMBERS match in the page-count text, converted to int.
+        pages_elem = content.xpath('//span[@itemprop="numberOfPages"]/text()')
+        page_nums = RE_NUMBERS.findall(pages_elem[0].strip()) if pages_elem else []
+        pages = int(page_nums[0]) if page_nums else None
+
+        # Prefer the <span> ISBN and fall back to the <div> variant.
+        isbn_elem = content.xpath('//span[@itemprop="isbn"]/text()')
+        if not isbn_elem:
+            isbn_elem = content.xpath('//div[@itemprop="isbn"]/text()')  # this is likely ASIN
+        isbn = isbn_elem[0].strip() if isbn_elem else None
+
+        # The description is read from the hidden (display:none) span.
+        brief_elem = content.xpath('//div[@id="description"]/span[@style="display:none"]/text()')
+        brief = '\n'.join(p.strip() for p in brief_elem) if brief_elem else None
+
+        # Only the first genre link is kept.
+        genre = content.xpath('//div[@class="bigBoxBody"]/div/div/div/a/text()')
+        genre = genre[0] if genre else None
+        contents = None
+
+        # NOTE(review): img_url may be None; assumes download_image() tolerates that - confirm.
+        img_url_elem = content.xpath("//img[@id='coverImage']/@src")
+        img_url = img_url_elem[0].strip() if img_url_elem else None
+        raw_img, ext = self.download_image(img_url)
+
+        # Authors: authorName links with no role annotation next to them.
+        authors_elem = content.xpath("//a[@class='authorName'][not(../span[@class='authorName greyText smallText role'])]/span/text()")
+        authors = [RE_WHITESPACES.sub(' ', name.strip()) for name in authors_elem] or None
+
+        # Translators: authorName links annotated with "(Translator)".
+        translators_elem = content.xpath("//a[@class='authorName'][../span/text()='(Translator)']/span/text()")
+        translators = [RE_WHITESPACES.sub(' ', name.strip()) for name in translators_elem] or None
+
+        # Supplementary fields keyed by Chinese labels: first-publication date, genre, series.
+        other = {}
+        if first_pub:
+            other['首版时间'] = first_pub
+        if genre:
+            other['分类'] = genre
+        series_elem = content.xpath("//h2[@id='bookSeries']/a/text()")
+        if series_elem:
+            # Drop the surrounding parentheses and the "#<n>" suffix.
+            other['丛书'] = re.sub(r'\(\s*(.+[^\s])\s*#.*\)', '\\1', series_elem[0].strip())
+
+        data = {
+            'title': title,
+            'subtitle': subtitle,
+            'orig_title': orig_title,
+            'author': authors,
+            'translator': translators,
+            'language': language,
+            'pub_house': pub_house,
+            'pub_year': pub_year,
+            'pub_month': pub_month,
+            'binding': binding,
+            'pages': pages,
+            'isbn': isbn,
+            'brief': brief,
+            'contents': contents,
+            'other_info': other,
+            'source_site': self.site_name,
+            'source_url': self.get_effective_url(url),
+        }
+
+        # Cache the parsed result on the instance before returning it.
+        self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
+        return data, raw_img
+
+
 # https://developers.google.com/youtube/v3/docs/?apix=true
 # https://developers.google.com/books/docs/v1/using

--- a/common/static/css/boofilsic.min.css
+++ b/common/static/css/boofilsic.min.css
--- a/common/static/sass/_Label.sass
+++ b/common/static/sass/_Label.sass
@ -11,6 +11,8 @@ $steam-color-primary: #1387b8
 $steam-color-secondary: #111d2e
 $bangumi-color-primary: #F09199
 $bangumi-color-secondary: #FCFCFC
+$goodreads-color-primary: #372213
+$goodreads-color-secondary: #F4F1EA

 .source-label
    display: inline
@ -60,4 +62,8 @@ $bangumi-color-secondary: #FCFCFC
        background: $bangumi-color-secondary
        color: $bangumi-color-primary
        font-style: italic
-        font-weight: 600
+        font-weight: 600
+    &.source-label__goodreads
+        background: $goodreads-color-secondary
+        color: $goodreads-color-primary
+        font-weight: lighter
--- a/requirements.txt
+++ b/requirements.txt
@ -2,6 +2,7 @@ dateparser
 django
 django-hstore
 django-markdownx
+django-sass
 easy-thumbnails
 lxml
 openpyxl