import Goodreads book list and profile

This commit is contained in:
Your Name 2022-04-04 01:24:16 -04:00
parent 0f2302a4a5
commit fa283846f0
6 changed files with 193 additions and 9 deletions

View file

@ -0,0 +1,146 @@
import re
import requests
from lxml import html
# from common.scrapers.goodreads import GoodreadsScraper
from common.scraper import get_scraper_by_url
from books.models import Book, BookMark
from collection.models import Collection
from common.models import MarkStatusEnum
from django.core.exceptions import ObjectDoesNotExist
from django.conf import settings
from user_messages import api as msg
import django_rq
# URL patterns for the two supported Goodreads link types.
# Dots are escaped so '.' only matches a literal dot (the originals
# accidentally matched any character there).
re_shelf = r'^https://www\.goodreads\.com/review/list/\d+[^?]*\?shelf=[^&]+'
re_profile = r'^https://www\.goodreads\.com/user/show/(\d+)'

# Map Goodreads star-rating tooltip text to the local 1-10 rating scale.
gr_rating = {
    'did not like it': 2,
    'it was ok': 4,
    'liked it': 6,
    'really liked it': 8,
    'it was amazing': 10
}

# Placeholder for a Goodreads-status -> local-status mapping; currently unused.
gr_status = {
}
class GoodreadsImporter:
    """Import books from Goodreads into local models.

    Two link types are supported:
      * a shelf URL  -> imported as a ``Collection`` owned by the user;
      * a profile URL -> its to-read / currently-reading / read shelves are
        imported as ``BookMark`` rows with the matching local status.

    The actual scraping runs as a background job on the 'doufen' queue.
    """

    @classmethod
    def import_from_url(cls, raw_url, user):
        """Validate *raw_url* and, if recognized, enqueue the import task.

        Returns True when the URL matched a shelf or profile pattern and a
        background job was enqueued, False otherwise.
        """
        # Fix: first parameter of this @classmethod was misnamed `self`.
        if re.match(re_shelf, raw_url) or re.match(re_profile, raw_url):
            django_rq.get_queue('doufen').enqueue(
                cls.import_from_url_task, raw_url, user)
            return True
        return False

    @classmethod
    def import_from_url_task(cls, url, user):
        """Background job: scrape *url* and create local records for *user*.

        Sends a success message to the user when done.
        """
        match_shelf = re.match(re_shelf, url)
        match_profile = re.match(re_profile, url)
        total = 0
        if match_shelf:
            shelf = cls.parse_shelf(match_shelf[0], user)
            if shelf['title'] and shelf['books']:
                collection = Collection.objects.create(
                    title=shelf['title'],
                    description='Imported from [Goodreads](' + url + ')',
                    owner=user)
                for book in shelf['books']:
                    collection.append_item(book['book'], book['review'])
                    total += 1
                collection.save()
                msg.success(user, f'成功从Goodreads导入包含{total}本书的收藏单{shelf["title"]}')
        elif match_profile:
            uid = match_profile[1]
            # Map each local mark status to the corresponding Goodreads shelf.
            shelves = {
                MarkStatusEnum.WISH: f'https://www.goodreads.com/review/list/{uid}?shelf=to-read',
                MarkStatusEnum.DO: f'https://www.goodreads.com/review/list/{uid}?shelf=currently-reading',
                MarkStatusEnum.COLLECT: f'https://www.goodreads.com/review/list/{uid}?shelf=read',
            }
            for status, shelf_url in shelves.items():
                shelf = cls.parse_shelf(shelf_url, user)
                for book in shelf['books']:
                    params = {
                        'owner': user,
                        # 'created_time': data.time,
                        # 'edited_time': data.time,
                        'rating': book['rating'],
                        'text': book['review'],
                        'status': status,
                        'visibility': 0,
                        'book': book['book'],
                    }
                    try:
                        mark = BookMark.objects.create(**params)
                        mark.book.update_rating(None, mark.rating)
                    except Exception as e:
                        # Best-effort: a failure here is most likely a
                        # duplicate mark; log it and keep going.
                        print(e)
                    else:
                        # Fix: only count marks that were actually created
                        # (the original counted failures too).
                        total += 1
            msg.success(user, f'成功从Goodreads导入{total}个标记。')

    @classmethod
    def parse_shelf(cls, url, user):
        """Scrape a shelf page by page.

        Returns ``{'title': str | None, 'books': [{'url': ..., 'book': obj,
        'rating': int | None, 'review': str}, ...]}``. ``title`` stays None
        if the first page cannot be fetched (fix: the original raised
        UnboundLocalError in that case).

        Raises ValueError when a fetched page has no shelf header.
        """
        title = None
        books = []
        url_shelf = url + '&view=table'  # table view lists rating/review columns
        while url_shelf:
            print(url_shelf)
            r = requests.get(url_shelf, timeout=settings.SCRAPING_TIMEOUT)
            url_shelf = None  # cleared; re-set below when a next page exists
            if r.status_code == 200:
                content = html.fromstring(r.content.decode('utf-8'))
                try:
                    title = content.xpath(
                        "//span[@class='h1Shelf']/text()")[0].strip()
                except IndexError:
                    raise ValueError("given url contains no book info")
                print(title)
                for cell in content.xpath("//tbody[@id='booksBody']/tr"):
                    url_book = 'https://www.goodreads.com' + \
                        cell.xpath(
                            ".//td[@class='field title']//a/@href")[0].strip()
                    action = cell.xpath(
                        ".//td[@class='field actions']//a/text()")[0].strip()
                    rating_elem = cell.xpath(
                        ".//td[@class='field rating']//span/@title")
                    rating = gr_rating.get(
                        rating_elem[0].strip()) if rating_elem else None
                    url_review = 'https://www.goodreads.com' + \
                        cell.xpath(
                            ".//td[@class='field actions']//a/@href")[0].strip()
                    review = ''
                    try:
                        # 'view (with text)' marks rows that carry a written
                        # review; fetch the review page to extract its body.
                        if action == 'view (with text)':
                            r2 = requests.get(
                                url_review, timeout=settings.SCRAPING_TIMEOUT)
                            if r2.status_code == 200:
                                c2 = html.fromstring(r2.content.decode('utf-8'))
                                review_elem = c2.xpath(
                                    "//div[@itemprop='reviewBody']/text()")
                                review = '\n'.join(
                                    p.strip() for p in review_elem) if review_elem else ''
                            else:
                                print(r2.status_code)
                        scraper = get_scraper_by_url(url_book)
                        url_book = scraper.get_effective_url(url_book)
                        book = Book.objects.filter(source_url=url_book).first()
                        if not book:
                            print("add new book " + url_book)
                            scraper.scrape(url_book)
                            form = scraper.save(request_user=user)
                            book = form.instance
                        books.append({
                            'url': url_book,
                            'book': book,
                            'rating': rating,
                            'review': review
                        })
                    except Exception as e:
                        # Best-effort: likely just a download error for this
                        # one book — log it (fix: the error was swallowed
                        # without detail) and continue with the next row.
                        print(f"Error adding {url_book}: {e}")
                next_elem = content.xpath("//a[@class='next_page']/@href")
                url_shelf = ('https://www.goodreads.com' +
                             next_elem[0].strip()) if next_elem else None
        return {'title': title, 'books': books}

View file

@ -146,15 +146,19 @@ class AbstractScraper:
def download_page(cls, url, headers):
url = cls.get_effective_url(url)
session_id = random.random()
proxy_url = ('http://%s-country-cn-session-%s:%s@zproxy.lum-superproxy.io:%d' %
(settings.LUMINATI_USERNAME, session_id, settings.LUMINATI_PASSWORD, PORT))
proxies = {
'http': proxy_url,
'https': proxy_url,
}
if settings.LUMINATI_USERNAME is None:
proxies = None
if settings.SCRAPESTACK_KEY is not None:
url = f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={url}'
else:
session_id = random.random()
proxy_url = ('http://%s-country-cn-session-%s:%s@zproxy.lum-superproxy.io:%d' %
(settings.LUMINATI_USERNAME, session_id, settings.LUMINATI_PASSWORD, PORT))
proxies = {
'http': proxy_url,
'https': proxy_url,
}
r = requests.get(url, proxies=proxies,
headers=headers, timeout=settings.SCRAPING_TIMEOUT)

View file

@ -36,8 +36,7 @@ class GoodreadsScraper(AbstractScraper):
if response is not None:
content = html.fromstring(response.content.decode('utf-8'))
else:
headers = DEFAULT_REQUEST_HEADERS.copy()
headers['Host'] = self.host
headers = None # DEFAULT_REQUEST_HEADERS.copy()
content = self.download_page(url, headers)
try:

View file

@ -145,6 +145,28 @@
</div>
</div>
<div class="main-section-wrapper">
<div class="tools-section-wrapper">
<div class="import-panel">
<h5 class="import-panel__label">{% trans '导入Goodreads帐号或书单' %}</h5>
<div class="import-panel__body">
<form action="{% url 'users:import_goodreads' %}" method="POST" >
{% csrf_token %}
<div class="import-panel__checkbox">输入Goodreads链接
<input type="url" name="url" value="" placeholder="例如 https://www.goodreads.com/user/show/12345-janedoe">
<input type="submit" class="import-panel__button" value="{% trans '导入' %}" id="uploadBtn" />
</div>
<div>
Goodreads用户主页链接形如 https://www.goodreads.com/user/show/12345-janedoe 将自动导入到NeoDB用户的想读、在读、已读列表,每本书的评论导入为NeoDB短评。
<br />
Goodreads书单链接形如 https://www.goodreads.com/review/list/12345-janedoe?shelf=name 将自动导入成为NeoDB收藏单,每本书的评论导入为NeoDB收藏单条目备注。
</div>
</form>
</div>
</div>
</div>
</div>
<div class="main-section-wrapper">
<div class="tools-section-wrapper">
<div class="import-panel">

View file

@ -8,6 +8,7 @@ urlpatterns = [
path('connect/', connect, name='connect'),
path('reconnect/', reconnect, name='reconnect'),
path('data/', data, name='data'),
path('data/import_goodreads', import_goodreads, name='import_goodreads'),
path('data/export_reviews', export_reviews, name='export_reviews'),
path('data/export_marks', export_marks, name='export_marks'),
path('data/sync_mastodon', sync_mastodon, name='sync_mastodon'),

View file

@ -38,6 +38,7 @@ from movies.models import MovieMark, MovieReview
from games.models import GameMark, GameReview
from music.models import AlbumMark, SongMark, AlbumReview, SongReview
from collection.models import Collection
from common.importers.goodreads import GoodreadsImporter
# Views
@ -1141,3 +1142,14 @@ def clear_data(request):
else:
messages.add_message(request, messages.ERROR, _('验证信息不符。'))
return redirect(reverse("users:data"))
@login_required
def import_goodreads(request):
    """Handle the 'import from Goodreads' form.

    On POST, validates the submitted URL and enqueues a background import;
    informs the user whether the link was recognized. Always redirects back
    to the data page.
    """
    if request.method == 'POST':
        # Fix: default to '' — a missing 'url' key would yield None and make
        # re.match() inside the importer raise TypeError.
        raw_url = request.POST.get('url', '')
        if GoodreadsImporter.import_from_url(raw_url, request.user):
            messages.add_message(request, messages.INFO, _('开始后台导入。'))
        else:
            messages.add_message(request, messages.ERROR, _('无法识别链接。'))
    return redirect(reverse("users:data"))