import re
import requests
from lxml import html
from common.scraper import get_scraper_by_url
from books.models import Book, BookMark
from collection.models import Collection
from common.models import MarkStatusEnum
from django.core.exceptions import ObjectDoesNotExist
from django.conf import settings
from user_messages import api as msg
import django_rq


# Literal dots escaped so these match goodreads.com URLs only.
re_shelf = r'^https://www\.goodreads\.com/review/list/\d+[^?]*\?shelf=[^&]+'
re_profile = r'^https://www\.goodreads\.com/user/show/(\d+)'
# Goodreads star-rating hover captions -> NeoDB 1-10 rating scale.
gr_rating = {
    'did not like it': 2,
    'it was ok': 4,
    'liked it': 6,
    'really liked it': 8,
    'it was amazing': 10
}
gr_status = {
}


class GoodreadsImporter:
    """Import from Goodreads into NeoDB.

    A shelf URL becomes one Collection holding the shelf's books; a profile
    URL imports the three built-in shelves (to-read / currently-reading /
    read) as wish/do/collect BookMarks.  Heavy scraping work is deferred to
    the 'doufen' rq queue.
    """

    @classmethod
    def import_from_url(cls, raw_url, user):
        """Enqueue a background import when raw_url is a recognized
        Goodreads shelf or profile URL.

        Returns True when a task was queued, False when the URL is not
        recognized (caller shows an error in that case).
        """
        match_shelf = re.match(re_shelf, raw_url)
        match_profile = re.match(re_profile, raw_url)
        if match_profile or match_shelf:
            django_rq.get_queue('doufen').enqueue(
                cls.import_from_url_task, raw_url, user)
            return True
        else:
            return False

    @classmethod
    def import_from_url_task(cls, url, user):
        """Background task body: perform the actual import for user.

        Flashes the user a success message with the number of items
        imported when done.
        """
        match_shelf = re.match(re_shelf, url)
        match_profile = re.match(re_profile, url)
        total = 0
        if match_shelf:
            shelf = cls.parse_shelf(match_shelf[0], user)
            if shelf['title'] and shelf['books']:
                collection = Collection.objects.create(
                    title=shelf['title'],
                    description='Imported from [Goodreads](' + url + ')',
                    owner=user)
                for book in shelf['books']:
                    collection.append_item(book['book'], book['review'])
                    total += 1
                collection.save()
                msg.success(user, f'成功从Goodreads导入包含{total}本书的收藏单{shelf["title"]}。')
        elif match_profile:
            uid = match_profile[1]
            # Goodreads' three standard shelves map onto NeoDB mark states.
            shelves = {
                MarkStatusEnum.WISH: f'https://www.goodreads.com/review/list/{uid}?shelf=to-read',
                MarkStatusEnum.DO: f'https://www.goodreads.com/review/list/{uid}?shelf=currently-reading',
                MarkStatusEnum.COLLECT: f'https://www.goodreads.com/review/list/{uid}?shelf=read',
            }
            for status, shelf_url in shelves.items():
                shelf = cls.parse_shelf(shelf_url, user)
                for book in shelf['books']:
                    params = {
                        'owner': user,
                        'rating': book['rating'],
                        'text': book['review'],
                        'status': status,
                        'visibility': 0,
                        'book': book['book'],
                    }
                    try:
                        mark = BookMark.objects.create(**params)
                        mark.book.update_rating(None, mark.rating)
                    except Exception:
                        # Best effort: creation fails e.g. when the book is
                        # already marked; skip it without counting it.
                        continue
                    total += 1
            msg.success(user, f'成功从Goodreads导入{total}个标记。')

    @classmethod
    def parse_shelf(cls, url, user):
        """Scrape one Goodreads shelf, following its pagination.

        Returns {'title': str or None,
                 'books': [{'url', 'book', 'rating', 'review'}, ...]}.
        Books not yet in the local database are scraped and saved on the
        fly; per-book failures are logged and skipped.
        Raises ValueError when the page has no shelf header at all.
        """
        books = []
        title = None  # stays None if the first page cannot be fetched
        url_shelf = url + '&view=table'
        while url_shelf:
            print(url_shelf)
            r = requests.get(url_shelf, timeout=settings.SCRAPING_TIMEOUT)
            url_shelf = None
            if r.status_code == 200:
                content = html.fromstring(r.content.decode('utf-8'))
                try:
                    title = content.xpath(
                        "//span[@class='h1Shelf']/text()")[0].strip()
                except IndexError:
                    raise ValueError("given url contains no book info")
                print(title)
                for cell in content.xpath("//tbody[@id='booksBody']/tr"):
                    url_book = 'https://www.goodreads.com' + \
                        cell.xpath(
                            ".//td[@class='field title']//a/@href")[0].strip()
                    action = cell.xpath(
                        ".//td[@class='field actions']//a/text()")[0].strip()
                    rating_elem = cell.xpath(
                        ".//td[@class='field rating']//span/@title")
                    rating = gr_rating.get(
                        rating_elem[0].strip()) if rating_elem else None
                    url_review = 'https://www.goodreads.com' + \
                        cell.xpath(
                            ".//td[@class='field actions']//a/@href")[0].strip()
                    review = ''
                    try:
                        # 'view (with text)' marks rows carrying a written
                        # review; fetch the review page for its body text.
                        if action == 'view (with text)':
                            r2 = requests.get(
                                url_review, timeout=settings.SCRAPING_TIMEOUT)
                            if r2.status_code == 200:
                                c2 = html.fromstring(
                                    r2.content.decode('utf-8'))
                                review_elem = c2.xpath(
                                    "//div[@itemprop='reviewBody']/text()")
                                review = '\n'.join(
                                    p.strip() for p in review_elem) if review_elem else ''
                            else:
                                print(r2.status_code)
                        scraper = get_scraper_by_url(url_book)
                        url_book = scraper.get_effective_url(url_book)
                        book = Book.objects.filter(
                            source_url=url_book).first()
                        if not book:
                            print("add new book " + url_book)
                            scraper.scrape(url_book)
                            form = scraper.save(request_user=user)
                            book = form.instance
                        books.append({
                            'url': url_book,
                            'book': book,
                            'rating': rating,
                            'review': review
                        })
                    except Exception:
                        # Likely just a download/scrape error for this one
                        # book; continue with the rest of the shelf.
                        print("Error adding " + url_book)
                next_elem = content.xpath("//a[@class='next_page']/@href")
                url_shelf = ('https://www.goodreads.com' +
                             next_elem[0].strip()) if next_elem else None
        return {'title': title, 'books': books}
a/common/scrapers/goodreads.py +++ b/common/scrapers/goodreads.py @@ -36,8 +36,7 @@ class GoodreadsScraper(AbstractScraper): if response is not None: content = html.fromstring(response.content.decode('utf-8')) else: - headers = DEFAULT_REQUEST_HEADERS.copy() - headers['Host'] = self.host + headers = None # DEFAULT_REQUEST_HEADERS.copy() content = self.download_page(url, headers) try: diff --git a/users/templates/users/data.html b/users/templates/users/data.html index 540a7a33..e0f7d8a9 100644 --- a/users/templates/users/data.html +++ b/users/templates/users/data.html @@ -145,6 +145,28 @@ +
+
+
+
{% trans '导入Goodreads帐号或书单' %}
+
+
+ {% csrf_token %} +
输入Goodreads链接 + + +
+
+ Goodreads用户主页链接形如 https://www.goodreads.com/user/show/12345-janedoe 将自动导入到NeoDB用户的想读、在读、已读列表,每本书的评论导入为NeoDB短评; +
+ Goodreads书单链接形如 https://www.goodreads.com/review/list/12345-janedoe?shelf=name 将自动导入成为NeoDB收藏单,每本书的评论导入为NeoDB收藏单条目备注。 +
+
+
+
+
+
+
@login_required
def import_goodreads(request):
    """Kick off a background Goodreads import for the current user.

    Expects a POST with a 'url' form field containing a Goodreads profile
    or shelf URL; flashes an info message when the import was queued, an
    error message when the URL is not recognized.  Always redirects back
    to the data page (a bare GET just redirects).
    """
    if request.method == 'POST':
        # Default to '' so a missing form field cannot pass None into the
        # importer's regex matching (re.match on None raises TypeError).
        raw_url = request.POST.get('url', '')
        if GoodreadsImporter.import_from_url(raw_url, request.user):
            messages.add_message(request, messages.INFO, _('开始后台导入。'))
        else:
            messages.add_message(request, messages.ERROR, _('无法识别链接。'))
    return redirect(reverse("users:data"))