From 745f93cd714d12fffc97c110d6eb0d581891209f Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 24 Apr 2023 00:06:27 -0400 Subject: [PATCH] improve goodreads import --- catalog/sites/goodreads.py | 12 +++++++----- journal/importers/goodreads.py | 17 +++++++++-------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/catalog/sites/goodreads.py b/catalog/sites/goodreads.py index 6e85bb72..b877d5b1 100644 --- a/catalog/sites/goodreads.py +++ b/catalog/sites/goodreads.py @@ -16,12 +16,14 @@ class GoodreadsDownloader(RetryDownloader): if response is None: return RESPONSE_NETWORK_ERROR elif response.status_code == 200: - if response.text.find("__NEXT_DATA__") != -1: + if ( + response.text.find("__NEXT_DATA__") != -1 + and response.text.find('"title"') != -1 + ): return RESPONSE_OK - else: - # Goodreads may return legacy version for a/b testing - # retry if so - return RESPONSE_NETWORK_ERROR + # Goodreads may return legacy version for a/b testing + # retry if so + return RESPONSE_NETWORK_ERROR else: return RESPONSE_INVALID_CONTENT diff --git a/journal/importers/goodreads.py b/journal/importers/goodreads.py index df880a93..e2c68fd1 100644 --- a/journal/importers/goodreads.py +++ b/journal/importers/goodreads.py @@ -1,7 +1,5 @@ import re -from lxml import html from datetime import datetime -from django.conf import settings from user_messages import api as msg import django_rq from django.utils.timezone import make_aware @@ -77,8 +75,11 @@ class GoodreadsImporter: for book in shelf["books"]: mark = Mark(user, book["book"]) if ( - mark.shelf_type == shelf_type - or mark.shelf_type == ShelfType.COMPLETE + (mark.shelf_type == shelf_type and mark.text == book["review"]) + or ( + mark.shelf_type == ShelfType.COMPLETE + and shelf_type != ShelfType.COMPLETE + ) or ( mark.shelf_type == ShelfType.PROGRESS and shelf_type == ShelfType.WISHLIST @@ -140,7 +141,7 @@ class GoodreadsImporter: "https://www.goodreads.com" + cell.xpath(".//td[@class='field actions']//a/@href")[0].strip() ) - review = "" + review = None last_updated = None date_elem = cell.xpath(".//td[@class='field date_added']//span/text()") for d in date_elem: @@ -157,7 +158,7 @@ class GoodreadsImporter: ) ) try: - c2 = BasicDownloader(url_shelf).download().html() + c2 = BasicDownloader(url_review).download().html() review_elem = c2.xpath("//div[@itemprop='reviewBody']/text()") review = ( "\n".join(p.strip() for p in review_elem) if review_elem else "" @@ -189,8 +190,8 @@ class GoodreadsImporter: "last_updated": last_updated, } ) - except Exception: - print("Error adding " + url_book) + except Exception as e: + print(f"Error adding {url_book} {e}") pass # likely just download error next_elem = content.xpath("//a[@class='next_page']/@href") url_shelf = (