improve goodreads import

2023-04-24 00:06:27 -04:00 · 2023-04-24 00:06:27 -04:00 · 745f93cd71
commit 745f93cd71
parent 81dd29a28c
2 changed files with 16 additions and 13 deletions
--- a/catalog/sites/goodreads.py
+++ b/catalog/sites/goodreads.py
@ -16,12 +16,14 @@ class GoodreadsDownloader(RetryDownloader):
        if response is None:
            return RESPONSE_NETWORK_ERROR
        elif response.status_code == 200:
-            if response.text.find("__NEXT_DATA__") != -1:
+            if (
+                response.text.find("__NEXT_DATA__") != -1
+                and response.text.find('"title"') != -1
+            ):
                return RESPONSE_OK
-            else:
-                # Goodreads may return legacy version for a/b testing
-                # retry if so
-                return RESPONSE_NETWORK_ERROR
+            # Goodreads may return legacy version for a/b testing
+            # retry if so
+            return RESPONSE_NETWORK_ERROR
        else:
            return RESPONSE_INVALID_CONTENT

--- a/journal/importers/goodreads.py
+++ b/journal/importers/goodreads.py
@ -1,7 +1,5 @@
 import re
-from lxml import html
 from datetime import datetime
-from django.conf import settings
 from user_messages import api as msg
 import django_rq
 from django.utils.timezone import make_aware
@ -77,8 +75,11 @@ class GoodreadsImporter:
                for book in shelf["books"]:
                    mark = Mark(user, book["book"])
                    if (
-                        mark.shelf_type == shelf_type
-                        or mark.shelf_type == ShelfType.COMPLETE
+                        (mark.shelf_type == shelf_type and mark.text == book["review"])
+                        or (
+                            mark.shelf_type == ShelfType.COMPLETE
+                            and shelf_type != ShelfType.COMPLETE
+                        )
                        or (
                            mark.shelf_type == ShelfType.PROGRESS
                            and shelf_type == ShelfType.WISHLIST
@ -140,7 +141,7 @@ class GoodreadsImporter:
                    "https://www.goodreads.com"
                    + cell.xpath(".//td[@class='field actions']//a/@href")[0].strip()
                )
-                review = ""
+                review = None
                last_updated = None
                date_elem = cell.xpath(".//td[@class='field date_added']//span/text()")
                for d in date_elem:
@ -157,7 +158,7 @@ class GoodreadsImporter:
                            )
                        )
                try:
-                    c2 = BasicDownloader(url_shelf).download().html()
+                    c2 = BasicDownloader(url_review).download().html()
                    review_elem = c2.xpath("//div[@itemprop='reviewBody']/text()")
                    review = (
                        "\n".join(p.strip() for p in review_elem) if review_elem else ""
@ -189,8 +190,8 @@ class GoodreadsImporter:
                            "last_updated": last_updated,
                        }
                    )
-                except Exception:
-                    print("Error adding " + url_book)
+                except Exception as e:
+                    print(f"Error adding {url_book} {e}")
                    pass  # likely just download error
            next_elem = content.xpath("//a[@class='next_page']/@href")
            url_shelf = (