Fix Goodreads ASIN/ISBN mix-up

This commit is contained in:
Your Name 2022-12-16 07:58:34 -05:00
parent 47cd239e21
commit c05aa65e3f
4 changed files with 44 additions and 11 deletions

View file

@ -1,5 +1,6 @@
from django.test import TestCase
from catalog.book.models import *
from catalog.book.utils import *
from catalog.common import *
@ -34,6 +35,14 @@ class BookTestCase(TestCase):
self.assertEqual(hyperion.isbn10, None)
def test_isbn(self):
    """Verify detect_isbn_asin classification and Edition ISBN lookups."""
    # an ISBN-10 is detected and normalized to its ISBN-13 form
    id_type, id_value = detect_isbn_asin('0553283685')
    self.assertEqual(id_type, IdType.ISBN)
    self.assertEqual(id_value, '9780553283686')
    # an ISBN-13 is detected as-is
    id_type, id_value = detect_isbn_asin('9780553283686')
    self.assertEqual(id_type, IdType.ISBN)
    # an ASIN is detected despite leading whitespace and lower case
    id_type, id_value = detect_isbn_asin(' b0043M6780')
    self.assertEqual(id_type, IdType.ASIN)
    hyperion = Edition.objects.get(title="Hyperion")
    self.assertEqual(hyperion.isbn, '9780553283686')
    self.assertEqual(hyperion.isbn10, '0553283685')
@ -82,7 +91,7 @@ class GoodreadsTestCase(TestCase):
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata.get('title'), 'Hyperion')
self.assertEqual(site.resource.metadata.get('isbn'), isbn)
self.assertEqual(site.resource.get_all_lookup_ids().get(IdType.ISBN), isbn)
self.assertEqual(site.resource.required_resources[0]['id_value'], '1383900')
edition = Edition.objects.get(primary_lookup_id_type=IdType.ISBN, primary_lookup_id_value=isbn)
resource = edition.external_resources.all().first()
@ -229,6 +238,7 @@ class MultiBookSitesTestCase(TestCase):
w3 = p3.item.works.all().first()
self.assertNotEqual(w3, w2)
p4 = SiteManager.get_site_by_url(url4).get_resource_ready()
self.assertEqual(p4.item.id, p1.item.id)
self.assertEqual(p4.item.works.all().count(), 2)
self.assertEqual(p1.item.works.all().count(), 2)
w2e = w2.editions.all().order_by('title')

View file

@ -1,3 +1,7 @@
import re
from .models import IdType
def check_digit_10(isbn):
assert len(isbn) == 9
sum = 0
@ -34,12 +38,23 @@ def isbn_13_to_10(isbn):
def is_isbn_13(isbn):
    """Return True if *isbn* is exactly 13 digits (ISBN-13 shape).

    Uses re.fullmatch rather than re.match: re.match only anchors the
    start, so a longer string whose first 13 characters are digits
    (e.g. 14 digits) would be wrongly accepted.
    """
    return re.fullmatch(r'\d{13}', isbn) is not None
def is_isbn_10(isbn):
    """Return True if *isbn* has ISBN-10 shape: 9 digits plus a final digit or 'X'.

    fullmatch anchors both ends so strings longer than 10 characters
    are rejected (re.match would accept any 10-char valid prefix).
    """
    return re.fullmatch(r'\d{9}[X0-9]', isbn) is not None
def is_asin(asin):
    """Return True if *asin* has Amazon ASIN shape: 'B' + 9 uppercase alphanumerics.

    Callers are expected to upper-case first (detect_isbn_asin does).
    fullmatch rejects trailing extra characters that re.match would allow.
    """
    return re.fullmatch(r'B[A-Z0-9]{9}', asin) is not None
def detect_isbn_asin(s):
    """Classify *s* as an ISBN or ASIN.

    Returns a (type, value) pair: (IdType.ISBN, isbn13) for ISBN input
    (ISBN-10 is converted to ISBN-13), (IdType.ASIN, asin) for ASIN
    input, or (None, None) when *s* is empty or matches neither.
    """
    candidate = s.strip().upper() if s else ''
    if is_isbn_13(candidate):
        return IdType.ISBN, candidate
    if is_isbn_10(candidate):
        return IdType.ISBN, isbn_10_to_13(candidate)
    if is_asin(candidate):
        return IdType.ASIN, candidate
    return None, None

View file

@ -176,7 +176,9 @@ class DoubanBook(AbstractSite):
}]
pd = ResourceContent(metadata=data)
pd.lookup_ids[IdType.ISBN] = isbn
t, n = detect_isbn_asin(isbn)
if t:
pd.lookup_ids[t] = n
pd.lookup_ids[IdType.CUBN] = cubn
pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(img_url, self.url)
return pd

View file

@ -1,5 +1,6 @@
from catalog.book.models import Edition, Work
from catalog.common import *
from catalog.book.utils import detect_isbn_asin
from lxml import html
import json
import logging
@ -60,10 +61,15 @@ class Goodreads(AbstractSite):
raise ParseError(self, 'Book in __NEXT_DATA__ json')
data['title'] = b['title']
data['brief'] = b['description']
data['isbn'] = b['details'].get('isbn13')
asin = b['details'].get('asin')
if asin and asin != data['isbn']:
data['asin'] = asin
ids = {}
t, n = detect_isbn_asin(b['details'].get('asin'))
if t:
ids[t] = n
t, n = detect_isbn_asin(b['details'].get('isbn13'))
if t:
ids[t] = n
# Amazon is known to sometimes list another book's ISBN as the ASIN,
# so we always overwrite the ASIN-derived ISBN with the real ISBN-13
data['pages'] = b['details'].get('numPages')
data['cover_image_url'] = b['imageUrl']
w = next(filter(lambda x: x.get('details'), o['Work']), None)
@ -76,8 +82,8 @@ class Goodreads(AbstractSite):
'url': w['editions']['webUrl'],
}]
pd = ResourceContent(metadata=data)
pd.lookup_ids[IdType.ISBN] = data.get('isbn')
pd.lookup_ids[IdType.ASIN] = data.get('asin')
pd.lookup_ids[IdType.ISBN] = ids.get(IdType.ISBN)
pd.lookup_ids[IdType.ASIN] = ids.get(IdType.ASIN)
if data["cover_image_url"]:
imgdl = BasicImageDownloader(data["cover_image_url"], self.url)
try: