Fix Goodreads ASIN/ISBN mix-up

This commit is contained in:
Your Name 2022-12-16 07:58:34 -05:00
parent 47cd239e21
commit c05aa65e3f
4 changed files with 44 additions and 11 deletions

View file

@ -1,5 +1,6 @@
from django.test import TestCase
from catalog.book.models import *
from catalog.book.utils import *
from catalog.common import *
@ -34,6 +35,14 @@ class BookTestCase(TestCase):
self.assertEqual(hyperion.isbn10, None)
def test_isbn(self):
    """Verify detect_isbn_asin classification and Edition ISBN lookups."""
    # an ISBN-10 is detected and normalized to its ISBN-13 form
    id_type, id_value = detect_isbn_asin('0553283685')
    self.assertEqual(id_type, IdType.ISBN)
    self.assertEqual(id_value, '9780553283686')
    # an ISBN-13 is detected as-is
    id_type, id_value = detect_isbn_asin('9780553283686')
    self.assertEqual(id_type, IdType.ISBN)
    # an ASIN is detected despite leading whitespace and lower case
    id_type, id_value = detect_isbn_asin(' b0043M6780')
    self.assertEqual(id_type, IdType.ASIN)
    hyperion = Edition.objects.get(title="Hyperion")
    self.assertEqual(hyperion.isbn, '9780553283686')
    self.assertEqual(hyperion.isbn10, '0553283685')
@ -82,7 +91,7 @@ class GoodreadsTestCase(TestCase):
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata.get('title'), 'Hyperion')
self.assertEqual(site.resource.metadata.get('isbn'), isbn)
self.assertEqual(site.resource.get_all_lookup_ids().get(IdType.ISBN), isbn)
self.assertEqual(site.resource.required_resources[0]['id_value'], '1383900')
edition = Edition.objects.get(primary_lookup_id_type=IdType.ISBN, primary_lookup_id_value=isbn)
resource = edition.external_resources.all().first()
@ -229,6 +238,7 @@ class MultiBookSitesTestCase(TestCase):
w3 = p3.item.works.all().first()
self.assertNotEqual(w3, w2)
p4 = SiteManager.get_site_by_url(url4).get_resource_ready()
self.assertEqual(p4.item.id, p1.item.id)
self.assertEqual(p4.item.works.all().count(), 2)
self.assertEqual(p1.item.works.all().count(), 2)
w2e = w2.editions.all().order_by('title')

View file

@ -1,3 +1,7 @@
import re
from .models import IdType
def check_digit_10(isbn):
assert len(isbn) == 9
sum = 0
@ -34,12 +38,23 @@ def isbn_13_to_10(isbn):
def is_isbn_13(isbn):
    """Return True if *isbn* is exactly 13 digits (ISBN-13 shape).

    Uses re.fullmatch rather than re.match: re.match only anchors the
    start, so a longer string whose first 13 characters are digits
    (e.g. 14 digits) would be wrongly accepted.
    """
    return re.fullmatch(r'\d{13}', isbn) is not None
def is_isbn_10(isbn):
    """Return True if *isbn* has ISBN-10 shape: 9 digits plus a final digit or 'X'.

    fullmatch anchors both ends so strings longer than 10 characters
    are rejected (re.match would accept any 10-char valid prefix).
    """
    return re.fullmatch(r'\d{9}[X0-9]', isbn) is not None
def is_asin(asin):
    """Return True if *asin* has Amazon ASIN shape: 'B' + 9 uppercase alphanumerics.

    Callers are expected to upper-case first (detect_isbn_asin does).
    fullmatch rejects trailing extra characters that re.match would allow.
    """
    return re.fullmatch(r'B[A-Z0-9]{9}', asin) is not None
def detect_isbn_asin(s):
    """Classify *s* as an ISBN or ASIN.

    Returns a (type, value) pair: (IdType.ISBN, isbn13) for ISBN input
    (ISBN-10 is converted to ISBN-13), (IdType.ASIN, asin) for ASIN
    input, or (None, None) when *s* is empty or matches neither.
    """
    candidate = s.strip().upper() if s else ''
    if is_isbn_13(candidate):
        return IdType.ISBN, candidate
    if is_isbn_10(candidate):
        return IdType.ISBN, isbn_10_to_13(candidate)
    if is_asin(candidate):
        return IdType.ASIN, candidate
    return None, None

View file

@ -176,7 +176,9 @@ class DoubanBook(AbstractSite):
}]
pd = ResourceContent(metadata=data)
pd.lookup_ids[IdType.ISBN] = isbn
t, n = detect_isbn_asin(isbn)
if t:
pd.lookup_ids[t] = n
pd.lookup_ids[IdType.CUBN] = cubn
pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(img_url, self.url)
return pd

View file

@ -1,5 +1,6 @@
from catalog.book.models import Edition, Work
from catalog.common import *
from catalog.book.utils import detect_isbn_asin
from lxml import html
import json
import logging
@ -60,10 +61,15 @@ class Goodreads(AbstractSite):
raise ParseError(self, 'Book in __NEXT_DATA__ json')
data['title'] = b['title']
data['brief'] = b['description']
data['isbn'] = b['details'].get('isbn13')
asin = b['details'].get('asin')
if asin and asin != data['isbn']:
data['asin'] = asin
ids = {}
t, n = detect_isbn_asin(b['details'].get('asin'))
if t:
ids[t] = n
t, n = detect_isbn_asin(b['details'].get('isbn13'))
if t:
ids[t] = n
# Amazon is known to sometimes list another book's ISBN as the ASIN,
# so we always overwrite the ASIN-derived ISBN with the real ISBN-13
data['pages'] = b['details'].get('numPages')
data['cover_image_url'] = b['imageUrl']
w = next(filter(lambda x: x.get('details'), o['Work']), None)
@ -76,8 +82,8 @@ class Goodreads(AbstractSite):
'url': w['editions']['webUrl'],
}]
pd = ResourceContent(metadata=data)
pd.lookup_ids[IdType.ISBN] = data.get('isbn')
pd.lookup_ids[IdType.ASIN] = data.get('asin')
pd.lookup_ids[IdType.ISBN] = ids.get(IdType.ISBN)
pd.lookup_ids[IdType.ASIN] = ids.get(IdType.ASIN)
if data["cover_image_url"]:
imgdl = BasicImageDownloader(data["cover_image_url"], self.url)
try: