diff --git a/catalog/book/tests.py b/catalog/book/tests.py
index 4229bc1c..d6dce95d 100644
--- a/catalog/book/tests.py
+++ b/catalog/book/tests.py
@@ -1,5 +1,6 @@
 from django.test import TestCase
 from catalog.book.models import *
+from catalog.book.utils import *
 from catalog.common import *
 
 
@@ -34,6 +35,17 @@ class BookTestCase(TestCase):
         self.assertEqual(hyperion.isbn10, None)
 
     def test_isbn(self):
+        t, n = detect_isbn_asin('0553283685')
+        self.assertEqual(t, IdType.ISBN)
+        self.assertEqual(n, '9780553283686')
+        t, n = detect_isbn_asin('9780553283686')
+        self.assertEqual(t, IdType.ISBN)
+        t, n = detect_isbn_asin(' b0043M6780')
+        self.assertEqual(t, IdType.ASIN)
+        # over-long input must not be accepted just because its prefix matches
+        t, n = detect_isbn_asin('97805532836861')
+        self.assertIsNone(t)
+
         hyperion = Edition.objects.get(title="Hyperion")
         self.assertEqual(hyperion.isbn, '9780553283686')
         self.assertEqual(hyperion.isbn10, '0553283685')
@@ -82,7 +94,7 @@ class GoodreadsTestCase(TestCase):
         site.get_resource_ready()
         self.assertEqual(site.ready, True)
         self.assertEqual(site.resource.metadata.get('title'), 'Hyperion')
-        self.assertEqual(site.resource.metadata.get('isbn'), isbn)
+        self.assertEqual(site.resource.get_all_lookup_ids().get(IdType.ISBN), isbn)
         self.assertEqual(site.resource.required_resources[0]['id_value'], '1383900')
         edition = Edition.objects.get(primary_lookup_id_type=IdType.ISBN, primary_lookup_id_value=isbn)
         resource = edition.external_resources.all().first()
@@ -229,6 +241,7 @@ class MultiBookSitesTestCase(TestCase):
         w3 = p3.item.works.all().first()
         self.assertNotEqual(w3, w2)
         p4 = SiteManager.get_site_by_url(url4).get_resource_ready()
+        self.assertEqual(p4.item.id, p1.item.id)
         self.assertEqual(p4.item.works.all().count(), 2)
         self.assertEqual(p1.item.works.all().count(), 2)
         w2e = w2.editions.all().order_by('title')
diff --git a/catalog/book/utils.py b/catalog/book/utils.py
index fe3e50fc..a8775ec5 100644
--- a/catalog/book/utils.py
+++ b/catalog/book/utils.py
@@ -1,3 +1,7 @@
+import re
+from .models import IdType
+
+
 def check_digit_10(isbn):
     assert len(isbn) == 9
     sum = 0
@@ -34,12 +38,33 @@ def isbn_13_to_10(isbn):
 
 
 def is_isbn_13(isbn):
-    return len(isbn) == 13
+    """Return True if *isbn* is exactly 13 ASCII digits."""
+    return re.fullmatch(r'[0-9]{13}', isbn) is not None
 
 
 def is_isbn_10(isbn):
-    return len(isbn) == 10 and isbn[0] >= '0' and isbn[0] <= '9'
+    """Return True if *isbn* is 9 ASCII digits followed by a digit or 'X'."""
+    return re.fullmatch(r'[0-9]{9}[0-9X]', isbn) is not None
 
 
 def is_asin(asin):
-    return len(asin) == 10 and asin[0].lower == 'b'
+    """Return True if *asin* is 'B' plus exactly 9 uppercase letters or digits."""
+    return re.fullmatch(r'B[A-Z0-9]{9}', asin) is not None
+
+
+def detect_isbn_asin(s):
+    """Classify *s* as an ISBN or an ASIN.
+
+    Surrounding whitespace is stripped and letters are upper-cased first.
+    Returns ``(IdType.ISBN, isbn13)`` (ISBN-10 input is converted with
+    ``isbn_10_to_13``), ``(IdType.ASIN, asin)``, or ``(None, None)`` when
+    *s* is empty or matches none of the formats.
+    """
+    n = s.strip().upper() if s else ''
+    if is_isbn_13(n):
+        return IdType.ISBN, n
+    if is_isbn_10(n):
+        return IdType.ISBN, isbn_10_to_13(n)
+    if is_asin(n):
+        return IdType.ASIN, n
+    return None, None
diff --git a/catalog/sites/douban_book.py b/catalog/sites/douban_book.py
index 8872d1f3..8532e07e 100644
--- a/catalog/sites/douban_book.py
+++ b/catalog/sites/douban_book.py
@@ -176,7 +176,9 @@ class DoubanBook(AbstractSite):
             }]
 
         pd = ResourceContent(metadata=data)
-        pd.lookup_ids[IdType.ISBN] = isbn
+        t, n = detect_isbn_asin(isbn)
+        if t:
+            pd.lookup_ids[t] = n
         pd.lookup_ids[IdType.CUBN] = cubn
         pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(img_url, self.url)
         return pd
diff --git a/catalog/sites/goodreads.py b/catalog/sites/goodreads.py
index 42c54ce1..45d3d637 100644
--- a/catalog/sites/goodreads.py
+++ b/catalog/sites/goodreads.py
@@ -1,5 +1,6 @@
 from catalog.book.models import Edition, Work
 from catalog.common import *
+from catalog.book.utils import detect_isbn_asin
 from lxml import html
 import json
 import logging
@@ -60,10 +61,15 @@ class Goodreads(AbstractSite):
             raise ParseError(self, 'Book in __NEXT_DATA__ json')
         data['title'] = b['title']
         data['brief'] = b['description']
-        data['isbn'] = b['details'].get('isbn13')
-        asin = b['details'].get('asin')
-        if asin and asin != data['isbn']:
-            data['asin'] = asin
+        ids = {}
+        # Amazon is known to sometimes reuse another book's ISBN as an ASIN,
+        # so always let the real ISBN overwrite any ASIN-converted ISBN.
+        t, n = detect_isbn_asin(b['details'].get('asin'))
+        if t:
+            ids[t] = n
+        t, n = detect_isbn_asin(b['details'].get('isbn13'))
+        if t:
+            ids[t] = n
         data['pages'] = b['details'].get('numPages')
         data['cover_image_url'] = b['imageUrl']
         w = next(filter(lambda x: x.get('details'), o['Work']), None)
@@ -76,8 +82,8 @@ class Goodreads(AbstractSite):
                 'url': w['editions']['webUrl'],
             }]
         pd = ResourceContent(metadata=data)
-        pd.lookup_ids[IdType.ISBN] = data.get('isbn')
-        pd.lookup_ids[IdType.ASIN] = data.get('asin')
+        pd.lookup_ids[IdType.ISBN] = ids.get(IdType.ISBN)
+        pd.lookup_ids[IdType.ASIN] = ids.get(IdType.ASIN)
         if data["cover_image_url"]:
             imgdl = BasicImageDownloader(data["cover_image_url"], self.url)
             try: