diff --git a/catalog/api.py b/catalog/api.py index 26eb4f37..25da154d 100644 --- a/catalog/api.py +++ b/catalog/api.py @@ -10,7 +10,11 @@ from django.utils.baseconv import base62 from django.shortcuts import render, get_object_or_404, redirect, reverse from django.http import Http404 -api = NinjaAPI(title=settings.SITE_INFO['site_name'], version="1.0.0", description=f"{settings.SITE_INFO['site_name']} API
Learn more") +api = NinjaAPI( + title=settings.SITE_INFO["site_name"], + version="1.0.0", + description=f"{settings.SITE_INFO['site_name']} API
Learn more", +) class ItemIn(Schema): diff --git a/catalog/apps.py b/catalog/apps.py index 62a2dd40..aff10ed7 100644 --- a/catalog/apps.py +++ b/catalog/apps.py @@ -2,8 +2,8 @@ from django.apps import AppConfig class CatalogConfig(AppConfig): - default_auto_field = 'django.db.models.BigAutoField' - name = 'catalog' + default_auto_field = "django.db.models.BigAutoField" + name = "catalog" def ready(self): # load key modules in proper order, make sure class inject and signal works as expected diff --git a/catalog/book/models.py b/catalog/book/models.py index 1e441fe9..18051b21 100644 --- a/catalog/book/models.py +++ b/catalog/book/models.py @@ -25,8 +25,8 @@ from .utils import * class Edition(Item): category = ItemCategory.Book - url_path = 'book' - demonstrative = _('这本书') + url_path = "book" + demonstrative = _("这本书") isbn = PrimaryLookupIdDescriptor(IdType.ISBN) asin = PrimaryLookupIdDescriptor(IdType.ASIN) @@ -35,30 +35,30 @@ class Edition(Item): # goodreads = LookupIdDescriptor(IdType.Goodreads) METADATA_COPY_LIST = [ - 'title', - 'brief', + "title", + "brief", # legacy fields - 'subtitle', - 'orig_title', - 'author', - 'translator', - 'language', - 'pub_house', - 'pub_year', - 'pub_month', - 'binding', - 'price', - 'pages', - 'contents', - 'series', - 'imprint', + "subtitle", + "orig_title", + "author", + "translator", + "language", + "pub_house", + "pub_year", + "pub_month", + "binding", + "price", + "pages", + "contents", + "series", + "imprint", ] subtitle = jsondata.CharField(null=True, blank=True, default=None) orig_title = jsondata.CharField(null=True, blank=True, default=None) - author = jsondata.ArrayField(_('作者'), null=False, blank=False, default=list) - translator = jsondata.ArrayField(_('译者'), null=True, blank=True, default=list) + author = jsondata.ArrayField(_("作者"), null=False, blank=False, default=list) + translator = jsondata.ArrayField(_("译者"), null=True, blank=True, default=list) language = jsondata.CharField(_("语言"), null=True, blank=True, default=None) - pub_house = jsondata.CharField(_('出版方'), null=True, blank=True, default=None) + pub_house = jsondata.CharField(_("出版方"), null=True, blank=True, default=None) pub_year = jsondata.IntegerField(_("发表年份"), null=True, blank=True) pub_month = jsondata.IntegerField(_("发表月份"), null=True, blank=True) binding = jsondata.CharField(null=True, blank=True, default=None) @@ -80,8 +80,11 @@ class Edition(Item): """add Work from resource.metadata['work'] if not yet""" links = resource.required_resources + resource.related_resources for w in links: - if w['model'] == 'Work': - work = Work.objects.filter(primary_lookup_id_type=w['id_type'], primary_lookup_id_value=w['id_value']).first() + if w["model"] == "Work": + work = Work.objects.filter( + primary_lookup_id_type=w["id_type"], + primary_lookup_id_value=w["id_value"], + ).first() if work and work not in self.works.all(): self.works.add(work) # if not work: @@ -90,15 +93,15 @@ class Edition(Item): class Work(Item): category = ItemCategory.Book - url_path = 'book/work' + url_path = "book/work" douban_work = PrimaryLookupIdDescriptor(IdType.DoubanBook_Work) goodreads_work = PrimaryLookupIdDescriptor(IdType.Goodreads_Work) - editions = models.ManyToManyField(Edition, related_name='works') + editions = models.ManyToManyField(Edition, related_name="works") class Series(Item): category = ItemCategory.Book - url_path = 'book/series' + url_path = "book/series" # douban_serie = LookupIdDescriptor(IdType.DoubanBook_Serie) # goodreads_serie = LookupIdDescriptor(IdType.Goodreads_Serie) diff 
--git a/catalog/book/tests.py b/catalog/book/tests.py index d6dce95d..720761fc 100644 --- a/catalog/book/tests.py +++ b/catalog/book/tests.py @@ -8,7 +8,7 @@ class BookTestCase(TestCase): def setUp(self): hyperion = Edition.objects.create(title="Hyperion") hyperion.pages = 500 - hyperion.isbn = '9780553283686' + hyperion.isbn = "9780553283686" hyperion.save() # hyperion.isbn10 = '0553283685' @@ -22,39 +22,39 @@ class BookTestCase(TestCase): self.assertEqual(hyperion.title, "Hyperion") self.assertEqual(hyperion.pages, 500) self.assertEqual(hyperion.primary_lookup_id_type, IdType.ISBN) - self.assertEqual(hyperion.primary_lookup_id_value, '9780553283686') + self.assertEqual(hyperion.primary_lookup_id_value, "9780553283686") andymion = Edition(title="Andymion", pages=42) self.assertEqual(andymion.pages, 42) def test_lookupids(self): hyperion = Edition.objects.get(title="Hyperion") - hyperion.asin = 'B004G60EHS' + hyperion.asin = "B004G60EHS" self.assertEqual(hyperion.primary_lookup_id_type, IdType.ASIN) - self.assertEqual(hyperion.primary_lookup_id_value, 'B004G60EHS') + self.assertEqual(hyperion.primary_lookup_id_value, "B004G60EHS") self.assertEqual(hyperion.isbn, None) self.assertEqual(hyperion.isbn10, None) def test_isbn(self): - t, n = detect_isbn_asin('0553283685') + t, n = detect_isbn_asin("0553283685") self.assertEqual(t, IdType.ISBN) - self.assertEqual(n, '9780553283686') - t, n = detect_isbn_asin('9780553283686') + self.assertEqual(n, "9780553283686") + t, n = detect_isbn_asin("9780553283686") self.assertEqual(t, IdType.ISBN) - t, n = detect_isbn_asin(' b0043M6780') + t, n = detect_isbn_asin(" b0043M6780") self.assertEqual(t, IdType.ASIN) hyperion = Edition.objects.get(title="Hyperion") - self.assertEqual(hyperion.isbn, '9780553283686') - self.assertEqual(hyperion.isbn10, '0553283685') - hyperion.isbn10 = '0575099437' - self.assertEqual(hyperion.isbn, '9780575099432') - self.assertEqual(hyperion.isbn10, '0575099437') + self.assertEqual(hyperion.isbn, "9780553283686") + self.assertEqual(hyperion.isbn10, "0553283685") + hyperion.isbn10 = "0575099437" + self.assertEqual(hyperion.isbn, "9780575099432") + self.assertEqual(hyperion.isbn10, "0575099437") def test_work(self): hyperion_print = Edition.objects.get(title="Hyperion") hyperion_ebook = Edition(title="Hyperion") hyperion_ebook.save() - hyperion_ebook.asin = 'B0043M6780' + hyperion_ebook.asin = "B0043M6780" hyperion = Work(title="Hyperion") hyperion.save() hyperion.editions.add(hyperion_print) @@ -69,9 +69,9 @@ class GoodreadsTestCase(TestCase): def test_parse(self): t_type = IdType.Goodreads - t_id = '77566' - t_url = 'https://www.goodreads.com/zh/book/show/77566.Hyperion' - t_url2 = 'https://www.goodreads.com/book/show/77566' + t_id = "77566" + t_url = "https://www.goodreads.com/zh/book/show/77566.Hyperion" + t_url2 = "https://www.goodreads.com/book/show/77566" p1 = SiteManager.get_site_by_id_type(t_type) p2 = SiteManager.get_site_by_url(t_url) self.assertEqual(p1.id_to_url(t_id), t_url2) @@ -79,9 +79,9 @@ class GoodreadsTestCase(TestCase): @use_local_response def test_scrape(self): - t_url = 'https://www.goodreads.com/book/show/77566.Hyperion' - t_url2 = 'https://www.goodreads.com/book/show/77566' - isbn = '9780553283686' + t_url = "https://www.goodreads.com/book/show/77566.Hyperion" + t_url2 = "https://www.goodreads.com/book/show/77566" + isbn = "9780553283686" site = SiteManager.get_site_by_url(t_url) self.assertEqual(site.ready, False) self.assertEqual(site.url, t_url2) @@ -90,39 +90,43 @@ class GoodreadsTestCase(TestCase): 
self.assertIsNotNone(site.resource) site.get_resource_ready() self.assertEqual(site.ready, True) - self.assertEqual(site.resource.metadata.get('title'), 'Hyperion') + self.assertEqual(site.resource.metadata.get("title"), "Hyperion") self.assertEqual(site.resource.get_all_lookup_ids().get(IdType.ISBN), isbn) - self.assertEqual(site.resource.required_resources[0]['id_value'], '1383900') - edition = Edition.objects.get(primary_lookup_id_type=IdType.ISBN, primary_lookup_id_value=isbn) + self.assertEqual(site.resource.required_resources[0]["id_value"], "1383900") + edition = Edition.objects.get( + primary_lookup_id_type=IdType.ISBN, primary_lookup_id_value=isbn + ) resource = edition.external_resources.all().first() self.assertEqual(resource.id_type, IdType.Goodreads) - self.assertEqual(resource.id_value, '77566') - self.assertNotEqual(resource.cover, '/media/item/default.svg') - self.assertEqual(edition.isbn, '9780553283686') - self.assertEqual(edition.title, 'Hyperion') + self.assertEqual(resource.id_value, "77566") + self.assertNotEqual(resource.cover, "/media/item/default.svg") + self.assertEqual(edition.isbn, "9780553283686") + self.assertEqual(edition.title, "Hyperion") edition.delete() site = SiteManager.get_site_by_url(t_url) self.assertEqual(site.ready, False) self.assertEqual(site.url, t_url2) site.get_resource() - self.assertEqual(site.ready, True, 'previous resource should still exist with data') + self.assertEqual( + site.ready, True, "previous resource should still exist with data" + ) @use_local_response def test_asin(self): - t_url = 'https://www.goodreads.com/book/show/45064996-hyperion' + t_url = "https://www.goodreads.com/book/show/45064996-hyperion" site = SiteManager.get_site_by_url(t_url) site.get_resource_ready() - self.assertEqual(site.resource.item.title, 'Hyperion') - self.assertEqual(site.resource.item.asin, 'B004G60EHS') + self.assertEqual(site.resource.item.title, "Hyperion") + self.assertEqual(site.resource.item.asin, "B004G60EHS") @use_local_response def test_work(self): - url = 'https://www.goodreads.com/work/editions/153313' + url = "https://www.goodreads.com/work/editions/153313" p = SiteManager.get_site_by_url(url).get_resource_ready() - self.assertEqual(p.item.title, '1984') - url1 = 'https://www.goodreads.com/book/show/3597767-rok-1984' - url2 = 'https://www.goodreads.com/book/show/40961427-1984' + self.assertEqual(p.item.title, "1984") + url1 = "https://www.goodreads.com/book/show/3597767-rok-1984" + url2 = "https://www.goodreads.com/book/show/40961427-1984" p1 = SiteManager.get_site_by_url(url1).get_resource_ready() p2 = SiteManager.get_site_by_url(url2).get_resource_ready() w1 = p1.item.works.all().first() @@ -133,9 +137,9 @@ class GoodreadsTestCase(TestCase): class GoogleBooksTestCase(TestCase): def test_parse(self): t_type = IdType.GoogleBooks - t_id = 'hV--zQEACAAJ' - t_url = 'https://books.google.com.bn/books?id=hV--zQEACAAJ&hl=ms' - t_url2 = 'https://books.google.com/books?id=hV--zQEACAAJ' + t_id = "hV--zQEACAAJ" + t_url = "https://books.google.com.bn/books?id=hV--zQEACAAJ&hl=ms" + t_url2 = "https://books.google.com/books?id=hV--zQEACAAJ" p1 = SiteManager.get_site_by_url(t_url) p2 = SiteManager.get_site_by_url(t_url2) self.assertIsNotNone(p1) @@ -146,17 +150,19 @@ class GoogleBooksTestCase(TestCase): @use_local_response def test_scrape(self): - t_url = 'https://books.google.com.bn/books?id=hV--zQEACAAJ' + t_url = "https://books.google.com.bn/books?id=hV--zQEACAAJ" site = SiteManager.get_site_by_url(t_url) self.assertEqual(site.ready, False) 
site.get_resource_ready() self.assertEqual(site.ready, True) - self.assertEqual(site.resource.metadata.get('title'), '1984 Nineteen Eighty-Four') - self.assertEqual(site.resource.metadata.get('isbn'), '9781847498571') + self.assertEqual( + site.resource.metadata.get("title"), "1984 Nineteen Eighty-Four" + ) + self.assertEqual(site.resource.metadata.get("isbn"), "9781847498571") self.assertEqual(site.resource.id_type, IdType.GoogleBooks) - self.assertEqual(site.resource.id_value, 'hV--zQEACAAJ') - self.assertEqual(site.resource.item.isbn, '9781847498571') - self.assertEqual(site.resource.item.title, '1984 Nineteen Eighty-Four') + self.assertEqual(site.resource.id_value, "hV--zQEACAAJ") + self.assertEqual(site.resource.item.isbn, "9781847498571") + self.assertEqual(site.resource.item.title, "1984 Nineteen Eighty-Four") class DoubanBookTestCase(TestCase): @@ -165,9 +171,9 @@ class DoubanBookTestCase(TestCase): def test_parse(self): t_type = IdType.DoubanBook - t_id = '35902899' - t_url = 'https://m.douban.com/book/subject/35902899/' - t_url2 = 'https://book.douban.com/subject/35902899/' + t_id = "35902899" + t_url = "https://m.douban.com/book/subject/35902899/" + t_url2 = "https://book.douban.com/subject/35902899/" p1 = SiteManager.get_site_by_url(t_url) p2 = SiteManager.get_site_by_url(t_url2) self.assertEqual(p1.url, t_url2) @@ -177,44 +183,46 @@ class DoubanBookTestCase(TestCase): @use_local_response def test_scrape(self): - t_url = 'https://book.douban.com/subject/35902899/' + t_url = "https://book.douban.com/subject/35902899/" site = SiteManager.get_site_by_url(t_url) self.assertEqual(site.ready, False) site.get_resource_ready() self.assertEqual(site.ready, True) self.assertEqual(site.resource.site_name, SiteName.Douban) - self.assertEqual(site.resource.metadata.get('title'), '1984 Nineteen Eighty-Four') - self.assertEqual(site.resource.metadata.get('isbn'), '9781847498571') + self.assertEqual( + site.resource.metadata.get("title"), "1984 Nineteen Eighty-Four" + ) + self.assertEqual(site.resource.metadata.get("isbn"), "9781847498571") self.assertEqual(site.resource.id_type, IdType.DoubanBook) - self.assertEqual(site.resource.id_value, '35902899') - self.assertEqual(site.resource.item.isbn, '9781847498571') - self.assertEqual(site.resource.item.title, '1984 Nineteen Eighty-Four') + self.assertEqual(site.resource.id_value, "35902899") + self.assertEqual(site.resource.item.isbn, "9781847498571") + self.assertEqual(site.resource.item.title, "1984 Nineteen Eighty-Four") @use_local_response def test_work(self): # url = 'https://www.goodreads.com/work/editions/153313' - url1 = 'https://book.douban.com/subject/1089243/' - url2 = 'https://book.douban.com/subject/2037260/' + url1 = "https://book.douban.com/subject/1089243/" + url2 = "https://book.douban.com/subject/2037260/" p1 = SiteManager.get_site_by_url(url1).get_resource_ready() p2 = SiteManager.get_site_by_url(url2).get_resource_ready() w1 = p1.item.works.all().first() w2 = p2.item.works.all().first() - self.assertEqual(w1.title, '黄金时代') - self.assertEqual(w2.title, '黄金时代') + self.assertEqual(w1.title, "黄金时代") + self.assertEqual(w2.title, "黄金时代") self.assertEqual(w1, w2) - editions = w1.editions.all().order_by('title') + editions = w1.editions.all().order_by("title") self.assertEqual(editions.count(), 2) - self.assertEqual(editions[0].title, 'Wang in Love and Bondage') - self.assertEqual(editions[1].title, '黄金时代') + self.assertEqual(editions[0].title, "Wang in Love and Bondage") + self.assertEqual(editions[1].title, "黄金时代") class 
MultiBookSitesTestCase(TestCase): @use_local_response def test_editions(self): # isbn = '9781847498571' - url1 = 'https://www.goodreads.com/book/show/56821625-1984' - url2 = 'https://book.douban.com/subject/35902899/' - url3 = 'https://books.google.com/books?id=hV--zQEACAAJ' + url1 = "https://www.goodreads.com/book/show/56821625-1984" + url2 = "https://book.douban.com/subject/35902899/" + url3 = "https://books.google.com/books?id=hV--zQEACAAJ" p1 = SiteManager.get_site_by_url(url1).get_resource_ready() p2 = SiteManager.get_site_by_url(url2).get_resource_ready() p3 = SiteManager.get_site_by_url(url3).get_resource_ready() @@ -224,11 +232,13 @@ class MultiBookSitesTestCase(TestCase): @use_local_response def test_works(self): # url1 and url4 has same ISBN, hence they share same Edition instance, which belongs to 2 Work instances - url1 = 'https://book.douban.com/subject/1089243/' - url2 = 'https://book.douban.com/subject/2037260/' - url3 = 'https://www.goodreads.com/book/show/59952545-golden-age' - url4 = 'https://www.goodreads.com/book/show/11798823' - p1 = SiteManager.get_site_by_url(url1).get_resource_ready() # lxml bug may break this + url1 = "https://book.douban.com/subject/1089243/" + url2 = "https://book.douban.com/subject/2037260/" + url3 = "https://www.goodreads.com/book/show/59952545-golden-age" + url4 = "https://www.goodreads.com/book/show/11798823" + p1 = SiteManager.get_site_by_url( + url1 + ).get_resource_ready() # lxml bug may break this w1 = p1.item.works.all().first() p2 = SiteManager.get_site_by_url(url2).get_resource_ready() w2 = p2.item.works.all().first() @@ -241,13 +251,13 @@ class MultiBookSitesTestCase(TestCase): self.assertEqual(p4.item.id, p1.item.id) self.assertEqual(p4.item.works.all().count(), 2) self.assertEqual(p1.item.works.all().count(), 2) - w2e = w2.editions.all().order_by('title') + w2e = w2.editions.all().order_by("title") self.assertEqual(w2e.count(), 2) - self.assertEqual(w2e[0].title, 'Wang in Love and Bondage') - self.assertEqual(w2e[1].title, '黄金时代') - w3e = w3.editions.all().order_by('title') + self.assertEqual(w2e[0].title, "Wang in Love and Bondage") + self.assertEqual(w2e[1].title, "黄金时代") + w3e = w3.editions.all().order_by("title") self.assertEqual(w3e.count(), 2) - self.assertEqual(w3e[0].title, 'Golden Age: A Novel') - self.assertEqual(w3e[1].title, '黄金时代') + self.assertEqual(w3e[0].title, "Golden Age: A Novel") + self.assertEqual(w3e[1].title, "黄金时代") e = Edition.objects.get(primary_lookup_id_value=9781662601217) - self.assertEqual(e.title, 'Golden Age: A Novel') + self.assertEqual(e.title, "Golden Age: A Novel") diff --git a/catalog/book/utils.py b/catalog/book/utils.py index 6598e65b..62e08e00 100644 --- a/catalog/book/utils.py +++ b/catalog/book/utils.py @@ -10,7 +10,7 @@ def check_digit_10(isbn): w = i + 1 sum += w * c r = sum % 11 - return 'X' if r == 10 else str(r) + return "X" if r == 10 else str(r) def check_digit_13(isbn): @@ -21,38 +21,38 @@ def check_digit_13(isbn): w = 3 if i % 2 else 1 sum += w * c r = 10 - (sum % 10) - return '0' if r == 10 else str(r) + return "0" if r == 10 else str(r) def isbn_10_to_13(isbn): if not isbn or len(isbn) != 10: return None - return '978' + isbn[:-1] + check_digit_13('978' + isbn[:-1]) + return "978" + isbn[:-1] + check_digit_13("978" + isbn[:-1]) def isbn_13_to_10(isbn): - if not isbn or len(isbn) != 13 or isbn[:3] != '978': + if not isbn or len(isbn) != 13 or isbn[:3] != "978": return None else: return isbn[3:12] + check_digit_10(isbn[3:12]) def is_isbn_13(isbn): - return re.match(r'\d{13}', isbn) 
is not None + return re.match(r"\d{13}", isbn) is not None def is_isbn_10(isbn): - return re.match(r'\d{9}[X0-9]', isbn) is not None + return re.match(r"\d{9}[X0-9]", isbn) is not None def is_asin(asin): - return re.match(r'B[A-Z0-9]{9}', asin) is not None + return re.match(r"B[A-Z0-9]{9}", asin) is not None def detect_isbn_asin(s): if not s: return None, None - n = re.sub(r'[^0-9A-Z]', '', s.upper()) + n = re.sub(r"[^0-9A-Z]", "", s.upper()) if is_isbn_13(n): return IdType.ISBN, n if is_isbn_10(n): diff --git a/catalog/common/__init__.py b/catalog/common/__init__.py index 105be222..33fc7184 100644 --- a/catalog/common/__init__.py +++ b/catalog/common/__init__.py @@ -5,4 +5,28 @@ from .scrapers import * from . import jsondata -__all__ = ('IdType', 'SiteName', 'ItemCategory', 'Item', 'ExternalResource', 'ResourceContent', 'ParseError', 'AbstractSite', 'SiteManager', 'jsondata', 'PrimaryLookupIdDescriptor', 'LookupIdDescriptor', 'get_mock_mode', 'get_mock_file', 'use_local_response', 'RetryDownloader', 'BasicDownloader', 'ProxiedDownloader', 'BasicImageDownloader', 'RESPONSE_OK', 'RESPONSE_NETWORK_ERROR', 'RESPONSE_INVALID_CONTENT', 'RESPONSE_CENSORSHIP') +__all__ = ( + "IdType", + "SiteName", + "ItemCategory", + "Item", + "ExternalResource", + "ResourceContent", + "ParseError", + "AbstractSite", + "SiteManager", + "jsondata", + "PrimaryLookupIdDescriptor", + "LookupIdDescriptor", + "get_mock_mode", + "get_mock_file", + "use_local_response", + "RetryDownloader", + "BasicDownloader", + "ProxiedDownloader", + "BasicImageDownloader", + "RESPONSE_OK", + "RESPONSE_NETWORK_ERROR", + "RESPONSE_INVALID_CONTENT", + "RESPONSE_CENSORSHIP", +) diff --git a/catalog/common/downloaders.py b/catalog/common/downloaders.py index a9d95e21..b3d7cf47 100644 --- a/catalog/common/downloaders.py +++ b/catalog/common/downloaders.py @@ -29,6 +29,7 @@ def use_local_response(func): set_mock_mode(True) func(args) set_mock_mode(False) + return _func @@ -43,9 +44,9 @@ def get_mock_mode(): def get_mock_file(url): - fn = url.replace('***REMOVED***', '1234') # Thank you, Github Action -_-! - fn = re.sub(r'[^\w]', '_', fn) - fn = re.sub(r'_key_[*A-Za-z0-9]+', '_key_8964', fn) + fn = url.replace("***REMOVED***", "1234") # Thank you, Github Action -_-! 
+ fn = re.sub(r"[^\w]", "_", fn) + fn = re.sub(r"_key_[*A-Za-z0-9]+", "_key_8964", fn) return fn @@ -61,21 +62,23 @@ class DownloadError(Exception): error = "Censored Content" else: error = "Unknown Error" - self.message = f"Download Failed: {error}{', ' + msg if msg else ''}, url: {self.url}" + self.message = ( + f"Download Failed: {error}{', ' + msg if msg else ''}, url: {self.url}" + ) super().__init__(self.message) class BasicDownloader: headers = { # 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0', - 'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', - 'Accept-Encoding': 'gzip, deflate', - 'Connection': 'keep-alive', - 'DNT': '1', - 'Upgrade-Insecure-Requests': '1', - 'Cache-Control': 'no-cache', + "User-Agent": "Mozilla/5.0 (iPad; CPU OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2", + "Accept-Encoding": "gzip, deflate", + "Connection": "keep-alive", + "DNT": "1", + "Upgrade-Insecure-Requests": "1", + "Cache-Control": "no-cache", } def __init__(self, url, headers=None): @@ -100,18 +103,28 @@ class BasicDownloader: try: if not _mock_mode: # TODO cache = get/set from redis - resp = requests.get(url, headers=self.headers, timeout=self.get_timeout()) + resp = requests.get( + url, headers=self.headers, timeout=self.get_timeout() + ) if settings.DOWNLOADER_SAVEDIR: - with open(settings.DOWNLOADER_SAVEDIR + '/' + get_mock_file(url), 'w', encoding='utf-8') as fp: + with open( + settings.DOWNLOADER_SAVEDIR + "/" + get_mock_file(url), + "w", + encoding="utf-8", + ) as fp: fp.write(resp.text) else: resp = MockResponse(self.url) response_type = self.validate_response(resp) - self.logs.append({'response_type': response_type, 'url': url, 'exception': None}) + self.logs.append( + {"response_type": response_type, "url": url, "exception": None} + ) return resp, response_type except RequestException as e: - self.logs.append({'response_type': RESPONSE_NETWORK_ERROR, 'url': url, 'exception': e}) + self.logs.append( + {"response_type": RESPONSE_NETWORK_ERROR, "url": url, "exception": e} + ) return None, RESPONSE_NETWORK_ERROR def download(self): @@ -126,16 +139,26 @@ class ProxiedDownloader(BasicDownloader): def get_proxied_urls(self): urls = [] if settings.PROXYCRAWL_KEY is not None: - urls.append(f'https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={self.url}') + urls.append( + f"https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={self.url}" + ) if settings.SCRAPESTACK_KEY is not None: # urls.append(f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={self.url}') - urls.append(f'http://api.scrapestack.com/scrape?keep_headers=1&access_key={settings.SCRAPESTACK_KEY}&url={self.url}') + urls.append( + f"http://api.scrapestack.com/scrape?keep_headers=1&access_key={settings.SCRAPESTACK_KEY}&url={self.url}" + ) if settings.SCRAPERAPI_KEY is not None: - urls.append(f'http://api.scraperapi.com/?api_key={settings.SCRAPERAPI_KEY}&url={self.url}') + urls.append( + 
f"http://api.scraperapi.com/?api_key={settings.SCRAPERAPI_KEY}&url={self.url}" + ) return urls def get_special_proxied_url(self): - return f'{settings.LOCAL_PROXY}?url={self.url}' if settings.LOCAL_PROXY is not None else None + return ( + f"{settings.LOCAL_PROXY}?url={self.url}" + if settings.LOCAL_PROXY is not None + else None + ) def download(self): urls = self.get_proxied_urls() @@ -144,7 +167,11 @@ class ProxiedDownloader(BasicDownloader): resp = None while url: resp, resp_type = self._download(url) - if resp_type == RESPONSE_OK or resp_type == RESPONSE_INVALID_CONTENT or last_try: + if ( + resp_type == RESPONSE_OK + or resp_type == RESPONSE_INVALID_CONTENT + or last_try + ): url = None elif resp_type == RESPONSE_CENSORSHIP: url = self.get_special_proxied_url() @@ -169,15 +196,15 @@ class RetryDownloader(BasicDownloader): elif self.response_type != RESPONSE_NETWORK_ERROR and retries == 0: raise DownloadError(self) elif retries > 0: - _logger.debug('Retry ' + self.url) + _logger.debug("Retry " + self.url) time.sleep((settings.DOWNLOADER_RETRIES - retries) * 0.5) - raise DownloadError(self, 'max out of retries') + raise DownloadError(self, "max out of retries") class ImageDownloaderMixin: def __init__(self, url, referer=None): if referer is not None: - self.headers['Referer'] = referer + self.headers["Referer"] = referer super().__init__(url) def validate_response(self, response): @@ -186,8 +213,10 @@ class ImageDownloaderMixin: raw_img = response.content img = Image.open(BytesIO(raw_img)) img.load() # corrupted image will trigger exception - content_type = response.headers.get('Content-Type') - self.extention = filetype.get_type(mime=content_type.partition(';')[0].strip()).extension + content_type = response.headers.get("Content-Type") + self.extention = filetype.get_type( + mime=content_type.partition(";")[0].strip() + ).extension return RESPONSE_OK except Exception: return RESPONSE_NETWORK_ERROR @@ -213,7 +242,9 @@ class ProxiedImageDownloader(ImageDownloaderMixin, ProxiedDownloader): pass -_local_response_path = str(Path(__file__).parent.parent.parent.absolute()) + '/test_data/' +_local_response_path = ( + str(Path(__file__).parent.parent.parent.absolute()) + "/test_data/" +) class MockResponse: @@ -225,23 +256,27 @@ class MockResponse: self.status_code = 200 _logger.debug(f"use local response for {url} from {fn}") except Exception: - self.content = b'Error: response file not found' + self.content = b"Error: response file not found" self.status_code = 404 _logger.debug(f"local response not found for {url} at {fn}") @property def text(self): - return self.content.decode('utf-8') + return self.content.decode("utf-8") def json(self): return json.load(StringIO(self.text)) def html(self): - return html.fromstring(self.text) # may throw exception unexpectedly due to OS bug, see https://github.com/neodb-social/neodb/issues/5 + return html.fromstring( + self.text + ) # may throw exception unexpectedly due to OS bug, see https://github.com/neodb-social/neodb/issues/5 @property def headers(self): - return {'Content-Type': 'image/jpeg' if self.url.endswith('jpg') else 'text/html'} + return { + "Content-Type": "image/jpeg" if self.url.endswith("jpg") else "text/html" + } requests.Response.html = MockResponse.html diff --git a/catalog/common/sites.py b/catalog/common/sites.py index 8f0dcdb3..676bffdc 100644 --- a/catalog/common/sites.py +++ b/catalog/common/sites.py @@ -24,25 +24,29 @@ class ResourceContent: cover_image_extention: str = None def dict(self): - return {'metadata': self.metadata, 
'lookup_ids': self.lookup_ids} + return {"metadata": self.metadata, "lookup_ids": self.lookup_ids} def to_json(self) -> str: - return json.dumps({'metadata': self.metadata, 'lookup_ids': self.lookup_ids}) + return json.dumps({"metadata": self.metadata, "lookup_ids": self.lookup_ids}) class AbstractSite: """ Abstract class to represent a site """ + SITE_NAME = None ID_TYPE = None - WIKI_PROPERTY_ID = 'P0undefined0' + WIKI_PROPERTY_ID = "P0undefined0" DEFAULT_MODEL = None URL_PATTERNS = [r"\w+://undefined/(\d+)"] @classmethod def validate_url(self, url: str): - u = next(iter([re.match(p, url) for p in self.URL_PATTERNS if re.match(p, url)]), None) + u = next( + iter([re.match(p, url) for p in self.URL_PATTERNS if re.match(p, url)]), + None, + ) return u is not None @classmethod @@ -51,15 +55,18 @@ class AbstractSite: @classmethod def id_to_url(self, id_value): - return 'https://undefined/' + id_value + return "https://undefined/" + id_value @classmethod def url_to_id(self, url: str): - u = next(iter([re.match(p, url) for p in self.URL_PATTERNS if re.match(p, url)]), None) + u = next( + iter([re.match(p, url) for p in self.URL_PATTERNS if re.match(p, url)]), + None, + ) return u[1] if u else None def __str__(self): - return f'<{self.__class__.__name__}: {self.url}>' + return f"<{self.__class__.__name__}: {self.url}>" def __init__(self, url=None): self.id_value = self.url_to_id(url) if url else None @@ -70,7 +77,9 @@ class AbstractSite: if not self.resource: self.resource = ExternalResource.objects.filter(url=self.url).first() if self.resource is None: - self.resource = ExternalResource(id_type=self.ID_TYPE, id_value=self.id_value, url=self.url) + self.resource = ExternalResource( + id_type=self.ID_TYPE, id_value=self.id_value, url=self.url + ) return self.resource def scrape(self) -> ResourceContent: @@ -91,11 +100,13 @@ class AbstractSite: model = self.DEFAULT_MODEL t, v = model.get_best_lookup_id(p.get_all_lookup_ids()) if t is not None: - p.item = model.objects.filter(primary_lookup_id_type=t, primary_lookup_id_value=v).first() + p.item = model.objects.filter( + primary_lookup_id_type=t, primary_lookup_id_value=v + ).first() if p.item is None: obj = model.copy_metadata(p.metadata) - obj['primary_lookup_id_type'] = t - obj['primary_lookup_id_value'] = v + obj["primary_lookup_id_type"] = t + obj["primary_lookup_id_value"] = v p.item = model.objects.create(**obj) return p.item @@ -103,10 +114,17 @@ class AbstractSite: def ready(self): return bool(self.resource and self.resource.ready) - def get_resource_ready(self, auto_save=True, auto_create=True, auto_link=True, preloaded_content=None, ignore_existing_content=False): + def get_resource_ready( + self, + auto_save=True, + auto_create=True, + auto_link=True, + preloaded_content=None, + ignore_existing_content=False, + ): """ Returns an ExternalResource in scraped state if possible - + Parameters ---------- auto_save : bool @@ -137,7 +155,7 @@ class AbstractSite: resource_content = self.scrape() p.update_content(resource_content) if not p.ready: - _logger.error(f'unable to get resource {self.url} ready') + _logger.error(f"unable to get resource {self.url} ready") return None if auto_create and p.item is None: self.get_item() @@ -148,9 +166,12 @@ class AbstractSite: p.item.save() if auto_link: for linked_resource in p.required_resources: - linked_site = SiteManager.get_site_by_url(linked_resource['url']) + linked_site = SiteManager.get_site_by_url(linked_resource["url"]) if linked_site: - linked_site.get_resource_ready(auto_link=False, 
preloaded_content=linked_resource.get('content')) + linked_site.get_resource_ready( + auto_link=False, + preloaded_content=linked_resource.get("content"), + ) else: _logger.error(f'unable to get site for {linked_resource["url"]}') p.item.update_linked_items_from_external_resource(p) @@ -165,7 +186,7 @@ class SiteManager: def register(target) -> Callable: id_type = target.ID_TYPE if id_type in SiteManager.registry: - raise ValueError(f'Site for {id_type} already exists') + raise ValueError(f"Site for {id_type} already exists") SiteManager.registry[id_type] = target return target @@ -175,9 +196,17 @@ class SiteManager: @staticmethod def get_site_by_url(url: str): - cls = next(filter(lambda p: p.validate_url(url), SiteManager.registry.values()), None) + cls = next( + filter(lambda p: p.validate_url(url), SiteManager.registry.values()), None + ) if cls is None: - cls = next(filter(lambda p: p.validate_url_fallback(url), SiteManager.registry.values()), None) + cls = next( + filter( + lambda p: p.validate_url_fallback(url), + SiteManager.registry.values(), + ), + None, + ) return cls(url) if cls else None @staticmethod @@ -190,5 +219,7 @@ class SiteManager: return SiteManager.get_site_by_id_type(resource.id_type) -ExternalResource.get_site = lambda resource: SiteManager.get_site_by_id_type(resource.id_type) +ExternalResource.get_site = lambda resource: SiteManager.get_site_by_id_type( + resource.id_type +) # ExternalResource.get_site = SiteManager.get_site_by_resource diff --git a/catalog/common/utils.py b/catalog/common/utils.py index 5bfc82c4..29f32a93 100644 --- a/catalog/common/utils.py +++ b/catalog/common/utils.py @@ -6,9 +6,14 @@ import uuid _logger = logging.getLogger(__name__) -DEFAULT_ITEM_COVER = 'item/default.svg' +DEFAULT_ITEM_COVER = "item/default.svg" def item_cover_path(resource, filename): - fn = timezone.now().strftime('%Y/%m/%d/') + str(uuid.uuid4()) + '.' + filename.split('.')[-1] - return 'item/' + resource.id_type + '/' + fn + fn = ( + timezone.now().strftime("%Y/%m/%d/") + + str(uuid.uuid4()) + + "." 
+ + filename.split(".")[-1] + ) + return "item/" + resource.id_type + "/" + fn diff --git a/catalog/game/models.py b/catalog/game/models.py index ea6b0b51..295b882f 100644 --- a/catalog/game/models.py +++ b/catalog/game/models.py @@ -5,66 +5,63 @@ from django.db import models class Game(Item): category = ItemCategory.Game - url_path = 'game' - demonstrative = _('这个游戏') + url_path = "game" + demonstrative = _("这个游戏") igdb = PrimaryLookupIdDescriptor(IdType.IGDB) steam = PrimaryLookupIdDescriptor(IdType.Steam) douban_game = PrimaryLookupIdDescriptor(IdType.DoubanGame) METADATA_COPY_LIST = [ - 'title', - 'brief', - 'other_title', - 'developer', - 'publisher', - 'release_date', - 'genre', - 'platform', - 'official_site', + "title", + "brief", + "other_title", + "developer", + "publisher", + "release_date", + "genre", + "platform", + "official_site", ] other_title = jsondata.ArrayField( - models.CharField(blank=True, default='', max_length=500), + models.CharField(blank=True, default="", max_length=500), null=True, blank=True, default=list, ) developer = jsondata.ArrayField( - models.CharField(blank=True, default='', max_length=500), + models.CharField(blank=True, default="", max_length=500), null=True, blank=True, default=list, ) publisher = jsondata.ArrayField( - models.CharField(blank=True, default='', max_length=500), + models.CharField(blank=True, default="", max_length=500), null=True, blank=True, default=list, ) release_date = jsondata.DateField( - auto_now=False, - auto_now_add=False, - null=True, - blank=True + auto_now=False, auto_now_add=False, null=True, blank=True ) genre = jsondata.ArrayField( - models.CharField(blank=True, default='', max_length=200), + models.CharField(blank=True, default="", max_length=200), null=True, blank=True, default=list, ) platform = jsondata.ArrayField( - models.CharField(blank=True, default='', max_length=200), + models.CharField(blank=True, default="", max_length=200), null=True, blank=True, default=list, ) official_site = jsondata.CharField( - default='', + default="", ) diff --git a/catalog/game/tests.py b/catalog/game/tests.py index bef6d4cf..4a455810 100644 --- a/catalog/game/tests.py +++ b/catalog/game/tests.py @@ -6,8 +6,8 @@ from catalog.models import * class IGDBTestCase(TestCase): def test_parse(self): t_id_type = IdType.IGDB - t_id_value = 'portal-2' - t_url = 'https://www.igdb.com/games/portal-2' + t_id_value = "portal-2" + t_url = "https://www.igdb.com/games/portal-2" site = SiteManager.get_site_by_id_type(t_id_type) self.assertIsNotNone(site) self.assertEqual(site.validate_url(t_url), True) @@ -17,34 +17,39 @@ class IGDBTestCase(TestCase): @use_local_response def test_scrape(self): - t_url = 'https://www.igdb.com/games/portal-2' + t_url = "https://www.igdb.com/games/portal-2" site = SiteManager.get_site_by_url(t_url) self.assertEqual(site.ready, False) site.get_resource_ready() self.assertEqual(site.ready, True) - self.assertEqual(site.resource.metadata['title'], 'Portal 2') + self.assertEqual(site.resource.metadata["title"], "Portal 2") self.assertIsInstance(site.resource.item, Game) - self.assertEqual(site.resource.item.steam, '620') + self.assertEqual(site.resource.item.steam, "620") @use_local_response def test_scrape_non_steam(self): - t_url = 'https://www.igdb.com/games/the-legend-of-zelda-breath-of-the-wild' + t_url = "https://www.igdb.com/games/the-legend-of-zelda-breath-of-the-wild" site = SiteManager.get_site_by_url(t_url) self.assertEqual(site.ready, False) site.get_resource_ready() self.assertEqual(site.ready, True) - 
self.assertEqual(site.resource.metadata['title'], 'The Legend of Zelda: Breath of the Wild') + self.assertEqual( + site.resource.metadata["title"], "The Legend of Zelda: Breath of the Wild" + ) self.assertIsInstance(site.resource.item, Game) self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IGDB) - self.assertEqual(site.resource.item.primary_lookup_id_value, 'the-legend-of-zelda-breath-of-the-wild') + self.assertEqual( + site.resource.item.primary_lookup_id_value, + "the-legend-of-zelda-breath-of-the-wild", + ) class SteamTestCase(TestCase): def test_parse(self): t_id_type = IdType.Steam - t_id_value = '620' - t_url = 'https://store.steampowered.com/app/620/Portal_2/' - t_url2 = 'https://store.steampowered.com/app/620' + t_id_value = "620" + t_url = "https://store.steampowered.com/app/620/Portal_2/" + t_url2 = "https://store.steampowered.com/app/620" site = SiteManager.get_site_by_id_type(t_id_type) self.assertIsNotNone(site) self.assertEqual(site.validate_url(t_url), True) @@ -54,22 +59,24 @@ class SteamTestCase(TestCase): @use_local_response def test_scrape(self): - t_url = 'https://store.steampowered.com/app/620/Portal_2/' + t_url = "https://store.steampowered.com/app/620/Portal_2/" site = SiteManager.get_site_by_url(t_url) self.assertEqual(site.ready, False) site.get_resource_ready() self.assertEqual(site.ready, True) - self.assertEqual(site.resource.metadata['title'], 'Portal 2') - self.assertEqual(site.resource.metadata['brief'], '“终身测试计划”现已升级,您可以为您自己或您的好友设计合作谜题!') + self.assertEqual(site.resource.metadata["title"], "Portal 2") + self.assertEqual( + site.resource.metadata["brief"], "“终身测试计划”现已升级,您可以为您自己或您的好友设计合作谜题!" + ) self.assertIsInstance(site.resource.item, Game) - self.assertEqual(site.resource.item.steam, '620') + self.assertEqual(site.resource.item.steam, "620") class DoubanGameTestCase(TestCase): def test_parse(self): t_id_type = IdType.DoubanGame - t_id_value = '10734307' - t_url = 'https://www.douban.com/game/10734307/' + t_id_value = "10734307" + t_url = "https://www.douban.com/game/10734307/" site = SiteManager.get_site_by_id_type(t_id_type) self.assertIsNotNone(site) self.assertEqual(site.validate_url(t_url), True) @@ -79,21 +86,21 @@ class DoubanGameTestCase(TestCase): @use_local_response def test_scrape(self): - t_url = 'https://www.douban.com/game/10734307/' + t_url = "https://www.douban.com/game/10734307/" site = SiteManager.get_site_by_url(t_url) self.assertEqual(site.ready, False) site.get_resource_ready() self.assertEqual(site.ready, True) - self.assertEqual(site.resource.metadata['title'], '传送门2 Portal 2') + self.assertEqual(site.resource.metadata["title"], "传送门2 Portal 2") self.assertIsInstance(site.resource.item, Game) - self.assertEqual(site.resource.item.douban_game, '10734307') + self.assertEqual(site.resource.item.douban_game, "10734307") class BangumiGameTestCase(TestCase): def test_parse(self): t_id_type = IdType.Bangumi - t_id_value = '15912' - t_url = 'https://bgm.tv/subject/15912' + t_id_value = "15912" + t_url = "https://bgm.tv/subject/15912" site = SiteManager.get_site_by_id_type(t_id_type) self.assertIsNotNone(site) self.assertEqual(site.validate_url(t_url), True) @@ -110,8 +117,8 @@ class BangumiGameTestCase(TestCase): class MultiGameSitesTestCase(TestCase): @use_local_response def test_games(self): - url1 = 'https://www.igdb.com/games/portal-2' - url2 = 'https://store.steampowered.com/app/620/Portal_2/' + url1 = "https://www.igdb.com/games/portal-2" + url2 = "https://store.steampowered.com/app/620/Portal_2/" p1 = 
SiteManager.get_site_by_url(url1).get_resource_ready() p2 = SiteManager.get_site_by_url(url2).get_resource_ready() self.assertEqual(p1.item.id, p2.item.id) diff --git a/catalog/management/commands/cat.py b/catalog/management/commands/cat.py index f7e162f0..b853e0b3 100644 --- a/catalog/management/commands/cat.py +++ b/catalog/management/commands/cat.py @@ -5,24 +5,24 @@ from catalog.sites import * class Command(BaseCommand): - help = 'Scrape a catalog item from external resource (and save it)' + help = "Scrape a catalog item from external resource (and save it)" def add_arguments(self, parser): - parser.add_argument('url', type=str, help='URL to scrape') + parser.add_argument("url", type=str, help="URL to scrape") parser.add_argument( - '--save', - action='store_true', - help='save to database', + "--save", + action="store_true", + help="save to database", ) def handle(self, *args, **options): - url = str(options['url']) + url = str(options["url"]) site = SiteManager.get_site_by_url(url) if site is None: - self.stdout.write(self.style.ERROR(f'Unknown site for {url}')) + self.stdout.write(self.style.ERROR(f"Unknown site for {url}")) return - self.stdout.write(f'Fetching from {site}') - if options['save']: + self.stdout.write(f"Fetching from {site}") + if options["save"]: resource = site.get_resource_ready() pprint.pp(resource.metadata) pprint.pp(site.get_item()) @@ -31,4 +31,4 @@ class Command(BaseCommand): resource = site.scrape() pprint.pp(resource.metadata) pprint.pp(resource.lookup_ids) - self.stdout.write(self.style.SUCCESS(f'Done.')) + self.stdout.write(self.style.SUCCESS(f"Done.")) diff --git a/catalog/models.py b/catalog/models.py index 306f57c1..75b13ec5 100644 --- a/catalog/models.py +++ b/catalog/models.py @@ -37,7 +37,9 @@ def all_content_types(): if _CONTENT_TYPE_LIST is None: _CONTENT_TYPE_LIST = {} for cls in Item.__subclasses__(): - _CONTENT_TYPE_LIST[cls] = ContentType.objects.get(app_label='catalog', model=cls.__name__.lower()).id + _CONTENT_TYPE_LIST[cls] = ContentType.objects.get( + app_label="catalog", model=cls.__name__.lower() + ).id return _CONTENT_TYPE_LIST @@ -46,7 +48,7 @@ def all_categories(): if _CATEGORY_LIST is None: _CATEGORY_LIST = {} for cls in Item.__subclasses__(): - c = getattr(cls, 'category', None) + c = getattr(cls, "category", None) if c not in _CATEGORY_LIST: _CATEGORY_LIST[c] = [cls] else: diff --git a/catalog/movie/models.py b/catalog/movie/models.py index 1d001437..ae4a0e6b 100644 --- a/catalog/movie/models.py +++ b/catalog/movie/models.py @@ -5,43 +5,93 @@ from django.db import models class Movie(Item): category = ItemCategory.Movie - url_path = 'movie' + url_path = "movie" imdb = PrimaryLookupIdDescriptor(IdType.IMDB) tmdb_movie = PrimaryLookupIdDescriptor(IdType.TMDB_Movie) douban_movie = PrimaryLookupIdDescriptor(IdType.DoubanMovie) - demonstrative = _('这部电影') + demonstrative = _("这部电影") METADATA_COPY_LIST = [ - 'title', - 'orig_title', - 'other_title', - 'director', - 'playwright', - 'actor', - 'genre', - 'showtime', - 'site', - 'area', - 'language', - 'year', - 'duration', - 'season_number', - 'episodes', - 'single_episode_length', - 'brief', + "title", + "orig_title", + "other_title", + "director", + "playwright", + "actor", + "genre", + "showtime", + "site", + "area", + "language", + "year", + "duration", + "season_number", + "episodes", + "single_episode_length", + "brief", ] - orig_title = jsondata.CharField(_("original title"), blank=True, default='', max_length=500) - other_title = jsondata.ArrayField(models.CharField(_("other 
title"), blank=True, default='', max_length=500), null=True, blank=True, default=list, ) - director = jsondata.ArrayField(models.CharField(_("director"), blank=True, default='', max_length=200), null=True, blank=True, default=list, ) - playwright = jsondata.ArrayField(models.CharField(_("playwright"), blank=True, default='', max_length=200), null=True, blank=True, default=list, ) - actor = jsondata.ArrayField(models.CharField(_("actor"), blank=True, default='', max_length=200), null=True, blank=True, default=list, ) - genre = jsondata.ArrayField(models.CharField(_("genre"), blank=True, default='', max_length=50), null=True, blank=True, default=list, ) # , choices=MovieGenreEnum.choices - showtime = jsondata.ArrayField(null=True, blank=True, default=list, ) - site = jsondata.URLField(_('site url'), blank=True, default='', max_length=200) - area = jsondata.ArrayField(models.CharField(_("country or region"), blank=True, default='', max_length=100, ), null=True, blank=True, default=list, ) - language = jsondata.ArrayField(models.CharField(blank=True, default='', max_length=100, ), null=True, blank=True, default=list, ) + orig_title = jsondata.CharField( + _("original title"), blank=True, default="", max_length=500 + ) + other_title = jsondata.ArrayField( + models.CharField(_("other title"), blank=True, default="", max_length=500), + null=True, + blank=True, + default=list, + ) + director = jsondata.ArrayField( + models.CharField(_("director"), blank=True, default="", max_length=200), + null=True, + blank=True, + default=list, + ) + playwright = jsondata.ArrayField( + models.CharField(_("playwright"), blank=True, default="", max_length=200), + null=True, + blank=True, + default=list, + ) + actor = jsondata.ArrayField( + models.CharField(_("actor"), blank=True, default="", max_length=200), + null=True, + blank=True, + default=list, + ) + genre = jsondata.ArrayField( + models.CharField(_("genre"), blank=True, default="", max_length=50), + null=True, + blank=True, + default=list, + ) # , choices=MovieGenreEnum.choices + showtime = jsondata.ArrayField( + null=True, + blank=True, + default=list, + ) + site = jsondata.URLField(_("site url"), blank=True, default="", max_length=200) + area = jsondata.ArrayField( + models.CharField( + _("country or region"), + blank=True, + default="", + max_length=100, + ), + null=True, + blank=True, + default=list, + ) + language = jsondata.ArrayField( + models.CharField( + blank=True, + default="", + max_length=100, + ), + null=True, + blank=True, + default=list, + ) year = jsondata.IntegerField(null=True, blank=True) season_number = jsondata.IntegerField(null=True, blank=True) episodes = jsondata.IntegerField(null=True, blank=True) single_episode_length = jsondata.IntegerField(null=True, blank=True) - duration = jsondata.CharField(blank=True, default='', max_length=200) + duration = jsondata.CharField(blank=True, default="", max_length=200) diff --git a/catalog/movie/tests.py b/catalog/movie/tests.py index 44ab58c1..8f41b4fe 100644 --- a/catalog/movie/tests.py +++ b/catalog/movie/tests.py @@ -4,8 +4,8 @@ from catalog.common import * class DoubanMovieTestCase(TestCase): def test_parse(self): - t_id = '3541415' - t_url = 'https://movie.douban.com/subject/3541415/' + t_id = "3541415" + t_url = "https://movie.douban.com/subject/3541415/" p1 = SiteManager.get_site_by_id_type(IdType.DoubanMovie) self.assertIsNotNone(p1) self.assertEqual(p1.validate_url(t_url), True) @@ -15,22 +15,24 @@ class DoubanMovieTestCase(TestCase): @use_local_response def test_scrape(self): - t_url 
= 'https://movie.douban.com/subject/3541415/' + t_url = "https://movie.douban.com/subject/3541415/" site = SiteManager.get_site_by_url(t_url) self.assertEqual(site.ready, False) - self.assertEqual(site.id_value, '3541415') + self.assertEqual(site.id_value, "3541415") site.get_resource_ready() - self.assertEqual(site.resource.metadata['title'], '盗梦空间') + self.assertEqual(site.resource.metadata["title"], "盗梦空间") self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB) - self.assertEqual(site.resource.item.__class__.__name__, 'Movie') - self.assertEqual(site.resource.item.imdb, 'tt1375666') + self.assertEqual(site.resource.item.__class__.__name__, "Movie") + self.assertEqual(site.resource.item.imdb, "tt1375666") class TMDBMovieTestCase(TestCase): def test_parse(self): - t_id = '293767' - t_url = 'https://www.themoviedb.org/movie/293767-billy-lynn-s-long-halftime-walk' - t_url2 = 'https://www.themoviedb.org/movie/293767' + t_id = "293767" + t_url = ( + "https://www.themoviedb.org/movie/293767-billy-lynn-s-long-halftime-walk" + ) + t_url2 = "https://www.themoviedb.org/movie/293767" p1 = SiteManager.get_site_by_id_type(IdType.TMDB_Movie) self.assertIsNotNone(p1) self.assertEqual(p1.validate_url(t_url), True) @@ -41,22 +43,22 @@ class TMDBMovieTestCase(TestCase): @use_local_response def test_scrape(self): - t_url = 'https://www.themoviedb.org/movie/293767' + t_url = "https://www.themoviedb.org/movie/293767" site = SiteManager.get_site_by_url(t_url) self.assertEqual(site.ready, False) - self.assertEqual(site.id_value, '293767') + self.assertEqual(site.id_value, "293767") site.get_resource_ready() - self.assertEqual(site.resource.metadata['title'], '比利·林恩的中场战事') + self.assertEqual(site.resource.metadata["title"], "比利·林恩的中场战事") self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB) - self.assertEqual(site.resource.item.__class__.__name__, 'Movie') - self.assertEqual(site.resource.item.imdb, 'tt2513074') + self.assertEqual(site.resource.item.__class__.__name__, "Movie") + self.assertEqual(site.resource.item.imdb, "tt2513074") class IMDBMovieTestCase(TestCase): def test_parse(self): - t_id = 'tt1375666' - t_url = 'https://www.imdb.com/title/tt1375666/' - t_url2 = 'https://www.imdb.com/title/tt1375666/' + t_id = "tt1375666" + t_url = "https://www.imdb.com/title/tt1375666/" + t_url2 = "https://www.imdb.com/title/tt1375666/" p1 = SiteManager.get_site_by_id_type(IdType.IMDB) self.assertIsNotNone(p1) self.assertEqual(p1.validate_url(t_url), True) @@ -67,22 +69,22 @@ class IMDBMovieTestCase(TestCase): @use_local_response def test_scrape(self): - t_url = 'https://www.imdb.com/title/tt1375666/' + t_url = "https://www.imdb.com/title/tt1375666/" site = SiteManager.get_site_by_url(t_url) self.assertEqual(site.ready, False) - self.assertEqual(site.id_value, 'tt1375666') + self.assertEqual(site.id_value, "tt1375666") site.get_resource_ready() - self.assertEqual(site.resource.metadata['title'], '盗梦空间') + self.assertEqual(site.resource.metadata["title"], "盗梦空间") self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB) - self.assertEqual(site.resource.item.imdb, 'tt1375666') + self.assertEqual(site.resource.item.imdb, "tt1375666") class MultiMovieSitesTestCase(TestCase): @use_local_response def test_movies(self): - url1 = 'https://www.themoviedb.org/movie/27205-inception' - url2 = 'https://movie.douban.com/subject/3541415/' - url3 = 'https://www.imdb.com/title/tt1375666/' + url1 = "https://www.themoviedb.org/movie/27205-inception" + url2 = 
"https://movie.douban.com/subject/3541415/" + url3 = "https://www.imdb.com/title/tt1375666/" p1 = SiteManager.get_site_by_url(url1).get_resource_ready() p2 = SiteManager.get_site_by_url(url2).get_resource_ready() p3 = SiteManager.get_site_by_url(url3).get_resource_ready() diff --git a/catalog/music/models.py b/catalog/music/models.py index b4993eae..12f777fe 100644 --- a/catalog/music/models.py +++ b/catalog/music/models.py @@ -4,35 +4,47 @@ from django.db import models class Album(Item): - url_path = 'album' + url_path = "album" category = ItemCategory.Music - demonstrative = _('这张专辑') + demonstrative = _("这张专辑") barcode = PrimaryLookupIdDescriptor(IdType.GTIN) douban_music = PrimaryLookupIdDescriptor(IdType.DoubanMusic) spotify_album = PrimaryLookupIdDescriptor(IdType.Spotify_Album) METADATA_COPY_LIST = [ - 'title', - 'other_title', - 'album_type', - 'media', - 'disc_count', - 'artist', - 'genre', - 'release_date', - 'duration', - 'company', - 'track_list', - 'brief', - 'bandcamp_album_id', + "title", + "other_title", + "album_type", + "media", + "disc_count", + "artist", + "genre", + "release_date", + "duration", + "company", + "track_list", + "brief", + "bandcamp_album_id", ] - release_date = jsondata.DateField(_('发行日期'), auto_now=False, auto_now_add=False, null=True, blank=True) + release_date = jsondata.DateField( + _("发行日期"), auto_now=False, auto_now_add=False, null=True, blank=True + ) duration = jsondata.IntegerField(_("时长"), null=True, blank=True) - artist = jsondata.ArrayField(models.CharField(_("artist"), blank=True, default='', max_length=200), null=True, blank=True, default=list) - genre = jsondata.CharField(_("流派"), blank=True, default='', max_length=100) - company = jsondata.ArrayField(models.CharField(blank=True, default='', max_length=500), null=True, blank=True, default=list) + artist = jsondata.ArrayField( + models.CharField(_("artist"), blank=True, default="", max_length=200), + null=True, + blank=True, + default=list, + ) + genre = jsondata.CharField(_("流派"), blank=True, default="", max_length=100) + company = jsondata.ArrayField( + models.CharField(blank=True, default="", max_length=500), + null=True, + blank=True, + default=list, + ) track_list = jsondata.TextField(_("曲目"), blank=True, default="") - other_title = jsondata.CharField(blank=True, default='', max_length=500) - album_type = jsondata.CharField(blank=True, default='', max_length=500) - media = jsondata.CharField(blank=True, default='', max_length=500) - bandcamp_album_id = jsondata.CharField(blank=True, default='', max_length=500) - disc_count = jsondata.IntegerField(blank=True, default='', max_length=500) + other_title = jsondata.CharField(blank=True, default="", max_length=500) + album_type = jsondata.CharField(blank=True, default="", max_length=500) + media = jsondata.CharField(blank=True, default="", max_length=500) + bandcamp_album_id = jsondata.CharField(blank=True, default="", max_length=500) + disc_count = jsondata.IntegerField(blank=True, default="", max_length=500) diff --git a/catalog/music/tests.py b/catalog/music/tests.py index a2182692..aed4e715 100644 --- a/catalog/music/tests.py +++ b/catalog/music/tests.py @@ -6,8 +6,8 @@ from catalog.models import * class SpotifyTestCase(TestCase): def test_parse(self): t_id_type = IdType.Spotify_Album - t_id_value = '65KwtzkJXw7oT819NFWmEP' - t_url = 'https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP' + t_id_value = "65KwtzkJXw7oT819NFWmEP" + t_url = "https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP" site = 
SiteManager.get_site_by_id_type(t_id_type) self.assertIsNotNone(site) self.assertEqual(site.validate_url(t_url), True) @@ -17,21 +17,21 @@ class SpotifyTestCase(TestCase): @use_local_response def test_scrape(self): - t_url = 'https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP' + t_url = "https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP" site = SiteManager.get_site_by_url(t_url) self.assertEqual(site.ready, False) site.get_resource_ready() self.assertEqual(site.ready, True) - self.assertEqual(site.resource.metadata['title'], 'The Race For Space') + self.assertEqual(site.resource.metadata["title"], "The Race For Space") self.assertIsInstance(site.resource.item, Album) - self.assertEqual(site.resource.item.barcode, '3610159662676') + self.assertEqual(site.resource.item.barcode, "3610159662676") class DoubanMusicTestCase(TestCase): def test_parse(self): t_id_type = IdType.DoubanMusic - t_id_value = '33551231' - t_url = 'https://music.douban.com/subject/33551231/' + t_id_value = "33551231" + t_url = "https://music.douban.com/subject/33551231/" site = SiteManager.get_site_by_id_type(t_id_type) self.assertIsNotNone(site) self.assertEqual(site.validate_url(t_url), True) @@ -41,21 +41,21 @@ class DoubanMusicTestCase(TestCase): @use_local_response def test_scrape(self): - t_url = 'https://music.douban.com/subject/33551231/' + t_url = "https://music.douban.com/subject/33551231/" site = SiteManager.get_site_by_url(t_url) self.assertEqual(site.ready, False) site.get_resource_ready() self.assertEqual(site.ready, True) - self.assertEqual(site.resource.metadata['title'], 'The Race For Space') + self.assertEqual(site.resource.metadata["title"], "The Race For Space") self.assertIsInstance(site.resource.item, Album) - self.assertEqual(site.resource.item.barcode, '3610159662676') + self.assertEqual(site.resource.item.barcode, "3610159662676") class MultiMusicSitesTestCase(TestCase): @use_local_response def test_albums(self): - url1 = 'https://music.douban.com/subject/33551231/' - url2 = 'https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP' + url1 = "https://music.douban.com/subject/33551231/" + url2 = "https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP" p1 = SiteManager.get_site_by_url(url1).get_resource_ready() p2 = SiteManager.get_site_by_url(url2).get_resource_ready() self.assertEqual(p1.item.id, p2.item.id) @@ -64,9 +64,9 @@ class MultiMusicSitesTestCase(TestCase): class BandcampTestCase(TestCase): def test_parse(self): t_id_type = IdType.Bandcamp - t_id_value = 'intlanthem.bandcamp.com/album/in-these-times' - t_url = 'https://intlanthem.bandcamp.com/album/in-these-times?from=hpbcw' - t_url2 = 'https://intlanthem.bandcamp.com/album/in-these-times' + t_id_value = "intlanthem.bandcamp.com/album/in-these-times" + t_url = "https://intlanthem.bandcamp.com/album/in-these-times?from=hpbcw" + t_url2 = "https://intlanthem.bandcamp.com/album/in-these-times" site = SiteManager.get_site_by_id_type(t_id_type) self.assertIsNotNone(site) self.assertEqual(site.validate_url(t_url), True) @@ -76,11 +76,11 @@ class BandcampTestCase(TestCase): @use_local_response def test_scrape(self): - t_url = 'https://intlanthem.bandcamp.com/album/in-these-times?from=hpbcw' + t_url = "https://intlanthem.bandcamp.com/album/in-these-times?from=hpbcw" site = SiteManager.get_site_by_url(t_url) self.assertEqual(site.ready, False) site.get_resource_ready() self.assertEqual(site.ready, True) - self.assertEqual(site.resource.metadata['title'], 'In These Times') - self.assertEqual(site.resource.metadata['artist'], ['Makaya McCraven']) 
+ self.assertEqual(site.resource.metadata["title"], "In These Times") + self.assertEqual(site.resource.metadata["artist"], ["Makaya McCraven"]) self.assertIsInstance(site.resource.item, Album) diff --git a/catalog/performance/models.py b/catalog/performance/models.py index e82b905d..a0b531ab 100644 --- a/catalog/performance/models.py +++ b/catalog/performance/models.py @@ -4,12 +4,12 @@ from django.utils.translation import gettext_lazy as _ class Performance(Item): category = ItemCategory.Performance - url_path = 'performance' + url_path = "performance" douban_drama = LookupIdDescriptor(IdType.DoubanDrama) - versions = jsondata.ArrayField(_('版本'), null=False, blank=False, default=list) - directors = jsondata.ArrayField(_('导演'), null=False, blank=False, default=list) - playwrights = jsondata.ArrayField(_('编剧'), null=False, blank=False, default=list) - actors = jsondata.ArrayField(_('主演'), null=False, blank=False, default=list) + versions = jsondata.ArrayField(_("版本"), null=False, blank=False, default=list) + directors = jsondata.ArrayField(_("导演"), null=False, blank=False, default=list) + playwrights = jsondata.ArrayField(_("编剧"), null=False, blank=False, default=list) + actors = jsondata.ArrayField(_("主演"), null=False, blank=False, default=list) class Meta: proxy = True diff --git a/catalog/performance/tests.py b/catalog/performance/tests.py index 8e765743..9154706a 100644 --- a/catalog/performance/tests.py +++ b/catalog/performance/tests.py @@ -7,8 +7,8 @@ class DoubanDramaTestCase(TestCase): pass def test_parse(self): - t_id = '24849279' - t_url = 'https://www.douban.com/location/drama/24849279/' + t_id = "24849279" + t_url = "https://www.douban.com/location/drama/24849279/" p1 = SiteManager.get_site_by_id_type(IdType.DoubanDrama) self.assertIsNotNone(p1) p1 = SiteManager.get_site_by_url(t_url) @@ -19,14 +19,14 @@ class DoubanDramaTestCase(TestCase): @use_local_response def test_scrape(self): - t_url = 'https://www.douban.com/location/drama/24849279/' + t_url = "https://www.douban.com/location/drama/24849279/" site = SiteManager.get_site_by_url(t_url) self.assertEqual(site.ready, False) resource = site.get_resource_ready() self.assertEqual(site.ready, True) - self.assertEqual(resource.metadata['title'], '红花侠') + self.assertEqual(resource.metadata["title"], "红花侠") item = site.get_item() - self.assertEqual(item.title, '红花侠') + self.assertEqual(item.title, "红花侠") # self.assertEqual(i.other_titles, ['スカーレットピンパーネル', 'THE SCARLET PIMPERNEL']) # self.assertEqual(len(i.brief), 545) diff --git a/catalog/podcast/models.py b/catalog/podcast/models.py index 367daea2..6c808c8a 100644 --- a/catalog/podcast/models.py +++ b/catalog/podcast/models.py @@ -3,7 +3,7 @@ from catalog.common import * class Podcast(Item): category = ItemCategory.Podcast - url_path = 'podcast' + url_path = "podcast" feed_url = PrimaryLookupIdDescriptor(IdType.Feed) apple_podcast = PrimaryLookupIdDescriptor(IdType.ApplePodcast) # ximalaya = LookupIdDescriptor(IdType.Ximalaya) diff --git a/catalog/podcast/tests.py b/catalog/podcast/tests.py index 615b8925..93140791 100644 --- a/catalog/podcast/tests.py +++ b/catalog/podcast/tests.py @@ -8,9 +8,9 @@ class ApplePodcastTestCase(TestCase): pass def test_parse(self): - t_id = '657765158' - t_url = 'https://podcasts.apple.com/us/podcast/%E5%A4%A7%E5%86%85%E5%AF%86%E8%B0%88/id657765158' - t_url2 = 'https://podcasts.apple.com/us/podcast/id657765158' + t_id = "657765158" + t_url = "https://podcasts.apple.com/us/podcast/%E5%A4%A7%E5%86%85%E5%AF%86%E8%B0%88/id657765158" + t_url2 = 
"https://podcasts.apple.com/us/podcast/id657765158" p1 = SiteManager.get_site_by_id_type(IdType.ApplePodcast) self.assertIsNotNone(p1) self.assertEqual(p1.validate_url(t_url), True) @@ -20,11 +20,14 @@ class ApplePodcastTestCase(TestCase): @use_local_response def test_scrape(self): - t_url = 'https://podcasts.apple.com/gb/podcast/the-new-yorker-radio-hour/id1050430296' + t_url = "https://podcasts.apple.com/gb/podcast/the-new-yorker-radio-hour/id1050430296" site = SiteManager.get_site_by_url(t_url) self.assertEqual(site.ready, False) - self.assertEqual(site.id_value, '1050430296') + self.assertEqual(site.id_value, "1050430296") site.get_resource_ready() - self.assertEqual(site.resource.metadata['title'], 'The New Yorker Radio Hour') + self.assertEqual(site.resource.metadata["title"], "The New Yorker Radio Hour") # self.assertEqual(site.resource.metadata['feed_url'], 'http://feeds.wnyc.org/newyorkerradiohour') - self.assertEqual(site.resource.metadata['feed_url'], 'http://feeds.feedburner.com/newyorkerradiohour') + self.assertEqual( + site.resource.metadata["feed_url"], + "http://feeds.feedburner.com/newyorkerradiohour", + ) diff --git a/catalog/sites/apple_podcast.py b/catalog/sites/apple_podcast.py index 2fd78bd8..d1bc0534 100644 --- a/catalog/sites/apple_podcast.py +++ b/catalog/sites/apple_podcast.py @@ -11,7 +11,7 @@ class ApplePodcast(AbstractSite): SITE_NAME = SiteName.ApplePodcast ID_TYPE = IdType.ApplePodcast URL_PATTERNS = [r"https://[^.]+.apple.com/\w+/podcast/*[^/?]*/id(\d+)"] - WIKI_PROPERTY_ID = 'P5842' + WIKI_PROPERTY_ID = "P5842" DEFAULT_MODEL = Podcast @classmethod @@ -19,23 +19,27 @@ class ApplePodcast(AbstractSite): return "https://podcasts.apple.com/us/podcast/id" + id_value def scrape(self): - api_url = f'https://itunes.apple.com/lookup?id={self.id_value}' + api_url = f"https://itunes.apple.com/lookup?id={self.id_value}" dl = BasicDownloader(api_url) resp = dl.download() - r = resp.json()['results'][0] - pd = ResourceContent(metadata={ - 'title': r['trackName'], - 'feed_url': r['feedUrl'], - 'hosts': [r['artistName']], - 'genres': r['genres'], - 'cover_image_url': r['artworkUrl600'], - }) - pd.lookup_ids[IdType.Feed] = pd.metadata.get('feed_url') + r = resp.json()["results"][0] + pd = ResourceContent( + metadata={ + "title": r["trackName"], + "feed_url": r["feedUrl"], + "hosts": [r["artistName"]], + "genres": r["genres"], + "cover_image_url": r["artworkUrl600"], + } + ) + pd.lookup_ids[IdType.Feed] = pd.metadata.get("feed_url") if pd.metadata["cover_image_url"]: imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url) try: pd.cover_image = imgdl.download().content pd.cover_image_extention = imgdl.extention except Exception: - _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}') + _logger.debug( + f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}' + ) return pd diff --git a/catalog/sites/bandcamp.py b/catalog/sites/bandcamp.py index 394dafa8..b96752f8 100644 --- a/catalog/sites/bandcamp.py +++ b/catalog/sites/bandcamp.py @@ -14,11 +14,9 @@ _logger = logging.getLogger(__name__) class Bandcamp(AbstractSite): SITE_NAME = SiteName.Bandcamp ID_TYPE = IdType.Bandcamp - URL_PATTERNS = [ - r"https://([a-z0-9\-]+.bandcamp.com/album/[^?#/]+)" - ] + URL_PATTERNS = [r"https://([a-z0-9\-]+.bandcamp.com/album/[^?#/]+)"] URL_PATTERN_FALLBACK = r"https://([a-z0-9\-\.]+/album/[^?#/]+)" - WIKI_PROPERTY_ID = '' + WIKI_PROPERTY_ID = "" DEFAULT_MODEL = Album @classmethod @@ -32,16 +30,16 @@ class 
Bandcamp(AbstractSite): parsed_url = urllib.parse.urlparse(url) hostname = parsed_url.netloc try: - answers = dns.resolver.query(hostname, 'CNAME') + answers = dns.resolver.query(hostname, "CNAME") for rdata in answers: - if str(rdata.target) == 'dom.bandcamp.com.': + if str(rdata.target) == "dom.bandcamp.com.": return True except Exception: pass try: - answers = dns.resolver.query(hostname, 'A') + answers = dns.resolver.query(hostname, "A") for rdata in answers: - if str(rdata.address) == '35.241.62.186': + if str(rdata.address) == "35.241.62.186": return True except Exception: pass @@ -50,34 +48,45 @@ class Bandcamp(AbstractSite): content = BasicDownloader(self.url).download().html() try: title = content.xpath("//h2[@class='trackTitle']/text()")[0].strip() - artist = [content.xpath("//div[@id='name-section']/h3/span/a/text()")[0].strip()] + artist = [ + content.xpath("//div[@id='name-section']/h3/span/a/text()")[0].strip() + ] except IndexError: raise ValueError("given url contains no valid info") genre = [] # TODO: parse tags track_list = [] - release_nodes = content.xpath("//div[@class='tralbumData tralbum-credits']/text()") - release_date = dateparser.parse(re.sub(r'releas\w+ ', '', release_nodes[0].strip())).strftime('%Y-%m-%d') if release_nodes else None + release_nodes = content.xpath( + "//div[@class='tralbumData tralbum-credits']/text()" + ) + release_date = ( + dateparser.parse( + re.sub(r"releas\w+ ", "", release_nodes[0].strip()) + ).strftime("%Y-%m-%d") + if release_nodes + else None + ) duration = None company = None brief_nodes = content.xpath("//div[@class='tralbumData tralbum-about']/text()") brief = "".join(brief_nodes) if brief_nodes else None cover_url = content.xpath("//div[@id='tralbumArt']/a/@href")[0].strip() - bandcamp_page_data = json.loads(content.xpath( - "//meta[@name='bc-page-properties']/@content")[0].strip()) - bandcamp_album_id = bandcamp_page_data['item_id'] + bandcamp_page_data = json.loads( + content.xpath("//meta[@name='bc-page-properties']/@content")[0].strip() + ) + bandcamp_album_id = bandcamp_page_data["item_id"] data = { - 'title': title, - 'artist': artist, - 'genre': genre, - 'track_list': track_list, - 'release_date': release_date, - 'duration': duration, - 'company': company, - 'brief': brief, - 'bandcamp_album_id': bandcamp_album_id, - 'cover_image_url': cover_url, + "title": title, + "artist": artist, + "genre": genre, + "track_list": track_list, + "release_date": release_date, + "duration": duration, + "company": company, + "brief": brief, + "bandcamp_album_id": bandcamp_album_id, + "cover_image_url": cover_url, } pd = ResourceContent(metadata=data) if data["cover_image_url"]: @@ -86,5 +95,7 @@ class Bandcamp(AbstractSite): pd.cover_image = imgdl.download().content pd.cover_image_extention = imgdl.extention except Exception: - _logger.debug(f'failed to download cover for {self.url} from {data["cover_image_url"]}') + _logger.debug( + f'failed to download cover for {self.url} from {data["cover_image_url"]}' + ) return pd diff --git a/catalog/sites/bangumi.py b/catalog/sites/bangumi.py index 6be9bd6b..5c95b957 100644 --- a/catalog/sites/bangumi.py +++ b/catalog/sites/bangumi.py @@ -13,7 +13,7 @@ class Bangumi(AbstractSite): URL_PATTERNS = [ r"https://bgm\.tv/subject/(\d+)", ] - WIKI_PROPERTY_ID = '' + WIKI_PROPERTY_ID = "" DEFAULT_MODEL = None @classmethod diff --git a/catalog/sites/douban.py b/catalog/sites/douban.py index b26d42fc..6f47af2e 100644 --- a/catalog/sites/douban.py +++ b/catalog/sites/douban.py @@ -13,14 +13,17 @@ class 
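The fallback URL validation above decides whether an arbitrary hostname is a Bandcamp custom domain by inspecting DNS. A condensed sketch of that check, assuming dnspython; it uses the newer resolve() in place of the deprecated query(), with the CNAME target and IP taken from the diff:

    import dns.resolver
    from urllib.parse import urlparse

    def looks_like_bandcamp(url: str) -> bool:
        hostname = urlparse(url).netloc
        for rdtype in ("CNAME", "A"):
            try:
                for rdata in dns.resolver.resolve(hostname, rdtype):
                    value = str(getattr(rdata, "target", getattr(rdata, "address", "")))
                    if value in ("dom.bandcamp.com.", "35.241.62.186"):
                        return True
            except Exception:
                continue
        return False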
DoubanDownloader(ProxiedDownloader): elif response.status_code == 204: return RESPONSE_CENSORSHIP elif response.status_code == 200: - content = response.content.decode('utf-8') - if content.find('关于豆瓣') == -1: + content = response.content.decode("utf-8") + if content.find("关于豆瓣") == -1: # if content.find('你的 IP 发出') == -1: # error = error + 'Content not authentic' # response is garbage # else: # error = error + 'IP banned' return RESPONSE_NETWORK_ERROR - elif content.find('页面不存在') != -1 or content.find('呃... 你想访问的条目豆瓣不收录。') != -1: # re.search('不存在[^<]+', content, re.MULTILINE): + elif ( + content.find("页面不存在") != -1 + or content.find("呃... 你想访问的条目豆瓣不收录。") != -1 + ): # re.search('不存在[^<]+', content, re.MULTILINE): return RESPONSE_CENSORSHIP else: return RESPONSE_OK diff --git a/catalog/sites/douban_book.py b/catalog/sites/douban_book.py index 22ae9119..2bf11908 100644 --- a/catalog/sites/douban_book.py +++ b/catalog/sites/douban_book.py @@ -12,8 +12,11 @@ _logger = logging.getLogger(__name__) class DoubanBook(AbstractSite): SITE_NAME = SiteName.Douban ID_TYPE = IdType.DoubanBook - URL_PATTERNS = [r"\w+://book\.douban\.com/subject/(\d+)/{0,1}", r"\w+://m.douban.com/book/subject/(\d+)/{0,1}"] - WIKI_PROPERTY_ID = '?' + URL_PATTERNS = [ + r"\w+://book\.douban\.com/subject/(\d+)/{0,1}", + r"\w+://m.douban.com/book/subject/(\d+)/{0,1}", + ] + WIKI_PROPERTY_ID = "?" DEFAULT_MODEL = Edition @classmethod @@ -23,31 +26,40 @@ class DoubanBook(AbstractSite): def scrape(self): content = DoubanDownloader(self.url).download().html() - isbn_elem = content.xpath("//div[@id='info']//span[text()='ISBN:']/following::text()") + isbn_elem = content.xpath( + "//div[@id='info']//span[text()='ISBN:']/following::text()" + ) isbn = isbn_elem[0].strip() if isbn_elem else None title_elem = content.xpath("/html/body//h1/span/text()") - title = title_elem[0].strip() if title_elem else f"Unknown Title {self.id_value}" + title = ( + title_elem[0].strip() if title_elem else f"Unknown Title {self.id_value}" + ) subtitle_elem = content.xpath( - "//div[@id='info']//span[text()='副标题:']/following::text()") + "//div[@id='info']//span[text()='副标题:']/following::text()" + ) subtitle = subtitle_elem[0].strip()[:500] if subtitle_elem else None orig_title_elem = content.xpath( - "//div[@id='info']//span[text()='原作名:']/following::text()") + "//div[@id='info']//span[text()='原作名:']/following::text()" + ) orig_title = orig_title_elem[0].strip()[:500] if orig_title_elem else None language_elem = content.xpath( - "//div[@id='info']//span[text()='语言:']/following::text()") + "//div[@id='info']//span[text()='语言:']/following::text()" + ) language = language_elem[0].strip() if language_elem else None pub_house_elem = content.xpath( - "//div[@id='info']//span[text()='出版社:']/following::text()") + "//div[@id='info']//span[text()='出版社:']/following::text()" + ) pub_house = pub_house_elem[0].strip() if pub_house_elem else None pub_date_elem = content.xpath( - "//div[@id='info']//span[text()='出版年:']/following::text()") - pub_date = pub_date_elem[0].strip() if pub_date_elem else '' + "//div[@id='info']//span[text()='出版年:']/following::text()" + ) + pub_date = pub_date_elem[0].strip() if pub_date_elem else "" year_month_day = RE_NUMBERS.findall(pub_date) if len(year_month_day) in (2, 3): pub_year = int(year_month_day[0]) @@ -60,45 +72,62 @@ class DoubanBook(AbstractSite): pub_month = None if pub_year and pub_month and pub_year < pub_month: pub_year, pub_month = pub_month, pub_year - pub_year = None if pub_year is not None and pub_year not in range( - 0, 
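Most fields in DoubanBook.scrape are pulled with the same xpath shape, a labelled span followed by a text node. A small helper along these lines (not part of the diff, shown only to make the pattern explicit) would capture it:

    from lxml import html

    def field_after_label(doc, label):
        # Douban book pages render fields as <span>label:</span> followed by a text
        # node inside div#info; returns the first stripped value or None.
        nodes = doc.xpath(f"//div[@id='info']//span[text()='{label}']/following::text()")
        return nodes[0].strip() if nodes else None

    # e.g. field_after_label(content, "ISBN:") or field_after_label(content, "出版社:")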
3000) else pub_year - pub_month = None if pub_month is not None and pub_month not in range( - 1, 12) else pub_month + pub_year = ( + None + if pub_year is not None and pub_year not in range(0, 3000) + else pub_year + ) + pub_month = ( + None + if pub_month is not None and pub_month not in range(1, 12) + else pub_month + ) binding_elem = content.xpath( - "//div[@id='info']//span[text()='装帧:']/following::text()") + "//div[@id='info']//span[text()='装帧:']/following::text()" + ) binding = binding_elem[0].strip() if binding_elem else None price_elem = content.xpath( - "//div[@id='info']//span[text()='定价:']/following::text()") + "//div[@id='info']//span[text()='定价:']/following::text()" + ) price = price_elem[0].strip() if price_elem else None pages_elem = content.xpath( - "//div[@id='info']//span[text()='页数:']/following::text()") + "//div[@id='info']//span[text()='页数:']/following::text()" + ) pages = pages_elem[0].strip() if pages_elem else None if pages is not None: - pages = int(RE_NUMBERS.findall(pages)[ - 0]) if RE_NUMBERS.findall(pages) else None + pages = ( + int(RE_NUMBERS.findall(pages)[0]) if RE_NUMBERS.findall(pages) else None + ) if pages and (pages > 999999 or pages < 1): pages = None brief_elem = content.xpath( - "//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()") - brief = '\n'.join(p.strip() - for p in brief_elem) if brief_elem else None + "//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()" + ) + brief = "\n".join(p.strip() for p in brief_elem) if brief_elem else None contents = None try: contents_elem = content.xpath( - "//h2/span[text()='目录']/../following-sibling::div[1]")[0] + "//h2/span[text()='目录']/../following-sibling::div[1]" + )[0] # if next the id of next sibling contains `dir`, that would be the full contents if "dir" in contents_elem.getnext().xpath("@id")[0]: contents_elem = contents_elem.getnext() - contents = '\n'.join(p.strip() for p in contents_elem.xpath( - "text()")[:-2]) if contents_elem is not None else None + contents = ( + "\n".join(p.strip() for p in contents_elem.xpath("text()")[:-2]) + if contents_elem is not None + else None + ) else: - contents = '\n'.join(p.strip() for p in contents_elem.xpath( - "text()")) if contents_elem is not None else None + contents = ( + "\n".join(p.strip() for p in contents_elem.xpath("text()")) + if contents_elem is not None + else None + ) except Exception: pass @@ -106,82 +135,97 @@ class DoubanBook(AbstractSite): img_url = img_url_elem[0].strip() if img_url_elem else None # there are two html formats for authors and translators - authors_elem = content.xpath("""//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/ - preceding-sibling::a[preceding-sibling::span[text()='作者:']]/text()""") + authors_elem = content.xpath( + """//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/ + preceding-sibling::a[preceding-sibling::span[text()='作者:']]/text()""" + ) if not authors_elem: authors_elem = content.xpath( - """//div[@id='info']//span[text()=' 作者']/following-sibling::a/text()""") + """//div[@id='info']//span[text()=' 作者']/following-sibling::a/text()""" + ) if authors_elem: authors = [] for author in authors_elem: - authors.append(RE_WHITESPACES.sub(' ', author.strip())[:200]) + authors.append(RE_WHITESPACES.sub(" ", author.strip())[:200]) else: authors = None - translators_elem = content.xpath("""//div[@id='info']//span[text()='译者:']/following-sibling::br[1]/ - 
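The publication-date handling above swaps obviously reversed year/month pairs and drops implausible values. A standalone variant of that normalization; note the month bound is written as range(1, 13) here so that December passes the check:

    import re

    RE_NUMBERS = re.compile(r"\d+")

    def parse_pub_date(pub_date: str):
        nums = RE_NUMBERS.findall(pub_date)
        year = int(nums[0]) if nums else None
        month = int(nums[1]) if len(nums) > 1 else None
        if year and month and year < month:  # handles "4 2022" style input
            year, month = month, year
        if year is not None and year not in range(0, 3000):
            year = None
        if month is not None and month not in range(1, 13):  # 1..12 inclusive
            month = None
        return year, month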
preceding-sibling::a[preceding-sibling::span[text()='译者:']]/text()""") + translators_elem = content.xpath( + """//div[@id='info']//span[text()='译者:']/following-sibling::br[1]/ + preceding-sibling::a[preceding-sibling::span[text()='译者:']]/text()""" + ) if not translators_elem: translators_elem = content.xpath( - """//div[@id='info']//span[text()=' 译者']/following-sibling::a/text()""") + """//div[@id='info']//span[text()=' 译者']/following-sibling::a/text()""" + ) if translators_elem: translators = [] for translator in translators_elem: - translators.append(RE_WHITESPACES.sub(' ', translator.strip())) + translators.append(RE_WHITESPACES.sub(" ", translator.strip())) else: translators = None cncode_elem = content.xpath( - "//div[@id='info']//span[text()='统一书号:']/following::text()") + "//div[@id='info']//span[text()='统一书号:']/following::text()" + ) cubn = cncode_elem[0].strip() if cncode_elem else None series_elem = content.xpath( - "//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()") + "//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()" + ) series = series_elem[0].strip() if series_elem else None imprint_elem = content.xpath( - "//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()") + "//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()" + ) imprint = imprint_elem[0].strip() if imprint_elem else None data = { - 'title': title, - 'subtitle': subtitle, - 'orig_title': orig_title, - 'author': authors, - 'translator': translators, - 'language': language, - 'pub_house': pub_house, - 'pub_year': pub_year, - 'pub_month': pub_month, - 'binding': binding, - 'price': price, - 'pages': pages, - 'isbn': isbn, - 'cubn': cubn, - 'brief': brief, - 'contents': contents, - 'series': series, - 'imprint': imprint, - 'cover_image_url': img_url, + "title": title, + "subtitle": subtitle, + "orig_title": orig_title, + "author": authors, + "translator": translators, + "language": language, + "pub_house": pub_house, + "pub_year": pub_year, + "pub_month": pub_month, + "binding": binding, + "price": price, + "pages": pages, + "isbn": isbn, + "cubn": cubn, + "brief": brief, + "contents": contents, + "series": series, + "imprint": imprint, + "cover_image_url": img_url, } - works_element = content.xpath('//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href') + works_element = content.xpath( + '//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href' + ) if works_element: - r = re.match(r'\w+://book.douban.com/works/(\d+)', works_element[0]) - data['required_resources'] = [{ - 'model': 'Work', - 'id_type': IdType.DoubanBook_Work, - 'id_value': r[1] if r else None, - 'title': data['title'], - 'url': works_element[0], - 'content': {'metadata': {'title': data['title']}} - }] + r = re.match(r"\w+://book.douban.com/works/(\d+)", works_element[0]) + data["required_resources"] = [ + { + "model": "Work", + "id_type": IdType.DoubanBook_Work, + "id_value": r[1] if r else None, + "title": data["title"], + "url": works_element[0], + "content": {"metadata": {"title": data["title"]}}, + } + ] pd = ResourceContent(metadata=data) t, n = detect_isbn_asin(isbn) if t: pd.lookup_ids[t] = n pd.lookup_ids[IdType.CUBN] = cubn - pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(img_url, self.url) + pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image( + img_url, self.url + ) return pd @@ -189,7 +233,7 @@ class DoubanBook(AbstractSite): class DoubanBook_Work(AbstractSite): ID_TYPE = 
IdType.DoubanBook_Work URL_PATTERNS = [r"\w+://book\.douban\.com/works/(\d+)"] - WIKI_PROPERTY_ID = '?' + WIKI_PROPERTY_ID = "?" DEFAULT_MODEL = Work @classmethod @@ -199,10 +243,12 @@ class DoubanBook_Work(AbstractSite): def scrape(self): content = DoubanDownloader(self.url).download().html() title_elem = content.xpath("//h1/text()") - title = title_elem[0].split('全部版本(')[0].strip() if title_elem else None + title = title_elem[0].split("全部版本(")[0].strip() if title_elem else None if not title: - raise ParseError(self, 'title') - pd = ResourceContent(metadata={ - 'title': title, - }) + raise ParseError(self, "title") + pd = ResourceContent( + metadata={ + "title": title, + } + ) return pd diff --git a/catalog/sites/douban_drama.py b/catalog/sites/douban_drama.py index 8740a61c..a4185cbb 100644 --- a/catalog/sites/douban_drama.py +++ b/catalog/sites/douban_drama.py @@ -12,7 +12,7 @@ class DoubanDrama(AbstractSite): SITE_NAME = SiteName.Douban ID_TYPE = IdType.DoubanDrama URL_PATTERNS = [r"\w+://www.douban.com/location/drama/(\d+)/"] - WIKI_PROPERTY_ID = 'P6443' + WIKI_PROPERTY_ID = "P6443" DEFAULT_MODEL = Performance @classmethod @@ -29,24 +29,51 @@ class DoubanDrama(AbstractSite): else: raise ParseError(self, "title") - data['other_titles'] = [s.strip() for s in title_elem[1:]] - other_title_elem = h.xpath("//dl//dt[text()='又名:']/following::dd[@itemprop='name']/text()") + data["other_titles"] = [s.strip() for s in title_elem[1:]] + other_title_elem = h.xpath( + "//dl//dt[text()='又名:']/following::dd[@itemprop='name']/text()" + ) if len(other_title_elem) > 0: - data['other_titles'].append(other_title_elem[0].strip()) + data["other_titles"].append(other_title_elem[0].strip()) plot_elem = h.xpath("//div[@id='link-report']/text()") if len(plot_elem) == 0: plot_elem = h.xpath("//div[@class='abstract']/text()") - data['brief'] = '\n'.join(plot_elem) if len(plot_elem) > 0 else '' + data["brief"] = "\n".join(plot_elem) if len(plot_elem) > 0 else "" - data['genres'] = [s.strip() for s in h.xpath("//dl//dt[text()='类型:']/following-sibling::dd[@itemprop='genre']/text()")] - data['versions'] = [s.strip() for s in h.xpath("//dl//dt[text()='版本:']/following-sibling::dd[@class='titles']/a//text()")] - data['directors'] = [s.strip() for s in h.xpath("//div[@class='meta']/dl//dt[text()='导演:']/following-sibling::dd/a[@itemprop='director']//text()")] - data['playwrights'] = [s.strip() for s in h.xpath("//div[@class='meta']/dl//dt[text()='编剧:']/following-sibling::dd/a[@itemprop='author']//text()")] - data['actors'] = [s.strip() for s in h.xpath("//div[@class='meta']/dl//dt[text()='主演:']/following-sibling::dd/a[@itemprop='actor']//text()")] + data["genres"] = [ + s.strip() + for s in h.xpath( + "//dl//dt[text()='类型:']/following-sibling::dd[@itemprop='genre']/text()" + ) + ] + data["versions"] = [ + s.strip() + for s in h.xpath( + "//dl//dt[text()='版本:']/following-sibling::dd[@class='titles']/a//text()" + ) + ] + data["directors"] = [ + s.strip() + for s in h.xpath( + "//div[@class='meta']/dl//dt[text()='导演:']/following-sibling::dd/a[@itemprop='director']//text()" + ) + ] + data["playwrights"] = [ + s.strip() + for s in h.xpath( + "//div[@class='meta']/dl//dt[text()='编剧:']/following-sibling::dd/a[@itemprop='author']//text()" + ) + ] + data["actors"] = [ + s.strip() + for s in h.xpath( + "//div[@class='meta']/dl//dt[text()='主演:']/following-sibling::dd/a[@itemprop='actor']//text()" + ) + ] img_url_elem = h.xpath("//img[@itemprop='image']/@src") - data['cover_image_url'] = img_url_elem[0].strip() if img_url_elem else 
None + data["cover_image_url"] = img_url_elem[0].strip() if img_url_elem else None pd = ResourceContent(metadata=data) if pd.metadata["cover_image_url"]: @@ -55,5 +82,7 @@ class DoubanDrama(AbstractSite): pd.cover_image = imgdl.download().content pd.cover_image_extention = imgdl.extention except Exception: - _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}') + _logger.debug( + f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}' + ) return pd diff --git a/catalog/sites/douban_game.py b/catalog/sites/douban_game.py index 210002f2..d639cdd4 100644 --- a/catalog/sites/douban_game.py +++ b/catalog/sites/douban_game.py @@ -12,8 +12,11 @@ _logger = logging.getLogger(__name__) class DoubanGame(AbstractSite): SITE_NAME = SiteName.Douban ID_TYPE = IdType.DoubanGame - URL_PATTERNS = [r"\w+://www\.douban\.com/game/(\d+)/{0,1}", r"\w+://m.douban.com/game/subject/(\d+)/{0,1}"] - WIKI_PROPERTY_ID = '' + URL_PATTERNS = [ + r"\w+://www\.douban\.com/game/(\d+)/{0,1}", + r"\w+://m.douban.com/game/subject/(\d+)/{0,1}", + ] + WIKI_PROPERTY_ID = "" DEFAULT_MODEL = Game @classmethod @@ -29,49 +32,69 @@ class DoubanGame(AbstractSite): raise ParseError(self, "title") other_title_elem = content.xpath( - "//dl[@class='game-attr']//dt[text()='别名:']/following-sibling::dd[1]/text()") - other_title = other_title_elem[0].strip().split(' / ') if other_title_elem else None + "//dl[@class='game-attr']//dt[text()='别名:']/following-sibling::dd[1]/text()" + ) + other_title = ( + other_title_elem[0].strip().split(" / ") if other_title_elem else None + ) developer_elem = content.xpath( - "//dl[@class='game-attr']//dt[text()='开发商:']/following-sibling::dd[1]/text()") - developer = developer_elem[0].strip().split(' / ') if developer_elem else None + "//dl[@class='game-attr']//dt[text()='开发商:']/following-sibling::dd[1]/text()" + ) + developer = developer_elem[0].strip().split(" / ") if developer_elem else None publisher_elem = content.xpath( - "//dl[@class='game-attr']//dt[text()='发行商:']/following-sibling::dd[1]/text()") - publisher = publisher_elem[0].strip().split(' / ') if publisher_elem else None + "//dl[@class='game-attr']//dt[text()='发行商:']/following-sibling::dd[1]/text()" + ) + publisher = publisher_elem[0].strip().split(" / ") if publisher_elem else None platform_elem = content.xpath( - "//dl[@class='game-attr']//dt[text()='平台:']/following-sibling::dd[1]/a/text()") + "//dl[@class='game-attr']//dt[text()='平台:']/following-sibling::dd[1]/a/text()" + ) platform = platform_elem if platform_elem else None genre_elem = content.xpath( - "//dl[@class='game-attr']//dt[text()='类型:']/following-sibling::dd[1]/a/text()") + "//dl[@class='game-attr']//dt[text()='类型:']/following-sibling::dd[1]/a/text()" + ) genre = None if genre_elem: - genre = [g for g in genre_elem if g != '游戏'] + genre = [g for g in genre_elem if g != "游戏"] date_elem = content.xpath( - "//dl[@class='game-attr']//dt[text()='发行日期:']/following-sibling::dd[1]/text()") - release_date = dateparser.parse(date_elem[0].strip()).strftime('%Y-%m-%d') if date_elem else None + "//dl[@class='game-attr']//dt[text()='发行日期:']/following-sibling::dd[1]/text()" + ) + release_date = ( + dateparser.parse(date_elem[0].strip()).strftime("%Y-%m-%d") + if date_elem + else None + ) brief_elem = content.xpath("//div[@class='mod item-desc']/p/text()") - brief = '\n'.join(brief_elem) if brief_elem else None + brief = "\n".join(brief_elem) if brief_elem else None img_url_elem = content.xpath( - 
"//div[@class='item-subject-info']/div[@class='pic']//img/@src") + "//div[@class='item-subject-info']/div[@class='pic']//img/@src" + ) img_url = img_url_elem[0].strip() if img_url_elem else None - pd = ResourceContent(metadata={ - 'title': title, - 'other_title': other_title, - 'developer': developer, - 'publisher': publisher, - 'release_date': release_date, - 'genre': genre, - 'platform': platform, - 'brief': brief, - 'cover_image_url': img_url - }) + pd = ResourceContent( + metadata={ + "title": title, + "other_title": other_title, + "developer": developer, + "publisher": publisher, + "release_date": release_date, + "genre": genre, + "platform": platform, + "brief": brief, + "cover_image_url": img_url, + } + ) if pd.metadata["cover_image_url"]: - pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(pd.metadata['cover_image_url'], self.url) + ( + pd.cover_image, + pd.cover_image_extention, + ) = BasicImageDownloader.download_image( + pd.metadata["cover_image_url"], self.url + ) return pd diff --git a/catalog/sites/douban_movie.py b/catalog/sites/douban_movie.py index d00ddc16..1a246887 100644 --- a/catalog/sites/douban_movie.py +++ b/catalog/sites/douban_movie.py @@ -15,8 +15,11 @@ _logger = logging.getLogger(__name__) class DoubanMovie(AbstractSite): SITE_NAME = SiteName.Douban ID_TYPE = IdType.DoubanMovie - URL_PATTERNS = [r"\w+://movie\.douban\.com/subject/(\d+)/{0,1}", r"\w+://m.douban.com/movie/subject/(\d+)/{0,1}"] - WIKI_PROPERTY_ID = '?' + URL_PATTERNS = [ + r"\w+://movie\.douban\.com/subject/(\d+)/{0,1}", + r"\w+://m.douban.com/movie/subject/(\d+)/{0,1}", + ] + WIKI_PROPERTY_ID = "?" # no DEFAULT_MODEL as it may be either TV Season and Movie @classmethod @@ -27,16 +30,16 @@ class DoubanMovie(AbstractSite): content = DoubanDownloader(self.url).download().html() try: - raw_title = content.xpath( - "//span[@property='v:itemreviewed']/text()")[0].strip() + raw_title = content.xpath("//span[@property='v:itemreviewed']/text()")[ + 0 + ].strip() except IndexError: - raise ParseError(self, 'title') + raise ParseError(self, "title") - orig_title = content.xpath( - "//img[@rel='v:image']/@alt")[0].strip() + orig_title = content.xpath("//img[@rel='v:image']/@alt")[0].strip() title = raw_title.split(orig_title)[0].strip() # if has no chinese title - if title == '': + if title == "": title = orig_title if title == orig_title: @@ -44,107 +47,134 @@ class DoubanMovie(AbstractSite): # there are two html formats for authors and translators other_title_elem = content.xpath( - "//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]") - other_title = other_title_elem[0].strip().split( - ' / ') if other_title_elem else None + "//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]" + ) + other_title = ( + other_title_elem[0].strip().split(" / ") if other_title_elem else None + ) imdb_elem = content.xpath( - "//div[@id='info']//span[text()='IMDb链接:']/following-sibling::a[1]/text()") + "//div[@id='info']//span[text()='IMDb链接:']/following-sibling::a[1]/text()" + ) if not imdb_elem: imdb_elem = content.xpath( - "//div[@id='info']//span[text()='IMDb:']/following-sibling::text()[1]") + "//div[@id='info']//span[text()='IMDb:']/following-sibling::text()[1]" + ) imdb_code = imdb_elem[0].strip() if imdb_elem else None director_elem = content.xpath( - "//div[@id='info']//span[text()='导演']/following-sibling::span[1]/a/text()") + "//div[@id='info']//span[text()='导演']/following-sibling::span[1]/a/text()" + ) director = director_elem if director_elem else None 
playwright_elem = content.xpath( - "//div[@id='info']//span[text()='编剧']/following-sibling::span[1]/a/text()") - playwright = list(map(lambda a: a[:200], playwright_elem)) if playwright_elem else None + "//div[@id='info']//span[text()='编剧']/following-sibling::span[1]/a/text()" + ) + playwright = ( + list(map(lambda a: a[:200], playwright_elem)) if playwright_elem else None + ) actor_elem = content.xpath( - "//div[@id='info']//span[text()='主演']/following-sibling::span[1]/a/text()") + "//div[@id='info']//span[text()='主演']/following-sibling::span[1]/a/text()" + ) actor = list(map(lambda a: a[:200], actor_elem)) if actor_elem else None genre_elem = content.xpath("//span[@property='v:genre']/text()") genre = [] if genre_elem: for g in genre_elem: - g = g.split(' ')[0] - if g == '紀錄片': # likely some original data on douban was corrupted - g = '纪录片' - elif g == '鬼怪': - g = '惊悚' + g = g.split(" ")[0] + if g == "紀錄片": # likely some original data on douban was corrupted + g = "纪录片" + elif g == "鬼怪": + g = "惊悚" genre.append(g) - showtime_elem = content.xpath( - "//span[@property='v:initialReleaseDate']/text()") + showtime_elem = content.xpath("//span[@property='v:initialReleaseDate']/text()") if showtime_elem: showtime = [] for st in showtime_elem: - parts = st.split('(') + parts = st.split("(") if len(parts) == 1: - time = st.split('(')[0] - region = '' + time = st.split("(")[0] + region = "" else: - time = st.split('(')[0] - region = st.split('(')[1][0:-1] + time = st.split("(")[0] + region = st.split("(")[1][0:-1] showtime.append({time: region}) else: showtime = None site_elem = content.xpath( - "//div[@id='info']//span[text()='官方网站:']/following-sibling::a[1]/@href") + "//div[@id='info']//span[text()='官方网站:']/following-sibling::a[1]/@href" + ) site = site_elem[0].strip()[:200] if site_elem else None - if site and not re.match(r'http.+', site): + if site and not re.match(r"http.+", site): site = None area_elem = content.xpath( - "//div[@id='info']//span[text()='制片国家/地区:']/following-sibling::text()[1]") + "//div[@id='info']//span[text()='制片国家/地区:']/following-sibling::text()[1]" + ) if area_elem: - area = [a.strip()[:100] for a in area_elem[0].split('/')] + area = [a.strip()[:100] for a in area_elem[0].split("/")] else: area = None language_elem = content.xpath( - "//div[@id='info']//span[text()='语言:']/following-sibling::text()[1]") + "//div[@id='info']//span[text()='语言:']/following-sibling::text()[1]" + ) if language_elem: - language = [a.strip() for a in language_elem[0].split(' / ')] + language = [a.strip() for a in language_elem[0].split(" / ")] else: language = None year_elem = content.xpath("//span[@class='year']/text()") - year = int(re.search(r'\d+', year_elem[0])[0]) if year_elem and re.search(r'\d+', year_elem[0]) else None + year = ( + int(re.search(r"\d+", year_elem[0])[0]) + if year_elem and re.search(r"\d+", year_elem[0]) + else None + ) duration_elem = content.xpath("//span[@property='v:runtime']/text()") other_duration_elem = content.xpath( - "//span[@property='v:runtime']/following-sibling::text()[1]") + "//span[@property='v:runtime']/following-sibling::text()[1]" + ) if duration_elem: duration = duration_elem[0].strip() if other_duration_elem: duration += other_duration_elem[0].rstrip() - duration = duration.split('/')[0].strip() + duration = duration.split("/")[0].strip() else: duration = None season_elem = content.xpath( - "//*[@id='season']/option[@selected='selected']/text()") + "//*[@id='season']/option[@selected='selected']/text()" + ) if not season_elem: season_elem = 
content.xpath( - "//div[@id='info']//span[text()='季数:']/following-sibling::text()[1]") + "//div[@id='info']//span[text()='季数:']/following-sibling::text()[1]" + ) season = int(season_elem[0].strip()) if season_elem else None else: season = int(season_elem[0].strip()) episodes_elem = content.xpath( - "//div[@id='info']//span[text()='集数:']/following-sibling::text()[1]") - episodes = int(episodes_elem[0].strip()) if episodes_elem and episodes_elem[0].strip().isdigit() else None + "//div[@id='info']//span[text()='集数:']/following-sibling::text()[1]" + ) + episodes = ( + int(episodes_elem[0].strip()) + if episodes_elem and episodes_elem[0].strip().isdigit() + else None + ) single_episode_length_elem = content.xpath( - "//div[@id='info']//span[text()='单集片长:']/following-sibling::text()[1]") - single_episode_length = single_episode_length_elem[0].strip( - )[:100] if single_episode_length_elem else None + "//div[@id='info']//span[text()='单集片长:']/following-sibling::text()[1]" + ) + single_episode_length = ( + single_episode_length_elem[0].strip()[:100] + if single_episode_length_elem + else None + ) # if has field `episodes` not none then must be series is_series = True if episodes else False @@ -152,64 +182,87 @@ class DoubanMovie(AbstractSite): brief_elem = content.xpath("//span[@class='all hidden']") if not brief_elem: brief_elem = content.xpath("//span[@property='v:summary']") - brief = '\n'.join([e.strip() for e in brief_elem[0].xpath( - './text()')]) if brief_elem else None + brief = ( + "\n".join([e.strip() for e in brief_elem[0].xpath("./text()")]) + if brief_elem + else None + ) img_url_elem = content.xpath("//img[@rel='v:image']/@src") img_url = img_url_elem[0].strip() if img_url_elem else None - pd = ResourceContent(metadata={ - 'title': title, - 'orig_title': orig_title, - 'other_title': other_title, - 'imdb_code': imdb_code, - 'director': director, - 'playwright': playwright, - 'actor': actor, - 'genre': genre, - 'showtime': showtime, - 'site': site, - 'area': area, - 'language': language, - 'year': year, - 'duration': duration, - 'season_number': season, - 'episode_count': episodes, - 'single_episode_length': single_episode_length, - 'brief': brief, - 'is_series': is_series, - 'cover_image_url': img_url, - }) - pd.metadata['preferred_model'] = ('TVSeason' if season else 'TVShow') if is_series else 'Movie' + pd = ResourceContent( + metadata={ + "title": title, + "orig_title": orig_title, + "other_title": other_title, + "imdb_code": imdb_code, + "director": director, + "playwright": playwright, + "actor": actor, + "genre": genre, + "showtime": showtime, + "site": site, + "area": area, + "language": language, + "year": year, + "duration": duration, + "season_number": season, + "episode_count": episodes, + "single_episode_length": single_episode_length, + "brief": brief, + "is_series": is_series, + "cover_image_url": img_url, + } + ) + pd.metadata["preferred_model"] = ( + ("TVSeason" if season else "TVShow") if is_series else "Movie" + ) if imdb_code: res_data = search_tmdb_by_imdb_id(imdb_code) tmdb_show_id = None - if 'movie_results' in res_data and len(res_data['movie_results']) > 0: - pd.metadata['preferred_model'] = 'Movie' - elif 'tv_results' in res_data and len(res_data['tv_results']) > 0: - pd.metadata['preferred_model'] = 'TVShow' - elif 'tv_season_results' in res_data and len(res_data['tv_season_results']) > 0: - pd.metadata['preferred_model'] = 'TVSeason' - tmdb_show_id = res_data['tv_season_results'][0]['show_id'] - elif 'tv_episode_results' in res_data and 
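The preferred_model decision above compresses into one rule: anything with an episode count is a series, and a selected season narrows it to a single season. As a standalone function:

    def preferred_model(is_series: bool, season_number) -> str:
        if not is_series:
            return "Movie"
        return "TVSeason" if season_number else "TVShow"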
len(res_data['tv_episode_results']) > 0: - pd.metadata['preferred_model'] = 'TVSeason' - tmdb_show_id = res_data['tv_episode_results'][0]['show_id'] - if res_data['tv_episode_results'][0]['episode_number'] != 1: - _logger.warning(f'Douban Movie {self.url} mapping to unexpected imdb episode {imdb_code}') - resp = query_tmdb_tv_episode(tmdb_show_id, res_data['tv_episode_results'][0]['season_number'], 1) - imdb_code = resp['external_ids']['imdb_id'] - _logger.warning(f'Douban Movie {self.url} re-mapped to imdb episode {imdb_code}') + if "movie_results" in res_data and len(res_data["movie_results"]) > 0: + pd.metadata["preferred_model"] = "Movie" + elif "tv_results" in res_data and len(res_data["tv_results"]) > 0: + pd.metadata["preferred_model"] = "TVShow" + elif ( + "tv_season_results" in res_data + and len(res_data["tv_season_results"]) > 0 + ): + pd.metadata["preferred_model"] = "TVSeason" + tmdb_show_id = res_data["tv_season_results"][0]["show_id"] + elif ( + "tv_episode_results" in res_data + and len(res_data["tv_episode_results"]) > 0 + ): + pd.metadata["preferred_model"] = "TVSeason" + tmdb_show_id = res_data["tv_episode_results"][0]["show_id"] + if res_data["tv_episode_results"][0]["episode_number"] != 1: + _logger.warning( + f"Douban Movie {self.url} mapping to unexpected imdb episode {imdb_code}" + ) + resp = query_tmdb_tv_episode( + tmdb_show_id, + res_data["tv_episode_results"][0]["season_number"], + 1, + ) + imdb_code = resp["external_ids"]["imdb_id"] + _logger.warning( + f"Douban Movie {self.url} re-mapped to imdb episode {imdb_code}" + ) pd.lookup_ids[IdType.IMDB] = imdb_code if tmdb_show_id: - pd.metadata['required_resources'] = [{ - 'model': 'TVShow', - 'id_type': IdType.TMDB_TV, - 'id_value': tmdb_show_id, - 'title': title, - 'url': TMDB_TV.id_to_url(tmdb_show_id), - }] + pd.metadata["required_resources"] = [ + { + "model": "TVShow", + "id_type": IdType.TMDB_TV, + "id_value": tmdb_show_id, + "title": title, + "url": TMDB_TV.id_to_url(tmdb_show_id), + } + ] # TODO parse sister seasons # pd.metadata['related_resources'] = [] if pd.metadata["cover_image_url"]: @@ -218,5 +271,7 @@ class DoubanMovie(AbstractSite): pd.cover_image = imgdl.download().content pd.cover_image_extention = imgdl.extention except Exception: - _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}') + _logger.debug( + f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}' + ) return pd diff --git a/catalog/sites/douban_music.py b/catalog/sites/douban_music.py index 1d8038f0..db31a7b0 100644 --- a/catalog/sites/douban_music.py +++ b/catalog/sites/douban_music.py @@ -12,8 +12,11 @@ _logger = logging.getLogger(__name__) class DoubanMusic(AbstractSite): SITE_NAME = SiteName.Douban ID_TYPE = IdType.DoubanMusic - URL_PATTERNS = [r"\w+://music\.douban\.com/subject/(\d+)/{0,1}", r"\w+://m.douban.com/music/subject/(\d+)/{0,1}"] - WIKI_PROPERTY_ID = '' + URL_PATTERNS = [ + r"\w+://music\.douban\.com/subject/(\d+)/{0,1}", + r"\w+://m.douban.com/music/subject/(\d+)/{0,1}", + ] + WIKI_PROPERTY_ID = "" DEFAULT_MODEL = Album @classmethod @@ -28,75 +31,95 @@ class DoubanMusic(AbstractSite): if not title: raise ParseError(self, "title") - artists_elem = content.xpath("//div[@id='info']/span/span[@class='pl']/a/text()") - artist = None if not artists_elem else list(map(lambda a: a[:200], artists_elem)) + artists_elem = content.xpath( + "//div[@id='info']/span/span[@class='pl']/a/text()" + ) + artist = ( + None if not artists_elem else list(map(lambda a: 
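search_tmdb_by_imdb_id used above presumably wraps TMDB's find endpoint; a standalone approximation for reference (the api_key parameter and function name are placeholders, only the endpoint and result keys are taken from TMDB's public API and the diff):

    import requests

    def find_tmdb_by_imdb(imdb_id: str, api_key: str) -> dict:
        # The response contains movie_results / tv_results / tv_season_results /
        # tv_episode_results lists, which the code above inspects in that order.
        resp = requests.get(
            f"https://api.themoviedb.org/3/find/{imdb_id}",
            params={"api_key": api_key, "external_source": "imdb_id"},
            timeout=10,
        )
        resp.raise_for_status()
        return resp.json()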
a[:200], artists_elem)) + ) genre_elem = content.xpath( - "//div[@id='info']//span[text()='流派:']/following::text()[1]") + "//div[@id='info']//span[text()='流派:']/following::text()[1]" + ) genre = genre_elem[0].strip() if genre_elem else None date_elem = content.xpath( - "//div[@id='info']//span[text()='发行时间:']/following::text()[1]") - release_date = dateparser.parse(date_elem[0].strip()).strftime('%Y-%m-%d') if date_elem else None + "//div[@id='info']//span[text()='发行时间:']/following::text()[1]" + ) + release_date = ( + dateparser.parse(date_elem[0].strip()).strftime("%Y-%m-%d") + if date_elem + else None + ) company_elem = content.xpath( - "//div[@id='info']//span[text()='出版者:']/following::text()[1]") + "//div[@id='info']//span[text()='出版者:']/following::text()[1]" + ) company = company_elem[0].strip() if company_elem else None track_list_elem = content.xpath( "//div[@class='track-list']/div[@class='indent']/div/text()" ) if track_list_elem: - track_list = '\n'.join([track.strip() for track in track_list_elem]) + track_list = "\n".join([track.strip() for track in track_list_elem]) else: track_list = None brief_elem = content.xpath("//span[@class='all hidden']") if not brief_elem: brief_elem = content.xpath("//span[@property='v:summary']") - brief = '\n'.join([e.strip() for e in brief_elem[0].xpath( - './text()')]) if brief_elem else None + brief = ( + "\n".join([e.strip() for e in brief_elem[0].xpath("./text()")]) + if brief_elem + else None + ) img_url_elem = content.xpath("//div[@id='mainpic']//img/@src") img_url = img_url_elem[0].strip() if img_url_elem else None data = { - 'title': title, - 'artist': artist, - 'genre': genre, - 'release_date': release_date, - 'duration': None, - 'company': [company], - 'track_list': track_list, - 'brief': brief, - 'cover_image_url': img_url + "title": title, + "artist": artist, + "genre": genre, + "release_date": release_date, + "duration": None, + "company": [company], + "track_list": track_list, + "brief": brief, + "cover_image_url": img_url, } gtin = None isrc = None other_elem = content.xpath( - "//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]") + "//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]" + ) if other_elem: - data['other_title'] = other_elem[0].strip() + data["other_title"] = other_elem[0].strip() other_elem = content.xpath( - "//div[@id='info']//span[text()='专辑类型:']/following-sibling::text()[1]") + "//div[@id='info']//span[text()='专辑类型:']/following-sibling::text()[1]" + ) if other_elem: - data['album_type'] = other_elem[0].strip() + data["album_type"] = other_elem[0].strip() other_elem = content.xpath( - "//div[@id='info']//span[text()='介质:']/following-sibling::text()[1]") + "//div[@id='info']//span[text()='介质:']/following-sibling::text()[1]" + ) if other_elem: - data['media'] = other_elem[0].strip() + data["media"] = other_elem[0].strip() other_elem = content.xpath( - "//div[@id='info']//span[text()='ISRC:']/following-sibling::text()[1]") + "//div[@id='info']//span[text()='ISRC:']/following-sibling::text()[1]" + ) if other_elem: isrc = other_elem[0].strip() other_elem = content.xpath( - "//div[@id='info']//span[text()='条形码:']/following-sibling::text()[1]") + "//div[@id='info']//span[text()='条形码:']/following-sibling::text()[1]" + ) if other_elem: gtin = other_elem[0].strip() other_elem = content.xpath( - "//div[@id='info']//span[text()='碟片数:']/following-sibling::text()[1]") + "//div[@id='info']//span[text()='碟片数:']/following-sibling::text()[1]" + ) if other_elem: - data['disc_count'] = 
other_elem[0].strip() + data["disc_count"] = other_elem[0].strip() pd = ResourceContent(metadata=data) if gtin: @@ -109,5 +132,7 @@ class DoubanMusic(AbstractSite): pd.cover_image = imgdl.download().content pd.cover_image_extention = imgdl.extention except Exception: - _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}') + _logger.debug( + f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}' + ) return pd diff --git a/catalog/sites/goodreads.py b/catalog/sites/goodreads.py index 45d3d637..b6d2768f 100644 --- a/catalog/sites/goodreads.py +++ b/catalog/sites/goodreads.py @@ -14,7 +14,7 @@ class GoodreadsDownloader(RetryDownloader): if response is None: return RESPONSE_NETWORK_ERROR elif response.status_code == 200: - if response.text.find('__NEXT_DATA__') != -1: + if response.text.find("__NEXT_DATA__") != -1: return RESPONSE_OK else: # Goodreads may return legacy version for a/b testing @@ -28,9 +28,12 @@ class GoodreadsDownloader(RetryDownloader): class Goodreads(AbstractSite): SITE_NAME = SiteName.Goodreads ID_TYPE = IdType.Goodreads - WIKI_PROPERTY_ID = 'P2968' + WIKI_PROPERTY_ID = "P2968" DEFAULT_MODEL = Edition - URL_PATTERNS = [r".+goodreads.com/.*book/show/(\d+)", r".+goodreads.com/.*book/(\d+)"] + URL_PATTERNS = [ + r".+goodreads.com/.*book/show/(\d+)", + r".+goodreads.com/.*book/(\d+)", + ] @classmethod def id_to_url(self, id_value): @@ -48,39 +51,41 @@ class Goodreads(AbstractSite): elem = h.xpath('//script[@id="__NEXT_DATA__"]/text()') src = elem[0].strip() if elem else None if not src: - raise ParseError(self, '__NEXT_DATA__ element') - d = json.loads(src)['props']['pageProps']['apolloState'] - o = {'Book': [], 'Work': [], 'Series': [], 'Contributor': []} + raise ParseError(self, "__NEXT_DATA__ element") + d = json.loads(src)["props"]["pageProps"]["apolloState"] + o = {"Book": [], "Work": [], "Series": [], "Contributor": []} for v in d.values(): - t = v.get('__typename') + t = v.get("__typename") if t and t in o: o[t].append(v) - b = next(filter(lambda x: x.get('title'), o['Book']), None) + b = next(filter(lambda x: x.get("title"), o["Book"]), None) if not b: # Goodreads may return empty page template when internal service timeouts - raise ParseError(self, 'Book in __NEXT_DATA__ json') - data['title'] = b['title'] - data['brief'] = b['description'] + raise ParseError(self, "Book in __NEXT_DATA__ json") + data["title"] = b["title"] + data["brief"] = b["description"] ids = {} - t, n = detect_isbn_asin(b['details'].get('asin')) + t, n = detect_isbn_asin(b["details"].get("asin")) if t: ids[t] = n - t, n = detect_isbn_asin(b['details'].get('isbn13')) + t, n = detect_isbn_asin(b["details"].get("isbn13")) if t: ids[t] = n # amazon has a known problem to use another book's isbn as asin # so we alway overwrite asin-converted isbn with real isbn - data['pages'] = b['details'].get('numPages') - data['cover_image_url'] = b['imageUrl'] - w = next(filter(lambda x: x.get('details'), o['Work']), None) + data["pages"] = b["details"].get("numPages") + data["cover_image_url"] = b["imageUrl"] + w = next(filter(lambda x: x.get("details"), o["Work"]), None) if w: - data['required_resources'] = [{ - 'model': 'Work', - 'id_type': IdType.Goodreads_Work, - 'id_value': str(w['legacyId']), - 'title': w['details']['originalTitle'], - 'url': w['editions']['webUrl'], - }] + data["required_resources"] = [ + { + "model": "Work", + "id_type": IdType.Goodreads_Work, + "id_value": str(w["legacyId"]), + "title": w["details"]["originalTitle"], + 
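Goodreads pages embed their data in a __NEXT_DATA__ JSON blob whose apolloState cache the scraper above walks. A standalone sketch of that extraction, keeping only the keys visible in the diff:

    import json
    from lxml import html

    def extract_goodreads_book(page_source: str) -> dict:
        doc = html.fromstring(page_source)
        src = doc.xpath('//script[@id="__NEXT_DATA__"]/text()')
        if not src:
            raise ValueError("__NEXT_DATA__ element missing")
        state = json.loads(src[0])["props"]["pageProps"]["apolloState"]
        # apolloState is a flat cache keyed by entity id; pick the Book entry with a title.
        book = next(
            (v for v in state.values() if v.get("__typename") == "Book" and v.get("title")),
            None,
        )
        if book is None:
            raise ValueError("no Book entry in __NEXT_DATA__")
        return {"title": book["title"], "brief": book.get("description")}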
"url": w["editions"]["webUrl"], + } + ] pd = ResourceContent(metadata=data) pd.lookup_ids[IdType.ISBN] = ids.get(IdType.ISBN) pd.lookup_ids[IdType.ASIN] = ids.get(IdType.ASIN) @@ -90,7 +95,9 @@ class Goodreads(AbstractSite): pd.cover_image = imgdl.download().content pd.cover_image_extention = imgdl.extention except Exception: - _logger.debug(f'failed to download cover for {self.url} from {data["cover_image_url"]}') + _logger.debug( + f'failed to download cover for {self.url} from {data["cover_image_url"]}' + ) return pd @@ -98,7 +105,7 @@ class Goodreads(AbstractSite): class Goodreads_Work(AbstractSite): SITE_NAME = SiteName.Goodreads ID_TYPE = IdType.Goodreads_Work - WIKI_PROPERTY_ID = '' + WIKI_PROPERTY_ID = "" DEFAULT_MODEL = Work URL_PATTERNS = [r".+goodreads.com/work/editions/(\d+)"] @@ -111,14 +118,18 @@ class Goodreads_Work(AbstractSite): title_elem = content.xpath("//h1/a/text()") title = title_elem[0].strip() if title_elem else None if not title: - raise ParseError(self, 'title') + raise ParseError(self, "title") author_elem = content.xpath("//h2/a/text()") author = author_elem[0].strip() if author_elem else None first_published_elem = content.xpath("//h2/span/text()") - first_published = first_published_elem[0].strip() if first_published_elem else None - pd = ResourceContent(metadata={ - 'title': title, - 'author': author, - 'first_published': first_published - }) + first_published = ( + first_published_elem[0].strip() if first_published_elem else None + ) + pd = ResourceContent( + metadata={ + "title": title, + "author": author, + "first_published": first_published, + } + ) return pd diff --git a/catalog/sites/google_books.py b/catalog/sites/google_books.py index 806056a6..a036df37 100644 --- a/catalog/sites/google_books.py +++ b/catalog/sites/google_books.py @@ -16,7 +16,7 @@ class GoogleBooks(AbstractSite): r"https://www\.google\.co[^/]+/books/edition/[^/]+/([^&#?]+)", r"https://books\.google\.co[^/]+/books/about/[^?]+?id=([^&#?]+)", ] - WIKI_PROPERTY_ID = '' + WIKI_PROPERTY_ID = "" DEFAULT_MODEL = Edition @classmethod @@ -24,57 +24,76 @@ class GoogleBooks(AbstractSite): return "https://books.google.com/books?id=" + id_value def scrape(self): - api_url = f'https://www.googleapis.com/books/v1/volumes/{self.id_value}' + api_url = f"https://www.googleapis.com/books/v1/volumes/{self.id_value}" b = BasicDownloader(api_url).download().json() other = {} - title = b['volumeInfo']['title'] - subtitle = b['volumeInfo']['subtitle'] if 'subtitle' in b['volumeInfo'] else None + title = b["volumeInfo"]["title"] + subtitle = ( + b["volumeInfo"]["subtitle"] if "subtitle" in b["volumeInfo"] else None + ) pub_year = None pub_month = None - if 'publishedDate' in b['volumeInfo']: - pub_date = b['volumeInfo']['publishedDate'].split('-') + if "publishedDate" in b["volumeInfo"]: + pub_date = b["volumeInfo"]["publishedDate"].split("-") pub_year = pub_date[0] pub_month = pub_date[1] if len(pub_date) > 1 else None - pub_house = b['volumeInfo']['publisher'] if 'publisher' in b['volumeInfo'] else None - language = b['volumeInfo']['language'] if 'language' in b['volumeInfo'] else None - pages = b['volumeInfo']['pageCount'] if 'pageCount' in b['volumeInfo'] else None - if 'mainCategory' in b['volumeInfo']: - other['分类'] = b['volumeInfo']['mainCategory'] - authors = b['volumeInfo']['authors'] if 'authors' in b['volumeInfo'] else None - if 'description' in b['volumeInfo']: - brief = b['volumeInfo']['description'] - elif 'textSnippet' in b['volumeInfo']: + pub_house = ( + b["volumeInfo"]["publisher"] if 
"publisher" in b["volumeInfo"] else None + ) + language = ( + b["volumeInfo"]["language"] if "language" in b["volumeInfo"] else None + ) + pages = b["volumeInfo"]["pageCount"] if "pageCount" in b["volumeInfo"] else None + if "mainCategory" in b["volumeInfo"]: + other["分类"] = b["volumeInfo"]["mainCategory"] + authors = b["volumeInfo"]["authors"] if "authors" in b["volumeInfo"] else None + if "description" in b["volumeInfo"]: + brief = b["volumeInfo"]["description"] + elif "textSnippet" in b["volumeInfo"]: brief = b["volumeInfo"]["textSnippet"]["searchInfo"] else: - brief = '' - brief = re.sub(r'<.*?>', '', brief.replace('", "", brief.replace(" 0: - url = f"https://www.themoviedb.org/movie/{res_data['movie_results'][0]['id']}" - elif 'tv_results' in res_data and len(res_data['tv_results']) > 0: + if "movie_results" in res_data and len(res_data["movie_results"]) > 0: + url = ( + f"https://www.themoviedb.org/movie/{res_data['movie_results'][0]['id']}" + ) + elif "tv_results" in res_data and len(res_data["tv_results"]) > 0: url = f"https://www.themoviedb.org/tv/{res_data['tv_results'][0]['id']}" - elif 'tv_season_results' in res_data and len(res_data['tv_season_results']) > 0: + elif "tv_season_results" in res_data and len(res_data["tv_season_results"]) > 0: # this should not happen given IMDB only has ids for either show or episode - tv_id = res_data['tv_season_results'][0]['show_id'] - season_number = res_data['tv_season_results'][0]['season_number'] + tv_id = res_data["tv_season_results"][0]["show_id"] + season_number = res_data["tv_season_results"][0]["season_number"] url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}/episode/{episode_number}" - elif 'tv_episode_results' in res_data and len(res_data['tv_episode_results']) > 0: - tv_id = res_data['tv_episode_results'][0]['show_id'] - season_number = res_data['tv_episode_results'][0]['season_number'] - episode_number = res_data['tv_episode_results'][0]['episode_number'] + elif ( + "tv_episode_results" in res_data and len(res_data["tv_episode_results"]) > 0 + ): + tv_id = res_data["tv_episode_results"][0]["show_id"] + season_number = res_data["tv_episode_results"][0]["season_number"] + episode_number = res_data["tv_episode_results"][0]["episode_number"] if season_number == 0: url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}/episode/{episode_number}" elif episode_number == 1: url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}" else: - raise ParseError(self, "IMDB id matching TMDB but not first episode, this is not supported") + raise ParseError( + self, + "IMDB id matching TMDB but not first episode, this is not supported", + ) else: raise ParseError(self, "IMDB id not found in TMDB") tmdb = SiteManager.get_site_by_url(url) pd = tmdb.scrape() - pd.metadata['preferred_model'] = tmdb.DEFAULT_MODEL.__name__ + pd.metadata["preferred_model"] = tmdb.DEFAULT_MODEL.__name__ return pd diff --git a/catalog/sites/spotify.py b/catalog/sites/spotify.py index 23f68120..5656cc1a 100644 --- a/catalog/sites/spotify.py +++ b/catalog/sites/spotify.py @@ -23,8 +23,8 @@ spotify_token_expire_time = time.time() class Spotify(AbstractSite): SITE_NAME = SiteName.Spotify ID_TYPE = IdType.Spotify_Album - URL_PATTERNS = [r'\w+://open\.spotify\.com/album/([a-zA-Z0-9]+)'] - WIKI_PROPERTY_ID = '?' + URL_PATTERNS = [r"\w+://open\.spotify\.com/album/([a-zA-Z0-9]+)"] + WIKI_PROPERTY_ID = "?" 
DEFAULT_MODEL = Album @classmethod @@ -33,58 +33,63 @@ class Spotify(AbstractSite): def scrape(self): api_url = "https://api.spotify.com/v1/albums/" + self.id_value - headers = { - 'Authorization': f"Bearer {get_spotify_token()}" - } + headers = {"Authorization": f"Bearer {get_spotify_token()}"} res_data = BasicDownloader(api_url, headers=headers).download().json() artist = [] - for artist_dict in res_data['artists']: - artist.append(artist_dict['name']) + for artist_dict in res_data["artists"]: + artist.append(artist_dict["name"]) - title = res_data['name'] + title = res_data["name"] - genre = ', '.join(res_data['genres']) + genre = ", ".join(res_data["genres"]) company = [] - for com in res_data['copyrights']: - company.append(com['text']) + for com in res_data["copyrights"]: + company.append(com["text"]) duration = 0 track_list = [] track_urls = [] - for track in res_data['tracks']['items']: - track_urls.append(track['external_urls']['spotify']) - duration += track['duration_ms'] - if res_data['tracks']['items'][-1]['disc_number'] > 1: + for track in res_data["tracks"]["items"]: + track_urls.append(track["external_urls"]["spotify"]) + duration += track["duration_ms"] + if res_data["tracks"]["items"][-1]["disc_number"] > 1: # more than one disc - track_list.append(str( - track['disc_number']) + '-' + str(track['track_number']) + '. ' + track['name']) + track_list.append( + str(track["disc_number"]) + + "-" + + str(track["track_number"]) + + ". " + + track["name"] + ) else: - track_list.append(str(track['track_number']) + '. ' + track['name']) - track_list = '\n'.join(track_list) + track_list.append(str(track["track_number"]) + ". " + track["name"]) + track_list = "\n".join(track_list) - release_date = dateparser.parse(res_data['release_date']).strftime('%Y-%m-%d') + release_date = dateparser.parse(res_data["release_date"]).strftime("%Y-%m-%d") gtin = None - if res_data['external_ids'].get('upc'): - gtin = res_data['external_ids'].get('upc') - if res_data['external_ids'].get('ean'): - gtin = res_data['external_ids'].get('ean') + if res_data["external_ids"].get("upc"): + gtin = res_data["external_ids"].get("upc") + if res_data["external_ids"].get("ean"): + gtin = res_data["external_ids"].get("ean") isrc = None - if res_data['external_ids'].get('isrc'): - isrc = res_data['external_ids'].get('isrc') + if res_data["external_ids"].get("isrc"): + isrc = res_data["external_ids"].get("isrc") - pd = ResourceContent(metadata={ - 'title': title, - 'artist': artist, - 'genre': genre, - 'track_list': track_list, - 'release_date': release_date, - 'duration': duration, - 'company': company, - 'brief': None, - 'cover_image_url': res_data['images'][0]['url'] - }) + pd = ResourceContent( + metadata={ + "title": title, + "artist": artist, + "genre": genre, + "track_list": track_list, + "release_date": release_date, + "duration": duration, + "company": company, + "brief": None, + "cover_image_url": res_data["images"][0]["url"], + } + ) if gtin: pd.lookup_ids[IdType.GTIN] = gtin if isrc: @@ -95,14 +100,16 @@ class Spotify(AbstractSite): pd.cover_image = imgdl.download().content pd.cover_image_extention = imgdl.extention except Exception: - _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}') + _logger.debug( + f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}' + ) return pd def get_spotify_token(): global spotify_token, spotify_token_expire_time if get_mock_mode(): - return 'mocked' + return "mocked" if spotify_token is None or 
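The track-list formatting in Spotify.scrape above only prefixes disc numbers when the album spans more than one disc. The same rule as a small helper, with tracks being the items list from the album payload:

    def format_track_list(tracks):
        multi_disc = bool(tracks) and tracks[-1]["disc_number"] > 1
        lines = []
        for t in tracks:
            prefix = (
                f"{t['disc_number']}-{t['track_number']}"
                if multi_disc
                else str(t["track_number"])
            )
            lines.append(f"{prefix}. {t['name']}")
        return "\n".join(lines)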
is_spotify_token_expired(): invoke_spotify_token() return spotify_token @@ -117,12 +124,8 @@ def invoke_spotify_token(): global spotify_token, spotify_token_expire_time r = requests.post( "https://accounts.spotify.com/api/token", - data={ - "grant_type": "client_credentials" - }, - headers={ - "Authorization": f"Basic {settings.SPOTIFY_CREDENTIAL}" - } + data={"grant_type": "client_credentials"}, + headers={"Authorization": f"Basic {settings.SPOTIFY_CREDENTIAL}"}, ) data = r.json() if r.status_code == 401: @@ -131,16 +134,12 @@ def invoke_spotify_token(): # for example debugging using a http client r = requests.post( "https://accounts.spotify.com/api/token", - data={ - "grant_type": "client_credentials" - }, - headers={ - "Authorization": f"Basic {settings.SPOTIFY_CREDENTIAL}" - } + data={"grant_type": "client_credentials"}, + headers={"Authorization": f"Basic {settings.SPOTIFY_CREDENTIAL}"}, ) data = r.json() elif r.status_code != 200: raise Exception(f"Request to spotify API fails. Reason: {r.reason}") # minus 2 for execution time error - spotify_token_expire_time = int(data['expires_in']) + time.time() - 2 - spotify_token = data['access_token'] + spotify_token_expire_time = int(data["expires_in"]) + time.time() - 2 + spotify_token = data["access_token"] diff --git a/catalog/sites/steam.py b/catalog/sites/steam.py index 029e885f..77815361 100644 --- a/catalog/sites/steam.py +++ b/catalog/sites/steam.py @@ -13,7 +13,7 @@ class Steam(AbstractSite): SITE_NAME = SiteName.Steam ID_TYPE = IdType.Steam URL_PATTERNS = [r"\w+://store\.steampowered\.com/app/(\d+)"] - WIKI_PROPERTY_ID = '?' + WIKI_PROPERTY_ID = "?" DEFAULT_MODEL = Game @classmethod @@ -25,41 +25,58 @@ class Steam(AbstractSite): pd = i.scrape() if i else ResourceContent() headers = BasicDownloader.headers.copy() - headers['Host'] = 'store.steampowered.com' - headers['Cookie'] = "wants_mature_content=1; birthtime=754700401;" + headers["Host"] = "store.steampowered.com" + headers["Cookie"] = "wants_mature_content=1; birthtime=754700401;" content = BasicDownloader(self.url, headers=headers).download().html() title = content.xpath("//div[@class='apphub_AppName']/text()")[0] developer = content.xpath("//div[@id='developers_list']/a/text()") - publisher = content.xpath("//div[@class='glance_ctn']//div[@class='dev_row'][2]//a/text()") + publisher = content.xpath( + "//div[@class='glance_ctn']//div[@class='dev_row'][2]//a/text()" + ) release_date = dateparser.parse( - content.xpath( - "//div[@class='release_date']/div[@class='date']/text()")[0] - ).strftime('%Y-%m-%d') + content.xpath("//div[@class='release_date']/div[@class='date']/text()")[0] + ).strftime("%Y-%m-%d") genre = content.xpath( - "//div[@class='details_block']/b[2]/following-sibling::a/text()") - platform = ['PC'] - brief = content.xpath( - "//div[@class='game_description_snippet']/text()")[0].strip() + "//div[@class='details_block']/b[2]/following-sibling::a/text()" + ) + platform = ["PC"] + brief = content.xpath("//div[@class='game_description_snippet']/text()")[ + 0 + ].strip() # try Steam images if no image from IGDB if pd.cover_image is None: - pd.metadata['cover_image_url'] = content.xpath("//img[@class='game_header_image_full']/@src")[0].replace("header.jpg", "library_600x900.jpg") - pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(pd.metadata['cover_image_url'], self.url) + pd.metadata["cover_image_url"] = content.xpath( + "//img[@class='game_header_image_full']/@src" + )[0].replace("header.jpg", "library_600x900.jpg") + ( + 
pd.cover_image, + pd.cover_image_extention, + ) = BasicImageDownloader.download_image( + pd.metadata["cover_image_url"], self.url + ) if pd.cover_image is None: - pd.metadata['cover_image_url'] = content.xpath("//img[@class='game_header_image_full']/@src")[0] - pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(pd.metadata['cover_image_url'], self.url) + pd.metadata["cover_image_url"] = content.xpath( + "//img[@class='game_header_image_full']/@src" + )[0] + ( + pd.cover_image, + pd.cover_image_extention, + ) = BasicImageDownloader.download_image( + pd.metadata["cover_image_url"], self.url + ) # merge data from IGDB, use localized Steam data if available d = { - 'developer': developer, - 'publisher': publisher, - 'release_date': release_date, - 'genre': genre, - 'platform': platform, + "developer": developer, + "publisher": publisher, + "release_date": release_date, + "genre": genre, + "platform": platform, } d.update(pd.metadata) pd.metadata = d if title: - pd.metadata['title'] = title + pd.metadata["title"] = title if brief: - pd.metadata['brief'] = brief + pd.metadata["brief"] = brief return pd diff --git a/catalog/sites/tmdb.py b/catalog/sites/tmdb.py index ba7f7538..1b721bfe 100644 --- a/catalog/sites/tmdb.py +++ b/catalog/sites/tmdb.py @@ -37,8 +37,8 @@ def _copy_dict(s, key_map): class TMDB_Movie(AbstractSite): SITE_NAME = SiteName.TMDB ID_TYPE = IdType.TMDB_Movie - URL_PATTERNS = [r'\w+://www.themoviedb.org/movie/(\d+)'] - WIKI_PROPERTY_ID = '?' + URL_PATTERNS = [r"\w+://www.themoviedb.org/movie/(\d+)"] + WIKI_PROPERTY_ID = "?" DEFAULT_MODEL = Movie @classmethod @@ -55,37 +55,59 @@ class TMDB_Movie(AbstractSite): res_data = BasicDownloader(api_url).download().json() if is_series: - title = res_data['name'] - orig_title = res_data['original_name'] - year = int(res_data['first_air_date'].split( - '-')[0]) if res_data['first_air_date'] else None - imdb_code = res_data['external_ids']['imdb_id'] - showtime = [{res_data['first_air_date']: "首播日期"} - ] if res_data['first_air_date'] else None + title = res_data["name"] + orig_title = res_data["original_name"] + year = ( + int(res_data["first_air_date"].split("-")[0]) + if res_data["first_air_date"] + else None + ) + imdb_code = res_data["external_ids"]["imdb_id"] + showtime = ( + [{res_data["first_air_date"]: "首播日期"}] + if res_data["first_air_date"] + else None + ) duration = None else: - title = res_data['title'] - orig_title = res_data['original_title'] - year = int(res_data['release_date'].split('-') - [0]) if res_data['release_date'] else None - showtime = [{res_data['release_date']: "发布日期"} - ] if res_data['release_date'] else None - imdb_code = res_data['imdb_id'] + title = res_data["title"] + orig_title = res_data["original_title"] + year = ( + int(res_data["release_date"].split("-")[0]) + if res_data["release_date"] + else None + ) + showtime = ( + [{res_data["release_date"]: "发布日期"}] + if res_data["release_date"] + else None + ) + imdb_code = res_data["imdb_id"] # in minutes - duration = res_data['runtime'] if res_data['runtime'] else None + duration = res_data["runtime"] if res_data["runtime"] else None - genre = [x['name'] for x in res_data['genres']] - language = list(map(lambda x: x['name'], res_data['spoken_languages'])) - brief = res_data['overview'] + genre = [x["name"] for x in res_data["genres"]] + language = list(map(lambda x: x["name"], res_data["spoken_languages"])) + brief = res_data["overview"] if is_series: - director = list(map(lambda x: x['name'], res_data['created_by'])) + director = 
list(map(lambda x: x["name"], res_data["created_by"])) else: - director = list(map(lambda x: x['name'], filter( - lambda c: c['job'] == 'Director', res_data['credits']['crew']))) - playwright = list(map(lambda x: x['name'], filter( - lambda c: c['job'] == 'Screenplay', res_data['credits']['crew']))) - actor = list(map(lambda x: x['name'], res_data['credits']['cast'])) + director = list( + map( + lambda x: x["name"], + filter( + lambda c: c["job"] == "Director", res_data["credits"]["crew"] + ), + ) + ) + playwright = list( + map( + lambda x: x["name"], + filter(lambda c: c["job"] == "Screenplay", res_data["credits"]["crew"]), + ) + ) + actor = list(map(lambda x: x["name"], res_data["credits"]["cast"])) area = [] other_info = {} @@ -95,33 +117,39 @@ class TMDB_Movie(AbstractSite): # other_info['奖项'] = res_data['awards'] # other_info['TMDB_ID'] = id if is_series: - other_info['Seasons'] = res_data['number_of_seasons'] - other_info['Episodes'] = res_data['number_of_episodes'] + other_info["Seasons"] = res_data["number_of_seasons"] + other_info["Episodes"] = res_data["number_of_episodes"] # TODO: use GET /configuration to get base url - img_url = ('https://image.tmdb.org/t/p/original/' + res_data['poster_path']) if res_data['poster_path'] is not None else None + img_url = ( + ("https://image.tmdb.org/t/p/original/" + res_data["poster_path"]) + if res_data["poster_path"] is not None + else None + ) - pd = ResourceContent(metadata={ - 'title': title, - 'orig_title': orig_title, - 'other_title': None, - 'imdb_code': imdb_code, - 'director': director, - 'playwright': playwright, - 'actor': actor, - 'genre': genre, - 'showtime': showtime, - 'site': None, - 'area': area, - 'language': language, - 'year': year, - 'duration': duration, - 'season': None, - 'episodes': None, - 'single_episode_length': None, - 'brief': brief, - 'cover_image_url': img_url, - }) + pd = ResourceContent( + metadata={ + "title": title, + "orig_title": orig_title, + "other_title": None, + "imdb_code": imdb_code, + "director": director, + "playwright": playwright, + "actor": actor, + "genre": genre, + "showtime": showtime, + "site": None, + "area": area, + "language": language, + "year": year, + "duration": duration, + "season": None, + "episodes": None, + "single_episode_length": None, + "brief": brief, + "cover_image_url": img_url, + } + ) if imdb_code: pd.lookup_ids[IdType.IMDB] = imdb_code if pd.metadata["cover_image_url"]: @@ -130,7 +158,9 @@ class TMDB_Movie(AbstractSite): pd.cover_image = imgdl.download().content pd.cover_image_extention = imgdl.extention except Exception: - _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}') + _logger.debug( + f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}' + ) return pd @@ -138,8 +168,11 @@ class TMDB_Movie(AbstractSite): class TMDB_TV(AbstractSite): SITE_NAME = SiteName.TMDB ID_TYPE = IdType.TMDB_TV - URL_PATTERNS = [r'\w+://www.themoviedb.org/tv/(\d+)[^/]*$', r'\w+://www.themoviedb.org/tv/(\d+)[^/]*/seasons'] - WIKI_PROPERTY_ID = '?' + URL_PATTERNS = [ + r"\w+://www.themoviedb.org/tv/(\d+)[^/]*$", + r"\w+://www.themoviedb.org/tv/(\d+)[^/]*/seasons", + ] + WIKI_PROPERTY_ID = "?" 
DEFAULT_MODEL = TVShow @classmethod @@ -156,38 +189,60 @@ class TMDB_TV(AbstractSite): res_data = BasicDownloader(api_url).download().json() if is_series: - title = res_data['name'] - orig_title = res_data['original_name'] - year = int(res_data['first_air_date'].split( - '-')[0]) if res_data['first_air_date'] else None - imdb_code = res_data['external_ids']['imdb_id'] - showtime = [{res_data['first_air_date']: "首播日期"} - ] if res_data['first_air_date'] else None + title = res_data["name"] + orig_title = res_data["original_name"] + year = ( + int(res_data["first_air_date"].split("-")[0]) + if res_data["first_air_date"] + else None + ) + imdb_code = res_data["external_ids"]["imdb_id"] + showtime = ( + [{res_data["first_air_date"]: "首播日期"}] + if res_data["first_air_date"] + else None + ) duration = None else: - title = res_data['title'] - orig_title = res_data['original_title'] - year = int(res_data['release_date'].split('-') - [0]) if res_data['release_date'] else None - showtime = [{res_data['release_date']: "发布日期"} - ] if res_data['release_date'] else None - imdb_code = res_data['imdb_id'] + title = res_data["title"] + orig_title = res_data["original_title"] + year = ( + int(res_data["release_date"].split("-")[0]) + if res_data["release_date"] + else None + ) + showtime = ( + [{res_data["release_date"]: "发布日期"}] + if res_data["release_date"] + else None + ) + imdb_code = res_data["imdb_id"] # in minutes - duration = res_data['runtime'] if res_data['runtime'] else None + duration = res_data["runtime"] if res_data["runtime"] else None - genre = [x['name'] for x in res_data['genres']] + genre = [x["name"] for x in res_data["genres"]] - language = list(map(lambda x: x['name'], res_data['spoken_languages'])) - brief = res_data['overview'] + language = list(map(lambda x: x["name"], res_data["spoken_languages"])) + brief = res_data["overview"] if is_series: - director = list(map(lambda x: x['name'], res_data['created_by'])) + director = list(map(lambda x: x["name"], res_data["created_by"])) else: - director = list(map(lambda x: x['name'], filter( - lambda c: c['job'] == 'Director', res_data['credits']['crew']))) - playwright = list(map(lambda x: x['name'], filter( - lambda c: c['job'] == 'Screenplay', res_data['credits']['crew']))) - actor = list(map(lambda x: x['name'], res_data['credits']['cast'])) + director = list( + map( + lambda x: x["name"], + filter( + lambda c: c["job"] == "Director", res_data["credits"]["crew"] + ), + ) + ) + playwright = list( + map( + lambda x: x["name"], + filter(lambda c: c["job"] == "Screenplay", res_data["credits"]["crew"]), + ) + ) + actor = list(map(lambda x: x["name"], res_data["credits"]["cast"])) area = [] other_info = {} @@ -197,41 +252,53 @@ class TMDB_TV(AbstractSite): # other_info['奖项'] = res_data['awards'] # other_info['TMDB_ID'] = id if is_series: - other_info['Seasons'] = res_data['number_of_seasons'] - other_info['Episodes'] = res_data['number_of_episodes'] + other_info["Seasons"] = res_data["number_of_seasons"] + other_info["Episodes"] = res_data["number_of_episodes"] # TODO: use GET /configuration to get base url - img_url = ('https://image.tmdb.org/t/p/original/' + res_data['poster_path']) if res_data['poster_path'] is not None else None + img_url = ( + ("https://image.tmdb.org/t/p/original/" + res_data["poster_path"]) + if res_data["poster_path"] is not None + else None + ) - season_links = list(map(lambda s: { - 'model': 'TVSeason', - 'id_type': IdType.TMDB_TVSeason, - 'id_value': f'{self.id_value}-{s["season_number"]}', - 'title': s['name'], - 
'url': f'{self.url}/season/{s["season_number"]}'}, res_data['seasons'])) - pd = ResourceContent(metadata={ - 'title': title, - 'orig_title': orig_title, - 'other_title': None, - 'imdb_code': imdb_code, - 'director': director, - 'playwright': playwright, - 'actor': actor, - 'genre': genre, - 'showtime': showtime, - 'site': None, - 'area': area, - 'language': language, - 'year': year, - 'duration': duration, - 'season_count': res_data['number_of_seasons'], - 'season': None, - 'episodes': None, - 'single_episode_length': None, - 'brief': brief, - 'cover_image_url': img_url, - 'related_resources': season_links, - }) + season_links = list( + map( + lambda s: { + "model": "TVSeason", + "id_type": IdType.TMDB_TVSeason, + "id_value": f'{self.id_value}-{s["season_number"]}', + "title": s["name"], + "url": f'{self.url}/season/{s["season_number"]}', + }, + res_data["seasons"], + ) + ) + pd = ResourceContent( + metadata={ + "title": title, + "orig_title": orig_title, + "other_title": None, + "imdb_code": imdb_code, + "director": director, + "playwright": playwright, + "actor": actor, + "genre": genre, + "showtime": showtime, + "site": None, + "area": area, + "language": language, + "year": year, + "duration": duration, + "season_count": res_data["number_of_seasons"], + "season": None, + "episodes": None, + "single_episode_length": None, + "brief": brief, + "cover_image_url": img_url, + "related_resources": season_links, + } + ) if imdb_code: pd.lookup_ids[IdType.IMDB] = imdb_code @@ -241,7 +308,9 @@ class TMDB_TV(AbstractSite): pd.cover_image = imgdl.download().content pd.cover_image_extention = imgdl.extention except Exception: - _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}') + _logger.debug( + f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}' + ) return pd @@ -249,58 +318,87 @@ class TMDB_TV(AbstractSite): class TMDB_TVSeason(AbstractSite): SITE_NAME = SiteName.TMDB ID_TYPE = IdType.TMDB_TVSeason - URL_PATTERNS = [r'\w+://www.themoviedb.org/tv/(\d+)[^/]*/season/(\d+)[^/]*$'] - WIKI_PROPERTY_ID = '?' + URL_PATTERNS = [r"\w+://www.themoviedb.org/tv/(\d+)[^/]*/season/(\d+)[^/]*$"] + WIKI_PROPERTY_ID = "?" 
DEFAULT_MODEL = TVSeason - ID_PATTERN = r'^(\d+)-(\d+)$' + ID_PATTERN = r"^(\d+)-(\d+)$" @classmethod def url_to_id(cls, url: str): - u = next(iter([re.match(p, url) for p in cls.URL_PATTERNS if re.match(p, url)]), None) - return u[1] + '-' + u[2] if u else None + u = next( + iter([re.match(p, url) for p in cls.URL_PATTERNS if re.match(p, url)]), None + ) + return u[1] + "-" + u[2] if u else None @classmethod def id_to_url(cls, id_value): - v = id_value.split('-') + v = id_value.split("-") return f"https://www.themoviedb.org/tv/{v[0]}/season/{v[1]}" def scrape(self): - v = self.id_value.split('-') + v = self.id_value.split("-") api_url = f"https://api.themoviedb.org/3/tv/{v[0]}/season/{v[1]}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits" d = BasicDownloader(api_url).download().json() - if not d.get('id'): - raise ParseError('id') - pd = ResourceContent(metadata=_copy_dict(d, {'name': 'title', 'overview': 'brief', 'air_date': 'air_date', 'season_number': 0, 'external_ids': []})) - pd.metadata['required_resources'] = [{ - 'model': 'TVShow', - 'id_type': IdType.TMDB_TV, - 'id_value': v[0], - 'title': f'TMDB TV Show {v[0]}', - 'url': f"https://www.themoviedb.org/tv/{v[0]}", - }] - pd.lookup_ids[IdType.IMDB] = d['external_ids'].get('imdb_id') - pd.metadata['cover_image_url'] = ('https://image.tmdb.org/t/p/original/' + d['poster_path']) if d['poster_path'] else None - pd.metadata['title'] = pd.metadata['title'] if pd.metadata['title'] else f'Season {d["season_number"]}' - pd.metadata['episode_number_list'] = list(map(lambda ep: ep['episode_number'], d['episodes'])) - pd.metadata['episode_count'] = len(pd.metadata['episode_number_list']) + if not d.get("id"): + raise ParseError("id") + pd = ResourceContent( + metadata=_copy_dict( + d, + { + "name": "title", + "overview": "brief", + "air_date": "air_date", + "season_number": 0, + "external_ids": [], + }, + ) + ) + pd.metadata["required_resources"] = [ + { + "model": "TVShow", + "id_type": IdType.TMDB_TV, + "id_value": v[0], + "title": f"TMDB TV Show {v[0]}", + "url": f"https://www.themoviedb.org/tv/{v[0]}", + } + ] + pd.lookup_ids[IdType.IMDB] = d["external_ids"].get("imdb_id") + pd.metadata["cover_image_url"] = ( + ("https://image.tmdb.org/t/p/original/" + d["poster_path"]) + if d["poster_path"] + else None + ) + pd.metadata["title"] = ( + pd.metadata["title"] + if pd.metadata["title"] + else f'Season {d["season_number"]}' + ) + pd.metadata["episode_number_list"] = list( + map(lambda ep: ep["episode_number"], d["episodes"]) + ) + pd.metadata["episode_count"] = len(pd.metadata["episode_number_list"]) if pd.metadata["cover_image_url"]: imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url) try: pd.cover_image = imgdl.download().content pd.cover_image_extention = imgdl.extention except Exception: - _logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}') + _logger.debug( + f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}' + ) # get external id from 1st episode if pd.lookup_ids[IdType.IMDB]: _logger.warning("Unexpected IMDB id for TMDB tv season") - elif len(pd.metadata['episode_number_list']) == 0: - _logger.warning("Unable to lookup IMDB id for TMDB tv season with zero episodes") + elif len(pd.metadata["episode_number_list"]) == 0: + _logger.warning( + "Unable to lookup IMDB id for TMDB tv season with zero episodes" + ) else: - ep = pd.metadata['episode_number_list'][0] + ep = pd.metadata["episode_number_list"][0] api_url2 
= f"https://api.themoviedb.org/3/tv/{v[0]}/season/{v[1]}/episode/{ep}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits" d2 = BasicDownloader(api_url2).download().json() - if not d2.get('id'): - raise ParseError('episode id for season') - pd.lookup_ids[IdType.IMDB] = d2['external_ids'].get('imdb_id') + if not d2.get("id"): + raise ParseError("episode id for season") + pd.lookup_ids[IdType.IMDB] = d2["external_ids"].get("imdb_id") return pd diff --git a/catalog/tv/models.py b/catalog/tv/models.py index c8692010..9b74e16c 100644 --- a/catalog/tv/models.py +++ b/catalog/tv/models.py @@ -31,8 +31,8 @@ from django.utils.translation import gettext_lazy as _ class TVShow(Item): category = ItemCategory.TV - url_path = 'tv' - demonstrative = _('这部剧集') + url_path = "tv" + demonstrative = _("这部剧集") imdb = PrimaryLookupIdDescriptor(IdType.IMDB) tmdb_tv = PrimaryLookupIdDescriptor(IdType.TMDB_TV) imdb = PrimaryLookupIdDescriptor(IdType.IMDB) @@ -40,100 +40,208 @@ class TVShow(Item): episode_count = models.PositiveIntegerField(null=True) METADATA_COPY_LIST = [ - 'title', - 'season_count', - 'orig_title', - 'other_title', - 'director', - 'playwright', - 'actor', - 'genre', - 'showtime', - 'site', - 'area', - 'language', - 'year', - 'duration', - 'season_count', - 'episode_count', - 'single_episode_length', - 'brief', + "title", + "season_count", + "orig_title", + "other_title", + "director", + "playwright", + "actor", + "genre", + "showtime", + "site", + "area", + "language", + "year", + "duration", + "season_count", + "episode_count", + "single_episode_length", + "brief", ] - orig_title = jsondata.CharField(_("original title"), blank=True, default='', max_length=500) - other_title = jsondata.ArrayField(models.CharField(_("other title"), blank=True, default='', max_length=500), null=True, blank=True, default=list, ) - director = jsondata.ArrayField(models.CharField(_("director"), blank=True, default='', max_length=200), null=True, blank=True, default=list, ) - playwright = jsondata.ArrayField(models.CharField(_("playwright"), blank=True, default='', max_length=200), null=True, blank=True, default=list, ) - actor = jsondata.ArrayField(models.CharField(_("actor"), blank=True, default='', max_length=200), null=True, blank=True, default=list, ) - genre = jsondata.ArrayField(models.CharField(_("genre"), blank=True, default='', max_length=50), null=True, blank=True, default=list, ) # , choices=MovieGenreEnum.choices - showtime = jsondata.ArrayField(null=True, blank=True, default=list, ) - site = jsondata.URLField(_('site url'), blank=True, default='', max_length=200) - area = jsondata.ArrayField(models.CharField(_("country or region"), blank=True, default='', max_length=100, ), null=True, blank=True, default=list, ) - language = jsondata.ArrayField(models.CharField(blank=True, default='', max_length=100, ), null=True, blank=True, default=list, ) + orig_title = jsondata.CharField( + _("original title"), blank=True, default="", max_length=500 + ) + other_title = jsondata.ArrayField( + models.CharField(_("other title"), blank=True, default="", max_length=500), + null=True, + blank=True, + default=list, + ) + director = jsondata.ArrayField( + models.CharField(_("director"), blank=True, default="", max_length=200), + null=True, + blank=True, + default=list, + ) + playwright = jsondata.ArrayField( + models.CharField(_("playwright"), blank=True, default="", max_length=200), + null=True, + blank=True, + default=list, + ) + actor = jsondata.ArrayField( + 
models.CharField(_("actor"), blank=True, default="", max_length=200), + null=True, + blank=True, + default=list, + ) + genre = jsondata.ArrayField( + models.CharField(_("genre"), blank=True, default="", max_length=50), + null=True, + blank=True, + default=list, + ) # , choices=MovieGenreEnum.choices + showtime = jsondata.ArrayField( + null=True, + blank=True, + default=list, + ) + site = jsondata.URLField(_("site url"), blank=True, default="", max_length=200) + area = jsondata.ArrayField( + models.CharField( + _("country or region"), + blank=True, + default="", + max_length=100, + ), + null=True, + blank=True, + default=list, + ) + language = jsondata.ArrayField( + models.CharField( + blank=True, + default="", + max_length=100, + ), + null=True, + blank=True, + default=list, + ) year = jsondata.IntegerField(null=True, blank=True) season_number = jsondata.IntegerField(null=True, blank=True) single_episode_length = jsondata.IntegerField(null=True, blank=True) - duration = jsondata.CharField(blank=True, default='', max_length=200) + duration = jsondata.CharField(blank=True, default="", max_length=200) class TVSeason(Item): category = ItemCategory.TV - url_path = 'tv/season' - demonstrative = _('这部剧集') + url_path = "tv/season" + demonstrative = _("这部剧集") douban_movie = PrimaryLookupIdDescriptor(IdType.DoubanMovie) imdb = PrimaryLookupIdDescriptor(IdType.IMDB) tmdb_tvseason = PrimaryLookupIdDescriptor(IdType.TMDB_TVSeason) - show = models.ForeignKey(TVShow, null=True, on_delete=models.SET_NULL, related_name='seasons') + show = models.ForeignKey( + TVShow, null=True, on_delete=models.SET_NULL, related_name="seasons" + ) season_number = models.PositiveIntegerField(null=True) episode_count = models.PositiveIntegerField(null=True) METADATA_COPY_LIST = [ - 'title', - 'orig_title', - 'other_title', - 'director', - 'playwright', - 'actor', - 'genre', - 'showtime', - 'site', - 'area', - 'language', - 'year', - 'duration', - 'season_number', - 'episode_count', - 'single_episode_length', - 'brief', + "title", + "orig_title", + "other_title", + "director", + "playwright", + "actor", + "genre", + "showtime", + "site", + "area", + "language", + "year", + "duration", + "season_number", + "episode_count", + "single_episode_length", + "brief", ] - orig_title = jsondata.CharField(_("original title"), blank=True, default='', max_length=500) - other_title = jsondata.ArrayField(models.CharField(_("other title"), blank=True, default='', max_length=500), null=True, blank=True, default=list, ) - director = jsondata.ArrayField(models.CharField(_("director"), blank=True, default='', max_length=200), null=True, blank=True, default=list, ) - playwright = jsondata.ArrayField(models.CharField(_("playwright"), blank=True, default='', max_length=200), null=True, blank=True, default=list, ) - actor = jsondata.ArrayField(models.CharField(_("actor"), blank=True, default='', max_length=200), null=True, blank=True, default=list, ) - genre = jsondata.ArrayField(models.CharField(_("genre"), blank=True, default='', max_length=50), null=True, blank=True, default=list, ) # , choices=MovieGenreEnum.choices - showtime = jsondata.ArrayField(null=True, blank=True, default=list, ) - site = jsondata.URLField(_('site url'), blank=True, default='', max_length=200) - area = jsondata.ArrayField(models.CharField(_("country or region"), blank=True, default='', max_length=100, ), null=True, blank=True, default=list, ) - language = jsondata.ArrayField(models.CharField(blank=True, default='', max_length=100, ), null=True, blank=True, default=list, ) 
+ orig_title = jsondata.CharField( + _("original title"), blank=True, default="", max_length=500 + ) + other_title = jsondata.ArrayField( + models.CharField(_("other title"), blank=True, default="", max_length=500), + null=True, + blank=True, + default=list, + ) + director = jsondata.ArrayField( + models.CharField(_("director"), blank=True, default="", max_length=200), + null=True, + blank=True, + default=list, + ) + playwright = jsondata.ArrayField( + models.CharField(_("playwright"), blank=True, default="", max_length=200), + null=True, + blank=True, + default=list, + ) + actor = jsondata.ArrayField( + models.CharField(_("actor"), blank=True, default="", max_length=200), + null=True, + blank=True, + default=list, + ) + genre = jsondata.ArrayField( + models.CharField(_("genre"), blank=True, default="", max_length=50), + null=True, + blank=True, + default=list, + ) # , choices=MovieGenreEnum.choices + showtime = jsondata.ArrayField( + null=True, + blank=True, + default=list, + ) + site = jsondata.URLField(_("site url"), blank=True, default="", max_length=200) + area = jsondata.ArrayField( + models.CharField( + _("country or region"), + blank=True, + default="", + max_length=100, + ), + null=True, + blank=True, + default=list, + ) + language = jsondata.ArrayField( + models.CharField( + blank=True, + default="", + max_length=100, + ), + null=True, + blank=True, + default=list, + ) year = jsondata.IntegerField(null=True, blank=True) single_episode_length = jsondata.IntegerField(null=True, blank=True) - duration = jsondata.CharField(blank=True, default='', max_length=200) + duration = jsondata.CharField(blank=True, default="", max_length=200) def update_linked_items_from_external_resource(self, resource): """add Work from resource.metadata['work'] if not yet""" links = resource.required_resources + resource.related_resources for w in links: - if w['model'] == 'TVShow': - p = ExternalResource.objects.filter(id_type=w['id_type'], id_value=w['id_value']).first() + if w["model"] == "TVShow": + p = ExternalResource.objects.filter( + id_type=w["id_type"], id_value=w["id_value"] + ).first() if p and p.item and self.show != p.item: self.show = p.item class TVEpisode(Item): category = ItemCategory.TV - url_path = 'tv/episode' - show = models.ForeignKey(TVShow, null=True, on_delete=models.SET_NULL, related_name='episodes') - season = models.ForeignKey(TVSeason, null=True, on_delete=models.SET_NULL, related_name='episodes') + url_path = "tv/episode" + show = models.ForeignKey( + TVShow, null=True, on_delete=models.SET_NULL, related_name="episodes" + ) + season = models.ForeignKey( + TVSeason, null=True, on_delete=models.SET_NULL, related_name="episodes" + ) episode_number = models.PositiveIntegerField(null=True) imdb = PrimaryLookupIdDescriptor(IdType.IMDB) - METADATA_COPY_LIST = ['title', 'brief', 'episode_number'] + METADATA_COPY_LIST = ["title", "brief", "episode_number"] diff --git a/catalog/tv/tests.py b/catalog/tv/tests.py index ffc7ab05..210d514d 100644 --- a/catalog/tv/tests.py +++ b/catalog/tv/tests.py @@ -5,10 +5,10 @@ from catalog.tv.models import * class TMDBTVTestCase(TestCase): def test_parse(self): - t_id = '57243' - t_url = 'https://www.themoviedb.org/tv/57243-doctor-who' - t_url1 = 'https://www.themoviedb.org/tv/57243-doctor-who/seasons' - t_url2 = 'https://www.themoviedb.org/tv/57243' + t_id = "57243" + t_url = "https://www.themoviedb.org/tv/57243-doctor-who" + t_url1 = "https://www.themoviedb.org/tv/57243-doctor-who/seasons" + t_url2 = "https://www.themoviedb.org/tv/57243" p1 = 
SiteManager.get_site_by_id_type(IdType.TMDB_TV) self.assertIsNotNone(p1) self.assertEqual(p1.validate_url(t_url), True) @@ -17,29 +17,29 @@ class TMDBTVTestCase(TestCase): p2 = SiteManager.get_site_by_url(t_url) self.assertEqual(p1.id_to_url(t_id), t_url2) self.assertEqual(p2.url_to_id(t_url), t_id) - wrong_url = 'https://www.themoviedb.org/tv/57243-doctor-who/season/13' + wrong_url = "https://www.themoviedb.org/tv/57243-doctor-who/season/13" s1 = SiteManager.get_site_by_url(wrong_url) self.assertNotIsInstance(s1, TVShow) @use_local_response def test_scrape(self): - t_url = 'https://www.themoviedb.org/tv/57243-doctor-who' + t_url = "https://www.themoviedb.org/tv/57243-doctor-who" site = SiteManager.get_site_by_url(t_url) self.assertEqual(site.ready, False) - self.assertEqual(site.id_value, '57243') + self.assertEqual(site.id_value, "57243") site.get_resource_ready() self.assertEqual(site.ready, True) - self.assertEqual(site.resource.metadata['title'], '神秘博士') + self.assertEqual(site.resource.metadata["title"], "神秘博士") self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB) - self.assertEqual(site.resource.item.__class__.__name__, 'TVShow') - self.assertEqual(site.resource.item.imdb, 'tt0436992') + self.assertEqual(site.resource.item.__class__.__name__, "TVShow") + self.assertEqual(site.resource.item.imdb, "tt0436992") class TMDBTVSeasonTestCase(TestCase): def test_parse(self): - t_id = '57243-11' - t_url = 'https://www.themoviedb.org/tv/57243-doctor-who/season/11' - t_url_unique = 'https://www.themoviedb.org/tv/57243/season/11' + t_id = "57243-11" + t_url = "https://www.themoviedb.org/tv/57243-doctor-who/season/11" + t_url_unique = "https://www.themoviedb.org/tv/57243/season/11" p1 = SiteManager.get_site_by_id_type(IdType.TMDB_TVSeason) self.assertIsNotNone(p1) self.assertEqual(p1.validate_url(t_url), True) @@ -50,48 +50,48 @@ class TMDBTVSeasonTestCase(TestCase): @use_local_response def test_scrape(self): - t_url = 'https://www.themoviedb.org/tv/57243-doctor-who/season/4' + t_url = "https://www.themoviedb.org/tv/57243-doctor-who/season/4" site = SiteManager.get_site_by_url(t_url) self.assertEqual(site.ready, False) - self.assertEqual(site.id_value, '57243-4') + self.assertEqual(site.id_value, "57243-4") site.get_resource_ready() self.assertEqual(site.ready, True) - self.assertEqual(site.resource.metadata['title'], '第 4 季') + self.assertEqual(site.resource.metadata["title"], "第 4 季") self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB) - self.assertEqual(site.resource.item.__class__.__name__, 'TVSeason') - self.assertEqual(site.resource.item.imdb, 'tt1159991') + self.assertEqual(site.resource.item.__class__.__name__, "TVSeason") + self.assertEqual(site.resource.item.imdb, "tt1159991") self.assertIsNotNone(site.resource.item.show) - self.assertEqual(site.resource.item.show.imdb, 'tt0436992') + self.assertEqual(site.resource.item.show.imdb, "tt0436992") class DoubanMovieTVTestCase(TestCase): @use_local_response def test_scrape(self): - url3 = 'https://movie.douban.com/subject/3627919/' + url3 = "https://movie.douban.com/subject/3627919/" p3 = SiteManager.get_site_by_url(url3).get_resource_ready() - self.assertEqual(p3.item.__class__.__name__, 'TVSeason') + self.assertEqual(p3.item.__class__.__name__, "TVSeason") self.assertIsNotNone(p3.item.show) - self.assertEqual(p3.item.show.imdb, 'tt0436992') + self.assertEqual(p3.item.show.imdb, "tt0436992") @use_local_response def test_scrape_singleseason(self): - url3 = 'https://movie.douban.com/subject/26895436/' 
+ url3 = "https://movie.douban.com/subject/26895436/" p3 = SiteManager.get_site_by_url(url3).get_resource_ready() - self.assertEqual(p3.item.__class__.__name__, 'TVShow') + self.assertEqual(p3.item.__class__.__name__, "TVShow") @use_local_response def test_scrape_fix_imdb(self): - url = 'https://movie.douban.com/subject/35597581/' + url = "https://movie.douban.com/subject/35597581/" item = SiteManager.get_site_by_url(url).get_resource_ready().item # this douban links to S6E3, we'll reset it to S6E1 to keep consistant - self.assertEqual(item.imdb, 'tt21599650') + self.assertEqual(item.imdb, "tt21599650") class MultiTVSitesTestCase(TestCase): @use_local_response def test_tvshows(self): - url1 = 'https://www.themoviedb.org/tv/57243-doctor-who' - url2 = 'https://www.imdb.com/title/tt0436992/' + url1 = "https://www.themoviedb.org/tv/57243-doctor-who" + url2 = "https://www.imdb.com/title/tt0436992/" # url3 = 'https://movie.douban.com/subject/3541415/' p1 = SiteManager.get_site_by_url(url1).get_resource_ready() p2 = SiteManager.get_site_by_url(url2).get_resource_ready() @@ -101,9 +101,9 @@ class MultiTVSitesTestCase(TestCase): @use_local_response def test_tvseasons(self): - url1 = 'https://www.themoviedb.org/tv/57243-doctor-who/season/4' - url2 = 'https://www.imdb.com/title/tt1159991/' - url3 = 'https://movie.douban.com/subject/3627919/' + url1 = "https://www.themoviedb.org/tv/57243-doctor-who/season/4" + url2 = "https://www.imdb.com/title/tt1159991/" + url3 = "https://movie.douban.com/subject/3627919/" p1 = SiteManager.get_site_by_url(url1).get_resource_ready() p2 = SiteManager.get_site_by_url(url2).get_resource_ready() p3 = SiteManager.get_site_by_url(url3).get_resource_ready() @@ -114,18 +114,18 @@ class MultiTVSitesTestCase(TestCase): @use_local_response def test_miniseries(self): - url1 = 'https://www.themoviedb.org/tv/86941-the-north-water' - url3 = 'https://movie.douban.com/subject/26895436/' + url1 = "https://www.themoviedb.org/tv/86941-the-north-water" + url3 = "https://movie.douban.com/subject/26895436/" p1 = SiteManager.get_site_by_url(url1).get_resource_ready() p3 = SiteManager.get_site_by_url(url3).get_resource_ready() - self.assertEqual(p3.item.__class__.__name__, 'TVShow') + self.assertEqual(p3.item.__class__.__name__, "TVShow") self.assertEqual(p1.item.id, p3.item.id) @use_local_response def test_tvspecial(self): - url1 = 'https://www.themoviedb.org/movie/282758-doctor-who-the-runaway-bride' - url2 = 'hhttps://www.imdb.com/title/tt0827573/' - url3 = 'https://movie.douban.com/subject/4296866/' + url1 = "https://www.themoviedb.org/movie/282758-doctor-who-the-runaway-bride" + url2 = "hhttps://www.imdb.com/title/tt0827573/" + url3 = "https://movie.douban.com/subject/4296866/" p1 = SiteManager.get_site_by_url(url1).get_resource_ready() p2 = SiteManager.get_site_by_url(url2).get_resource_ready() p3 = SiteManager.get_site_by_url(url3).get_resource_ready() diff --git a/catalog/urls.py b/catalog/urls.py index 723fedf8..66dde9c2 100644 --- a/catalog/urls.py +++ b/catalog/urls.py @@ -3,13 +3,13 @@ from .api import api from .views import * from .models import * -app_name = 'catalog' +app_name = "catalog" def _get_all_url_paths(): - paths = ['item'] + paths = ["item"] for cls in Item.__subclasses__(): - p = getattr(cls, 'url_path', None) + p = getattr(cls, "url_path", None) if p: paths.append(p) res = "|".join(paths) @@ -17,9 +17,31 @@ def _get_all_url_paths(): urlpatterns = [ - re_path(r'^item/(?P[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12})?$', 
retrieve_by_uuid, name='retrieve_by_uuid'), - re_path(r'^(?P<item_path>' + _get_all_url_paths() + ')/(?P<item_uuid>[A-Za-z0-9]{21,22})$', retrieve, name='retrieve'), - re_path(r'^(?P<item_path>' + _get_all_url_paths() + ')/(?P<item_uuid>[A-Za-z0-9]{21,22})/reviews', review_list, name='review_list'), - re_path(r'^(?P<item_path>' + _get_all_url_paths() + ')/(?P<item_uuid>[A-Za-z0-9]{21,22})/marks(?:/(?P<following_only>\\w+))?', mark_list, name='mark_list'), + re_path( + r"^item/(?P<item_uid>[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12})?$", + retrieve_by_uuid, + name="retrieve_by_uuid", + ), + re_path( + r"^(?P<item_path>" + + _get_all_url_paths() + + ")/(?P<item_uuid>[A-Za-z0-9]{21,22})$", + retrieve, + name="retrieve", + ), + re_path( + r"^(?P<item_path>" + + _get_all_url_paths() + + ")/(?P<item_uuid>[A-Za-z0-9]{21,22})/reviews", + review_list, + name="review_list", + ), + re_path( + r"^(?P<item_path>" + + _get_all_url_paths() + + ")/(?P<item_uuid>[A-Za-z0-9]{21,22})/marks(?:/(?P<following_only>\\w+))?", + mark_list, + name="mark_list", + ), path("api/", api.urls), ] diff --git a/catalog/views.py b/catalog/views.py index 956e1523..37fb0882 100644 --- a/catalog/views.py +++ b/catalog/views.py @@ -34,9 +34,9 @@ def retrieve_by_uuid(request, item_uid): def retrieve(request, item_path, item_uuid): - if request.method == 'GET': + if request.method == "GET": item = get_object_or_404(Item, uid=base62.decode(item_uuid)) - item_url = f'/{item_path}/{item_uuid}' + item_url = f"/{item_path}/{item_uuid}" if item.url != item_url: return redirect(item.url) mark = None @@ -44,26 +44,46 @@ def retrieve(request, item_path, item_uuid): mark_list = None review_list = None collection_list = [] - shelf_types = [(n[1], n[2]) for n in iter(ShelfTypeNames) if n[0] == item.category] + shelf_types = [ + (n[1], n[2]) for n in iter(ShelfTypeNames) if n[0] == item.category + ] if request.user.is_authenticated: visible = query_visible(request.user) mark = Mark(request.user, item) _logger.info(mark.rating) review = mark.review - collection_list = item.collections.all().filter(visible).annotate(like_counts=Count('likes')).order_by('-like_counts') - mark_query = ShelfMember.objects.filter(item=item).filter(visible).order_by('-created_time') - mark_list = [member.mark for member in mark_query[:NUM_REVIEWS_ON_ITEM_PAGE]] - review_list = Review.objects.filter(item=item).filter(visible).order_by('-created_time')[:NUM_REVIEWS_ON_ITEM_PAGE] + collection_list = ( + item.collections.all() + .filter(visible) + .annotate(like_counts=Count("likes")) + .order_by("-like_counts") + ) + mark_query = ( + ShelfMember.objects.filter(item=item) + .filter(visible) + .order_by("-created_time") + ) + mark_list = [ + member.mark for member in mark_query[:NUM_REVIEWS_ON_ITEM_PAGE] + ] + review_list = ( + Review.objects.filter(item=item) + .filter(visible) + .order_by("-created_time")[:NUM_REVIEWS_ON_ITEM_PAGE] + ) - return render(request, item.class_name + '.html', { - 'item': item, - 'mark': mark, - 'review': review, - 'mark_list': mark_list, - 'review_list': review_list, - 'collection_list': collection_list, - 'shelf_types': shelf_types, - } + return render( + request, + item.class_name + ".html", + { + "item": item, + "mark": mark, + "review": review, + "mark_list": mark_list, + "review_list": review_list, + "collection_list": collection_list, + "shelf_types": shelf_types, + }, ) else: return HttpResponseBadRequest() @@ -73,23 +93,24 @@ def mark_list(request, item_path, item_uuid, following_only=False): item = get_object_or_404(Item, uid=base62.decode(item_uuid)) if not item: return HttpResponseNotFound("item not found") - queryset = 
ShelfMember.objects.filter(item=item).order_by('-created_time') + queryset = ShelfMember.objects.filter(item=item).order_by("-created_time") if following_only: queryset = queryset.filter(query_following(request.user)) else: queryset = queryset.filter(query_visible(request.user)) paginator = Paginator(queryset, NUM_REVIEWS_ON_LIST_PAGE) - page_number = request.GET.get('page', default=1) + page_number = request.GET.get("page", default=1) marks = paginator.get_page(page_number) marks.pagination = PageLinksGenerator( - PAGE_LINK_NUMBER, page_number, paginator.num_pages) + PAGE_LINK_NUMBER, page_number, paginator.num_pages + ) return render( request, - 'item_mark_list.html', + "item_mark_list.html", { - 'marks': marks, - 'item': item, - } + "marks": marks, + "item": item, + }, ) @@ -97,18 +118,19 @@ def review_list(request, item_path, item_uuid): item = get_object_or_404(Item, uid=base62.decode(item_uuid)) if not item: return HttpResponseNotFound("item not found") - queryset = Review.objects.filter(item=item).order_by('-created_time') + queryset = Review.objects.filter(item=item).order_by("-created_time") queryset = queryset.filter(query_visible(request.user)) paginator = Paginator(queryset, NUM_REVIEWS_ON_LIST_PAGE) - page_number = request.GET.get('page', default=1) + page_number = request.GET.get("page", default=1) reviews = paginator.get_page(page_number) reviews.pagination = PageLinksGenerator( - PAGE_LINK_NUMBER, page_number, paginator.num_pages) + PAGE_LINK_NUMBER, page_number, paginator.num_pages + ) return render( request, - 'item_review_list.html', + "item_review_list.html", { - 'reviews': reviews, - 'item': item, - } + "reviews": reviews, + "item": item, + }, ) diff --git a/journal/apps.py b/journal/apps.py index afe76cb9..e10a1714 100644 --- a/journal/apps.py +++ b/journal/apps.py @@ -2,5 +2,5 @@ from django.apps import AppConfig class JournalConfig(AppConfig): - default_auto_field = 'django.db.models.BigAutoField' - name = 'journal' + default_auto_field = "django.db.models.BigAutoField" + name = "journal" diff --git a/journal/forms.py b/journal/forms.py index d64ee29b..56dcd292 100644 --- a/journal/forms.py +++ b/journal/forms.py @@ -12,27 +12,23 @@ from common.forms import PreviewImageInput class ReviewForm(forms.ModelForm): class Meta: model = Review - fields = [ - 'id', - 'item', - 'title', - 'body', - 'visibility' - ] + fields = ["id", "item", "title", "body", "visibility"] widgets = { - 'item': forms.TextInput(attrs={"hidden": ""}), + "item": forms.TextInput(attrs={"hidden": ""}), } + title = forms.CharField(label=_("评论标题")) body = MarkdownxFormField(label=_("评论正文 (Markdown)")) share_to_mastodon = forms.BooleanField( - label=_("分享到联邦网络"), initial=True, required=False) + label=_("分享到联邦网络"), initial=True, required=False + ) id = forms.IntegerField(required=False, widget=forms.HiddenInput()) visibility = forms.TypedChoiceField( label=_("可见性"), initial=0, coerce=int, choices=VisibilityType.choices, - widget=forms.RadioSelect + widget=forms.RadioSelect, ) @@ -52,26 +48,26 @@ class CollectionForm(forms.ModelForm): initial=0, coerce=int, choices=VisibilityType.choices, - widget=forms.RadioSelect + widget=forms.RadioSelect, ) collaborative = forms.TypedChoiceField( label=_("协作整理权限"), initial=0, coerce=int, choices=COLLABORATIVE_CHOICES, - widget=forms.RadioSelect + widget=forms.RadioSelect, ) class Meta: model = Collection fields = [ - 'title', - 'cover', - 'visibility', - 'collaborative', - 'brief', + "title", + "cover", + "visibility", + "collaborative", + "brief", ] widgets = { - 
'cover': PreviewImageInput(), + "cover": PreviewImageInput(), } diff --git a/journal/mixins.py b/journal/mixins.py index f4d6d529..2aa70efa 100644 --- a/journal/mixins.py +++ b/journal/mixins.py @@ -17,7 +17,11 @@ class UserOwnedObjectMixin: return False if self.visibility == 2: return False - if viewer.is_blocking(owner) or owner.is_blocking(viewer) or viewer.is_muting(owner): + if ( + viewer.is_blocking(owner) + or owner.is_blocking(viewer) + or viewer.is_muting(owner) + ): return False if self.visibility == 1: return viewer.is_following(owner) @@ -25,12 +29,26 @@ class UserOwnedObjectMixin: return True def is_editable_by(self, viewer): - return viewer.is_authenticated and (viewer.is_staff or viewer.is_superuser or viewer == self.owner) + return viewer.is_authenticated and ( + viewer.is_staff or viewer.is_superuser or viewer == self.owner + ) @classmethod def get_available(cls, entity, request_user, following_only=False): # e.g. SongMark.get_available(song, request.user) query_kwargs = {entity.__class__.__name__.lower(): entity} - all_entities = cls.objects.filter(**query_kwargs).order_by("-created_time") # get all marks for song - visible_entities = list(filter(lambda _entity: _entity.is_visible_to(request_user) and (_entity.owner.mastodon_username in request_user.mastodon_following if following_only else True), all_entities)) + all_entities = cls.objects.filter(**query_kwargs).order_by( + "-created_time" + ) # get all marks for song + visible_entities = list( + filter( + lambda _entity: _entity.is_visible_to(request_user) + and ( + _entity.owner.mastodon_username in request_user.mastodon_following + if following_only + else True + ), + all_entities, + ) + ) return visible_entities diff --git a/journal/templatetags/user_actions.py b/journal/templatetags/user_actions.py index 3c08ae1c..b16eec7f 100644 --- a/journal/templatetags/user_actions.py +++ b/journal/templatetags/user_actions.py @@ -7,21 +7,21 @@ register = template.Library() @register.simple_tag(takes_context=True) def wish_item_action(context, item): - user = context['request'].user + user = context["request"].user if user and user.is_authenticated: action = { - 'taken': user.shelf_manager.locate_item(item) is not None, - 'url': reverse("journal:wish", args=[item.uuid]), + "taken": user.shelf_manager.locate_item(item) is not None, + "url": reverse("journal:wish", args=[item.uuid]), } return action @register.simple_tag(takes_context=True) def like_piece_action(context, piece): - user = context['request'].user + user = context["request"].user if user and user.is_authenticated: action = { - 'taken': Like.objects.filter(target=piece, owner=user).first() is not None, - 'url': reverse("journal:like", args=[piece.uuid]), + "taken": Like.objects.filter(target=piece, owner=user).first() is not None, + "url": reverse("journal:like", args=[piece.uuid]), } return action diff --git a/social/apps.py b/social/apps.py index 8af48774..b11df7b3 100644 --- a/social/apps.py +++ b/social/apps.py @@ -2,8 +2,8 @@ from django.apps import AppConfig class SocialConfig(AppConfig): - default_auto_field = 'django.db.models.BigAutoField' - name = 'social' + default_auto_field = "django.db.models.BigAutoField" + name = "social" def ready(self): # load key modules in proper order, make sure class inject and signal works as expected diff --git a/social/models.py b/social/models.py index f47be08c..f1951c54 100644 --- a/social/models.py +++ b/social/models.py @@ -21,23 +21,27 @@ _logger = logging.getLogger(__name__) class ActivityTemplate(models.TextChoices): - 
""" - """ - MarkItem = 'mark_item' - ReviewItem = 'review_item' - CreateCollection = 'create_collection' - LikeCollection = 'like_collection' + """ """ + + MarkItem = "mark_item" + ReviewItem = "review_item" + CreateCollection = "create_collection" + LikeCollection = "like_collection" class LocalActivity(models.Model, UserOwnedObjectMixin): owner = models.ForeignKey(User, on_delete=models.CASCADE) - visibility = models.PositiveSmallIntegerField(default=0) # 0: Public / 1: Follower only / 2: Self only - template = models.CharField(blank=False, choices=ActivityTemplate.choices, max_length=50) + visibility = models.PositiveSmallIntegerField( + default=0 + ) # 0: Public / 1: Follower only / 2: Self only + template = models.CharField( + blank=False, choices=ActivityTemplate.choices, max_length=50 + ) action_object = models.ForeignKey(Piece, on_delete=models.CASCADE) created_time = models.DateTimeField(default=timezone.now, db_index=True) def __str__(self): - return f'Activity [{self.owner}:{self.template}:{self.action_object}]' + return f"Activity [{self.owner}:{self.template}:{self.action_object}]" class ActivityManager: @@ -48,7 +52,11 @@ class ActivityManager: q = Q(owner_id__in=self.owner.following, visibility__lt=2) | Q(owner=self.owner) if before_time: q = q & Q(created_time__lt=before_time) - return LocalActivity.objects.filter(q).order_by('-created_time').prefetch_related('action_object') # .select_related() https://github.com/django-polymorphic/django-polymorphic/pull/531 + return ( + LocalActivity.objects.filter(q) + .order_by("-created_time") + .prefetch_related("action_object") + ) # .select_related() https://github.com/django-polymorphic/django-polymorphic/pull/531 @staticmethod def get_manager_for_user(user): @@ -56,7 +64,7 @@ class ActivityManager: User.activity_manager = cached_property(ActivityManager.get_manager_for_user) -User.activity_manager.__set_name__(User, 'activity_manager') +User.activity_manager.__set_name__(User, "activity_manager") class DataSignalManager: @@ -68,9 +76,9 @@ class DataSignalManager: if processor_class: processor = processor_class(instance) if created: - if hasattr(processor, 'created'): + if hasattr(processor, "created"): processor.created() - elif hasattr(processor, 'updated'): + elif hasattr(processor, "updated"): processor.updated() @staticmethod @@ -78,7 +86,7 @@ class DataSignalManager: processor_class = DataSignalManager.processors.get(instance.__class__) if processor_class: processor = processor_class(instance) - if hasattr(processor, 'deleted'): + if hasattr(processor, "deleted"): processor.deleted() @staticmethod @@ -103,15 +111,17 @@ class DefaultActivityProcessor: def created(self): params = { - 'owner': self.action_object.owner, - 'visibility': self.action_object.visibility, - 'template': self.template, - 'action_object': self.action_object, + "owner": self.action_object.owner, + "visibility": self.action_object.visibility, + "template": self.template, + "action_object": self.action_object, } LocalActivity.objects.create(**params) def updated(self): - activity = LocalActivity.objects.filter(action_object=self.action_object).first() + activity = LocalActivity.objects.filter( + action_object=self.action_object + ).first() if not activity: self.created() elif activity.visibility != self.action_object.visibility: diff --git a/social/urls.py b/social/urls.py index 75a2664f..8df11801 100644 --- a/social/urls.py +++ b/social/urls.py @@ -2,8 +2,8 @@ from django.urls import path, re_path from .views import * -app_name = 'social' +app_name = 
"social" urlpatterns = [ - path('', feed, name='feed'), - path('data', data, name='data'), + path("", feed, name="feed"), + path("data", data, name="data"), ] diff --git a/social/views.py b/social/views.py index cbeb5bc8..27b715a3 100644 --- a/social/views.py +++ b/social/views.py @@ -23,31 +23,35 @@ PAGE_SIZE = 10 @login_required def feed(request): - if request.method != 'GET': + if request.method != "GET": return user = request.user - unread = Announcement.objects.filter(pk__gt=user.read_announcement_index).order_by('-pk') + unread = Announcement.objects.filter(pk__gt=user.read_announcement_index).order_by( + "-pk" + ) if unread: - user.read_announcement_index = Announcement.objects.latest('pk').pk - user.save(update_fields=['read_announcement_index']) + user.read_announcement_index = Announcement.objects.latest("pk").pk + user.save(update_fields=["read_announcement_index"]) return render( request, - 'feed.html', + "feed.html", { - 'top_tags': user.tag_manager.all_tags[:10], - 'unread_announcements': unread, - } + "top_tags": user.tag_manager.all_tags[:10], + "unread_announcements": unread, + }, ) @login_required def data(request): - if request.method != 'GET': + if request.method != "GET": return return render( request, - 'feed_data.html', + "feed_data.html", { - 'activities': ActivityManager(request.user).get_timeline(before_time=request.GET.get('last'))[:PAGE_SIZE], - } + "activities": ActivityManager(request.user).get_timeline( + before_time=request.GET.get("last") + )[:PAGE_SIZE], + }, )