reformat new code with black

Your Name 2022-12-29 23:57:02 -05:00
parent bde7ce47a3
commit 4ee560f6b4
49 changed files with 1915 additions and 1168 deletions
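
Every change below is the mechanical output of running Black with its defaults: string literals are normalized to double quotes and statements are wrapped to the 88-character line limit. As a rough sketch of the transformation (not taken from this commit — the repository was presumably reformatted with something like `black .` from the project root, whereas this snippet uses Black's library API on one of the lines changed below):

import black

# Black's library entry point applies the same defaults as the CLI:
# 88-character lines and double-quoted string literals.
src = "api = NinjaAPI(title=settings.SITE_INFO['site_name'], version='1.0.0')\n"
print(black.format_str(src, mode=black.Mode()))
# -> api = NinjaAPI(title=settings.SITE_INFO["site_name"], version="1.0.0")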

View file

@ -10,7 +10,11 @@ from django.utils.baseconv import base62
from django.shortcuts import render, get_object_or_404, redirect, reverse
from django.http import Http404
api = NinjaAPI(title=settings.SITE_INFO['site_name'], version="1.0.0", description=f"{settings.SITE_INFO['site_name']} API <hr/><a href='{settings.APP_WEBSITE}'>Learn more</a>")
api = NinjaAPI(
title=settings.SITE_INFO["site_name"],
version="1.0.0",
description=f"{settings.SITE_INFO['site_name']} API <hr/><a href='{settings.APP_WEBSITE}'>Learn more</a>",
)
class ItemIn(Schema):

View file

@ -2,8 +2,8 @@ from django.apps import AppConfig
class CatalogConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'catalog'
default_auto_field = "django.db.models.BigAutoField"
name = "catalog"
def ready(self):
# load key modules in proper order, make sure class inject and signal works as expected

View file

@ -25,8 +25,8 @@ from .utils import *
class Edition(Item):
category = ItemCategory.Book
url_path = 'book'
demonstrative = _('这本书')
url_path = "book"
demonstrative = _("这本书")
isbn = PrimaryLookupIdDescriptor(IdType.ISBN)
asin = PrimaryLookupIdDescriptor(IdType.ASIN)
@ -35,30 +35,30 @@ class Edition(Item):
# goodreads = LookupIdDescriptor(IdType.Goodreads)
METADATA_COPY_LIST = [
'title',
'brief',
"title",
"brief",
# legacy fields
'subtitle',
'orig_title',
'author',
'translator',
'language',
'pub_house',
'pub_year',
'pub_month',
'binding',
'price',
'pages',
'contents',
'series',
'imprint',
"subtitle",
"orig_title",
"author",
"translator",
"language",
"pub_house",
"pub_year",
"pub_month",
"binding",
"price",
"pages",
"contents",
"series",
"imprint",
]
subtitle = jsondata.CharField(null=True, blank=True, default=None)
orig_title = jsondata.CharField(null=True, blank=True, default=None)
author = jsondata.ArrayField(_('作者'), null=False, blank=False, default=list)
translator = jsondata.ArrayField(_('译者'), null=True, blank=True, default=list)
author = jsondata.ArrayField(_("作者"), null=False, blank=False, default=list)
translator = jsondata.ArrayField(_("译者"), null=True, blank=True, default=list)
language = jsondata.CharField(_("语言"), null=True, blank=True, default=None)
pub_house = jsondata.CharField(_('出版方'), null=True, blank=True, default=None)
pub_house = jsondata.CharField(_("出版方"), null=True, blank=True, default=None)
pub_year = jsondata.IntegerField(_("发表年份"), null=True, blank=True)
pub_month = jsondata.IntegerField(_("发表月份"), null=True, blank=True)
binding = jsondata.CharField(null=True, blank=True, default=None)
@ -80,8 +80,11 @@ class Edition(Item):
"""add Work from resource.metadata['work'] if not yet"""
links = resource.required_resources + resource.related_resources
for w in links:
if w['model'] == 'Work':
work = Work.objects.filter(primary_lookup_id_type=w['id_type'], primary_lookup_id_value=w['id_value']).first()
if w["model"] == "Work":
work = Work.objects.filter(
primary_lookup_id_type=w["id_type"],
primary_lookup_id_value=w["id_value"],
).first()
if work and work not in self.works.all():
self.works.add(work)
# if not work:
@ -90,15 +93,15 @@ class Edition(Item):
class Work(Item):
category = ItemCategory.Book
url_path = 'book/work'
url_path = "book/work"
douban_work = PrimaryLookupIdDescriptor(IdType.DoubanBook_Work)
goodreads_work = PrimaryLookupIdDescriptor(IdType.Goodreads_Work)
editions = models.ManyToManyField(Edition, related_name='works')
editions = models.ManyToManyField(Edition, related_name="works")
class Series(Item):
category = ItemCategory.Book
url_path = 'book/series'
url_path = "book/series"
# douban_serie = LookupIdDescriptor(IdType.DoubanBook_Serie)
# goodreads_serie = LookupIdDescriptor(IdType.Goodreads_Serie)

View file

@ -8,7 +8,7 @@ class BookTestCase(TestCase):
def setUp(self):
hyperion = Edition.objects.create(title="Hyperion")
hyperion.pages = 500
hyperion.isbn = '9780553283686'
hyperion.isbn = "9780553283686"
hyperion.save()
# hyperion.isbn10 = '0553283685'
@ -22,39 +22,39 @@ class BookTestCase(TestCase):
self.assertEqual(hyperion.title, "Hyperion")
self.assertEqual(hyperion.pages, 500)
self.assertEqual(hyperion.primary_lookup_id_type, IdType.ISBN)
self.assertEqual(hyperion.primary_lookup_id_value, '9780553283686')
self.assertEqual(hyperion.primary_lookup_id_value, "9780553283686")
andymion = Edition(title="Andymion", pages=42)
self.assertEqual(andymion.pages, 42)
def test_lookupids(self):
hyperion = Edition.objects.get(title="Hyperion")
hyperion.asin = 'B004G60EHS'
hyperion.asin = "B004G60EHS"
self.assertEqual(hyperion.primary_lookup_id_type, IdType.ASIN)
self.assertEqual(hyperion.primary_lookup_id_value, 'B004G60EHS')
self.assertEqual(hyperion.primary_lookup_id_value, "B004G60EHS")
self.assertEqual(hyperion.isbn, None)
self.assertEqual(hyperion.isbn10, None)
def test_isbn(self):
t, n = detect_isbn_asin('0553283685')
t, n = detect_isbn_asin("0553283685")
self.assertEqual(t, IdType.ISBN)
self.assertEqual(n, '9780553283686')
t, n = detect_isbn_asin('9780553283686')
self.assertEqual(n, "9780553283686")
t, n = detect_isbn_asin("9780553283686")
self.assertEqual(t, IdType.ISBN)
t, n = detect_isbn_asin(' b0043M6780')
t, n = detect_isbn_asin(" b0043M6780")
self.assertEqual(t, IdType.ASIN)
hyperion = Edition.objects.get(title="Hyperion")
self.assertEqual(hyperion.isbn, '9780553283686')
self.assertEqual(hyperion.isbn10, '0553283685')
hyperion.isbn10 = '0575099437'
self.assertEqual(hyperion.isbn, '9780575099432')
self.assertEqual(hyperion.isbn10, '0575099437')
self.assertEqual(hyperion.isbn, "9780553283686")
self.assertEqual(hyperion.isbn10, "0553283685")
hyperion.isbn10 = "0575099437"
self.assertEqual(hyperion.isbn, "9780575099432")
self.assertEqual(hyperion.isbn10, "0575099437")
def test_work(self):
hyperion_print = Edition.objects.get(title="Hyperion")
hyperion_ebook = Edition(title="Hyperion")
hyperion_ebook.save()
hyperion_ebook.asin = 'B0043M6780'
hyperion_ebook.asin = "B0043M6780"
hyperion = Work(title="Hyperion")
hyperion.save()
hyperion.editions.add(hyperion_print)
@ -69,9 +69,9 @@ class GoodreadsTestCase(TestCase):
def test_parse(self):
t_type = IdType.Goodreads
t_id = '77566'
t_url = 'https://www.goodreads.com/zh/book/show/77566.Hyperion'
t_url2 = 'https://www.goodreads.com/book/show/77566'
t_id = "77566"
t_url = "https://www.goodreads.com/zh/book/show/77566.Hyperion"
t_url2 = "https://www.goodreads.com/book/show/77566"
p1 = SiteManager.get_site_by_id_type(t_type)
p2 = SiteManager.get_site_by_url(t_url)
self.assertEqual(p1.id_to_url(t_id), t_url2)
@ -79,9 +79,9 @@ class GoodreadsTestCase(TestCase):
@use_local_response
def test_scrape(self):
t_url = 'https://www.goodreads.com/book/show/77566.Hyperion'
t_url2 = 'https://www.goodreads.com/book/show/77566'
isbn = '9780553283686'
t_url = "https://www.goodreads.com/book/show/77566.Hyperion"
t_url2 = "https://www.goodreads.com/book/show/77566"
isbn = "9780553283686"
site = SiteManager.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.url, t_url2)
@ -90,39 +90,43 @@ class GoodreadsTestCase(TestCase):
self.assertIsNotNone(site.resource)
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata.get('title'), 'Hyperion')
self.assertEqual(site.resource.metadata.get("title"), "Hyperion")
self.assertEqual(site.resource.get_all_lookup_ids().get(IdType.ISBN), isbn)
self.assertEqual(site.resource.required_resources[0]['id_value'], '1383900')
edition = Edition.objects.get(primary_lookup_id_type=IdType.ISBN, primary_lookup_id_value=isbn)
self.assertEqual(site.resource.required_resources[0]["id_value"], "1383900")
edition = Edition.objects.get(
primary_lookup_id_type=IdType.ISBN, primary_lookup_id_value=isbn
)
resource = edition.external_resources.all().first()
self.assertEqual(resource.id_type, IdType.Goodreads)
self.assertEqual(resource.id_value, '77566')
self.assertNotEqual(resource.cover, '/media/item/default.svg')
self.assertEqual(edition.isbn, '9780553283686')
self.assertEqual(edition.title, 'Hyperion')
self.assertEqual(resource.id_value, "77566")
self.assertNotEqual(resource.cover, "/media/item/default.svg")
self.assertEqual(edition.isbn, "9780553283686")
self.assertEqual(edition.title, "Hyperion")
edition.delete()
site = SiteManager.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.url, t_url2)
site.get_resource()
self.assertEqual(site.ready, True, 'previous resource should still exist with data')
self.assertEqual(
site.ready, True, "previous resource should still exist with data"
)
@use_local_response
def test_asin(self):
t_url = 'https://www.goodreads.com/book/show/45064996-hyperion'
t_url = "https://www.goodreads.com/book/show/45064996-hyperion"
site = SiteManager.get_site_by_url(t_url)
site.get_resource_ready()
self.assertEqual(site.resource.item.title, 'Hyperion')
self.assertEqual(site.resource.item.asin, 'B004G60EHS')
self.assertEqual(site.resource.item.title, "Hyperion")
self.assertEqual(site.resource.item.asin, "B004G60EHS")
@use_local_response
def test_work(self):
url = 'https://www.goodreads.com/work/editions/153313'
url = "https://www.goodreads.com/work/editions/153313"
p = SiteManager.get_site_by_url(url).get_resource_ready()
self.assertEqual(p.item.title, '1984')
url1 = 'https://www.goodreads.com/book/show/3597767-rok-1984'
url2 = 'https://www.goodreads.com/book/show/40961427-1984'
self.assertEqual(p.item.title, "1984")
url1 = "https://www.goodreads.com/book/show/3597767-rok-1984"
url2 = "https://www.goodreads.com/book/show/40961427-1984"
p1 = SiteManager.get_site_by_url(url1).get_resource_ready()
p2 = SiteManager.get_site_by_url(url2).get_resource_ready()
w1 = p1.item.works.all().first()
@ -133,9 +137,9 @@ class GoodreadsTestCase(TestCase):
class GoogleBooksTestCase(TestCase):
def test_parse(self):
t_type = IdType.GoogleBooks
t_id = 'hV--zQEACAAJ'
t_url = 'https://books.google.com.bn/books?id=hV--zQEACAAJ&hl=ms'
t_url2 = 'https://books.google.com/books?id=hV--zQEACAAJ'
t_id = "hV--zQEACAAJ"
t_url = "https://books.google.com.bn/books?id=hV--zQEACAAJ&hl=ms"
t_url2 = "https://books.google.com/books?id=hV--zQEACAAJ"
p1 = SiteManager.get_site_by_url(t_url)
p2 = SiteManager.get_site_by_url(t_url2)
self.assertIsNotNone(p1)
@ -146,17 +150,19 @@ class GoogleBooksTestCase(TestCase):
@use_local_response
def test_scrape(self):
t_url = 'https://books.google.com.bn/books?id=hV--zQEACAAJ'
t_url = "https://books.google.com.bn/books?id=hV--zQEACAAJ"
site = SiteManager.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata.get('title'), '1984 Nineteen Eighty-Four')
self.assertEqual(site.resource.metadata.get('isbn'), '9781847498571')
self.assertEqual(
site.resource.metadata.get("title"), "1984 Nineteen Eighty-Four"
)
self.assertEqual(site.resource.metadata.get("isbn"), "9781847498571")
self.assertEqual(site.resource.id_type, IdType.GoogleBooks)
self.assertEqual(site.resource.id_value, 'hV--zQEACAAJ')
self.assertEqual(site.resource.item.isbn, '9781847498571')
self.assertEqual(site.resource.item.title, '1984 Nineteen Eighty-Four')
self.assertEqual(site.resource.id_value, "hV--zQEACAAJ")
self.assertEqual(site.resource.item.isbn, "9781847498571")
self.assertEqual(site.resource.item.title, "1984 Nineteen Eighty-Four")
class DoubanBookTestCase(TestCase):
@ -165,9 +171,9 @@ class DoubanBookTestCase(TestCase):
def test_parse(self):
t_type = IdType.DoubanBook
t_id = '35902899'
t_url = 'https://m.douban.com/book/subject/35902899/'
t_url2 = 'https://book.douban.com/subject/35902899/'
t_id = "35902899"
t_url = "https://m.douban.com/book/subject/35902899/"
t_url2 = "https://book.douban.com/subject/35902899/"
p1 = SiteManager.get_site_by_url(t_url)
p2 = SiteManager.get_site_by_url(t_url2)
self.assertEqual(p1.url, t_url2)
@ -177,44 +183,46 @@ class DoubanBookTestCase(TestCase):
@use_local_response
def test_scrape(self):
t_url = 'https://book.douban.com/subject/35902899/'
t_url = "https://book.douban.com/subject/35902899/"
site = SiteManager.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.site_name, SiteName.Douban)
self.assertEqual(site.resource.metadata.get('title'), '1984 Nineteen Eighty-Four')
self.assertEqual(site.resource.metadata.get('isbn'), '9781847498571')
self.assertEqual(
site.resource.metadata.get("title"), "1984 Nineteen Eighty-Four"
)
self.assertEqual(site.resource.metadata.get("isbn"), "9781847498571")
self.assertEqual(site.resource.id_type, IdType.DoubanBook)
self.assertEqual(site.resource.id_value, '35902899')
self.assertEqual(site.resource.item.isbn, '9781847498571')
self.assertEqual(site.resource.item.title, '1984 Nineteen Eighty-Four')
self.assertEqual(site.resource.id_value, "35902899")
self.assertEqual(site.resource.item.isbn, "9781847498571")
self.assertEqual(site.resource.item.title, "1984 Nineteen Eighty-Four")
@use_local_response
def test_work(self):
# url = 'https://www.goodreads.com/work/editions/153313'
url1 = 'https://book.douban.com/subject/1089243/'
url2 = 'https://book.douban.com/subject/2037260/'
url1 = "https://book.douban.com/subject/1089243/"
url2 = "https://book.douban.com/subject/2037260/"
p1 = SiteManager.get_site_by_url(url1).get_resource_ready()
p2 = SiteManager.get_site_by_url(url2).get_resource_ready()
w1 = p1.item.works.all().first()
w2 = p2.item.works.all().first()
self.assertEqual(w1.title, '黄金时代')
self.assertEqual(w2.title, '黄金时代')
self.assertEqual(w1.title, "黄金时代")
self.assertEqual(w2.title, "黄金时代")
self.assertEqual(w1, w2)
editions = w1.editions.all().order_by('title')
editions = w1.editions.all().order_by("title")
self.assertEqual(editions.count(), 2)
self.assertEqual(editions[0].title, 'Wang in Love and Bondage')
self.assertEqual(editions[1].title, '黄金时代')
self.assertEqual(editions[0].title, "Wang in Love and Bondage")
self.assertEqual(editions[1].title, "黄金时代")
class MultiBookSitesTestCase(TestCase):
@use_local_response
def test_editions(self):
# isbn = '9781847498571'
url1 = 'https://www.goodreads.com/book/show/56821625-1984'
url2 = 'https://book.douban.com/subject/35902899/'
url3 = 'https://books.google.com/books?id=hV--zQEACAAJ'
url1 = "https://www.goodreads.com/book/show/56821625-1984"
url2 = "https://book.douban.com/subject/35902899/"
url3 = "https://books.google.com/books?id=hV--zQEACAAJ"
p1 = SiteManager.get_site_by_url(url1).get_resource_ready()
p2 = SiteManager.get_site_by_url(url2).get_resource_ready()
p3 = SiteManager.get_site_by_url(url3).get_resource_ready()
@ -224,11 +232,13 @@ class MultiBookSitesTestCase(TestCase):
@use_local_response
def test_works(self):
# url1 and url4 has same ISBN, hence they share same Edition instance, which belongs to 2 Work instances
url1 = 'https://book.douban.com/subject/1089243/'
url2 = 'https://book.douban.com/subject/2037260/'
url3 = 'https://www.goodreads.com/book/show/59952545-golden-age'
url4 = 'https://www.goodreads.com/book/show/11798823'
p1 = SiteManager.get_site_by_url(url1).get_resource_ready() # lxml bug may break this
url1 = "https://book.douban.com/subject/1089243/"
url2 = "https://book.douban.com/subject/2037260/"
url3 = "https://www.goodreads.com/book/show/59952545-golden-age"
url4 = "https://www.goodreads.com/book/show/11798823"
p1 = SiteManager.get_site_by_url(
url1
).get_resource_ready() # lxml bug may break this
w1 = p1.item.works.all().first()
p2 = SiteManager.get_site_by_url(url2).get_resource_ready()
w2 = p2.item.works.all().first()
@ -241,13 +251,13 @@ class MultiBookSitesTestCase(TestCase):
self.assertEqual(p4.item.id, p1.item.id)
self.assertEqual(p4.item.works.all().count(), 2)
self.assertEqual(p1.item.works.all().count(), 2)
w2e = w2.editions.all().order_by('title')
w2e = w2.editions.all().order_by("title")
self.assertEqual(w2e.count(), 2)
self.assertEqual(w2e[0].title, 'Wang in Love and Bondage')
self.assertEqual(w2e[1].title, '黄金时代')
w3e = w3.editions.all().order_by('title')
self.assertEqual(w2e[0].title, "Wang in Love and Bondage")
self.assertEqual(w2e[1].title, "黄金时代")
w3e = w3.editions.all().order_by("title")
self.assertEqual(w3e.count(), 2)
self.assertEqual(w3e[0].title, 'Golden Age: A Novel')
self.assertEqual(w3e[1].title, '黄金时代')
self.assertEqual(w3e[0].title, "Golden Age: A Novel")
self.assertEqual(w3e[1].title, "黄金时代")
e = Edition.objects.get(primary_lookup_id_value=9781662601217)
self.assertEqual(e.title, 'Golden Age: A Novel')
self.assertEqual(e.title, "Golden Age: A Novel")

View file

@ -10,7 +10,7 @@ def check_digit_10(isbn):
w = i + 1
sum += w * c
r = sum % 11
return 'X' if r == 10 else str(r)
return "X" if r == 10 else str(r)
def check_digit_13(isbn):
@ -21,38 +21,38 @@ def check_digit_13(isbn):
w = 3 if i % 2 else 1
sum += w * c
r = 10 - (sum % 10)
return '0' if r == 10 else str(r)
return "0" if r == 10 else str(r)
def isbn_10_to_13(isbn):
if not isbn or len(isbn) != 10:
return None
return '978' + isbn[:-1] + check_digit_13('978' + isbn[:-1])
return "978" + isbn[:-1] + check_digit_13("978" + isbn[:-1])
def isbn_13_to_10(isbn):
if not isbn or len(isbn) != 13 or isbn[:3] != '978':
if not isbn or len(isbn) != 13 or isbn[:3] != "978":
return None
else:
return isbn[3:12] + check_digit_10(isbn[3:12])
def is_isbn_13(isbn):
return re.match(r'\d{13}', isbn) is not None
return re.match(r"\d{13}", isbn) is not None
def is_isbn_10(isbn):
return re.match(r'\d{9}[X0-9]', isbn) is not None
return re.match(r"\d{9}[X0-9]", isbn) is not None
def is_asin(asin):
return re.match(r'B[A-Z0-9]{9}', asin) is not None
return re.match(r"B[A-Z0-9]{9}", asin) is not None
def detect_isbn_asin(s):
if not s:
return None, None
n = re.sub(r'[^0-9A-Z]', '', s.upper())
n = re.sub(r"[^0-9A-Z]", "", s.upper())
if is_isbn_13(n):
return IdType.ISBN, n
if is_isbn_10(n):
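
As a quick sanity check of the ISBN helpers reformatted above, here is a small sketch (not part of the commit) reusing the ISBN pair already exercised in the book tests; the import path is an assumption, since the diff view does not show file names:

# Hypothetical import path -- adjust to wherever these helpers live in the project.
from catalog.book.utils import check_digit_13, isbn_10_to_13

# Weights alternate 1,3,1,3,... so "978055328368" sums to 114 and 10 - (114 % 10) = 6.
assert check_digit_13("978055328368") == "6"
# Drop the ISBN-10 check digit, prefix "978", then append the recomputed digit.
assert isbn_10_to_13("0553283685") == "9780553283686"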

View file

@ -5,4 +5,28 @@ from .scrapers import *
from . import jsondata
__all__ = ('IdType', 'SiteName', 'ItemCategory', 'Item', 'ExternalResource', 'ResourceContent', 'ParseError', 'AbstractSite', 'SiteManager', 'jsondata', 'PrimaryLookupIdDescriptor', 'LookupIdDescriptor', 'get_mock_mode', 'get_mock_file', 'use_local_response', 'RetryDownloader', 'BasicDownloader', 'ProxiedDownloader', 'BasicImageDownloader', 'RESPONSE_OK', 'RESPONSE_NETWORK_ERROR', 'RESPONSE_INVALID_CONTENT', 'RESPONSE_CENSORSHIP')
__all__ = (
"IdType",
"SiteName",
"ItemCategory",
"Item",
"ExternalResource",
"ResourceContent",
"ParseError",
"AbstractSite",
"SiteManager",
"jsondata",
"PrimaryLookupIdDescriptor",
"LookupIdDescriptor",
"get_mock_mode",
"get_mock_file",
"use_local_response",
"RetryDownloader",
"BasicDownloader",
"ProxiedDownloader",
"BasicImageDownloader",
"RESPONSE_OK",
"RESPONSE_NETWORK_ERROR",
"RESPONSE_INVALID_CONTENT",
"RESPONSE_CENSORSHIP",
)

View file

@ -29,6 +29,7 @@ def use_local_response(func):
set_mock_mode(True)
func(args)
set_mock_mode(False)
return _func
@ -43,9 +44,9 @@ def get_mock_mode():
def get_mock_file(url):
fn = url.replace('***REMOVED***', '1234') # Thank you, Github Action -_-!
fn = re.sub(r'[^\w]', '_', fn)
fn = re.sub(r'_key_[*A-Za-z0-9]+', '_key_8964', fn)
fn = url.replace("***REMOVED***", "1234") # Thank you, Github Action -_-!
fn = re.sub(r"[^\w]", "_", fn)
fn = re.sub(r"_key_[*A-Za-z0-9]+", "_key_8964", fn)
return fn
@ -61,21 +62,23 @@ class DownloadError(Exception):
error = "Censored Content"
else:
error = "Unknown Error"
self.message = f"Download Failed: {error}{', ' + msg if msg else ''}, url: {self.url}"
self.message = (
f"Download Failed: {error}{', ' + msg if msg else ''}, url: {self.url}"
)
super().__init__(self.message)
class BasicDownloader:
headers = {
# 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0',
'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
'DNT': '1',
'Upgrade-Insecure-Requests': '1',
'Cache-Control': 'no-cache',
"User-Agent": "Mozilla/5.0 (iPad; CPU OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"DNT": "1",
"Upgrade-Insecure-Requests": "1",
"Cache-Control": "no-cache",
}
def __init__(self, url, headers=None):
@ -100,18 +103,28 @@ class BasicDownloader:
try:
if not _mock_mode:
# TODO cache = get/set from redis
resp = requests.get(url, headers=self.headers, timeout=self.get_timeout())
resp = requests.get(
url, headers=self.headers, timeout=self.get_timeout()
)
if settings.DOWNLOADER_SAVEDIR:
with open(settings.DOWNLOADER_SAVEDIR + '/' + get_mock_file(url), 'w', encoding='utf-8') as fp:
with open(
settings.DOWNLOADER_SAVEDIR + "/" + get_mock_file(url),
"w",
encoding="utf-8",
) as fp:
fp.write(resp.text)
else:
resp = MockResponse(self.url)
response_type = self.validate_response(resp)
self.logs.append({'response_type': response_type, 'url': url, 'exception': None})
self.logs.append(
{"response_type": response_type, "url": url, "exception": None}
)
return resp, response_type
except RequestException as e:
self.logs.append({'response_type': RESPONSE_NETWORK_ERROR, 'url': url, 'exception': e})
self.logs.append(
{"response_type": RESPONSE_NETWORK_ERROR, "url": url, "exception": e}
)
return None, RESPONSE_NETWORK_ERROR
def download(self):
@ -126,16 +139,26 @@ class ProxiedDownloader(BasicDownloader):
def get_proxied_urls(self):
urls = []
if settings.PROXYCRAWL_KEY is not None:
urls.append(f'https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={self.url}')
urls.append(
f"https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={self.url}"
)
if settings.SCRAPESTACK_KEY is not None:
# urls.append(f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={self.url}')
urls.append(f'http://api.scrapestack.com/scrape?keep_headers=1&access_key={settings.SCRAPESTACK_KEY}&url={self.url}')
urls.append(
f"http://api.scrapestack.com/scrape?keep_headers=1&access_key={settings.SCRAPESTACK_KEY}&url={self.url}"
)
if settings.SCRAPERAPI_KEY is not None:
urls.append(f'http://api.scraperapi.com/?api_key={settings.SCRAPERAPI_KEY}&url={self.url}')
urls.append(
f"http://api.scraperapi.com/?api_key={settings.SCRAPERAPI_KEY}&url={self.url}"
)
return urls
def get_special_proxied_url(self):
return f'{settings.LOCAL_PROXY}?url={self.url}' if settings.LOCAL_PROXY is not None else None
return (
f"{settings.LOCAL_PROXY}?url={self.url}"
if settings.LOCAL_PROXY is not None
else None
)
def download(self):
urls = self.get_proxied_urls()
@ -144,7 +167,11 @@ class ProxiedDownloader(BasicDownloader):
resp = None
while url:
resp, resp_type = self._download(url)
if resp_type == RESPONSE_OK or resp_type == RESPONSE_INVALID_CONTENT or last_try:
if (
resp_type == RESPONSE_OK
or resp_type == RESPONSE_INVALID_CONTENT
or last_try
):
url = None
elif resp_type == RESPONSE_CENSORSHIP:
url = self.get_special_proxied_url()
@ -169,15 +196,15 @@ class RetryDownloader(BasicDownloader):
elif self.response_type != RESPONSE_NETWORK_ERROR and retries == 0:
raise DownloadError(self)
elif retries > 0:
_logger.debug('Retry ' + self.url)
_logger.debug("Retry " + self.url)
time.sleep((settings.DOWNLOADER_RETRIES - retries) * 0.5)
raise DownloadError(self, 'max out of retries')
raise DownloadError(self, "max out of retries")
class ImageDownloaderMixin:
def __init__(self, url, referer=None):
if referer is not None:
self.headers['Referer'] = referer
self.headers["Referer"] = referer
super().__init__(url)
def validate_response(self, response):
@ -186,8 +213,10 @@ class ImageDownloaderMixin:
raw_img = response.content
img = Image.open(BytesIO(raw_img))
img.load() # corrupted image will trigger exception
content_type = response.headers.get('Content-Type')
self.extention = filetype.get_type(mime=content_type.partition(';')[0].strip()).extension
content_type = response.headers.get("Content-Type")
self.extention = filetype.get_type(
mime=content_type.partition(";")[0].strip()
).extension
return RESPONSE_OK
except Exception:
return RESPONSE_NETWORK_ERROR
@ -213,7 +242,9 @@ class ProxiedImageDownloader(ImageDownloaderMixin, ProxiedDownloader):
pass
_local_response_path = str(Path(__file__).parent.parent.parent.absolute()) + '/test_data/'
_local_response_path = (
str(Path(__file__).parent.parent.parent.absolute()) + "/test_data/"
)
class MockResponse:
@ -225,23 +256,27 @@ class MockResponse:
self.status_code = 200
_logger.debug(f"use local response for {url} from {fn}")
except Exception:
self.content = b'Error: response file not found'
self.content = b"Error: response file not found"
self.status_code = 404
_logger.debug(f"local response not found for {url} at {fn}")
@property
def text(self):
return self.content.decode('utf-8')
return self.content.decode("utf-8")
def json(self):
return json.load(StringIO(self.text))
def html(self):
return html.fromstring(self.text) # may throw exception unexpectedly due to OS bug, see https://github.com/neodb-social/neodb/issues/5
return html.fromstring(
self.text
) # may throw exception unexpectedly due to OS bug, see https://github.com/neodb-social/neodb/issues/5
@property
def headers(self):
return {'Content-Type': 'image/jpeg' if self.url.endswith('jpg') else 'text/html'}
return {
"Content-Type": "image/jpeg" if self.url.endswith("jpg") else "text/html"
}
requests.Response.html = MockResponse.html

View file

@ -24,25 +24,29 @@ class ResourceContent:
cover_image_extention: str = None
def dict(self):
return {'metadata': self.metadata, 'lookup_ids': self.lookup_ids}
return {"metadata": self.metadata, "lookup_ids": self.lookup_ids}
def to_json(self) -> str:
return json.dumps({'metadata': self.metadata, 'lookup_ids': self.lookup_ids})
return json.dumps({"metadata": self.metadata, "lookup_ids": self.lookup_ids})
class AbstractSite:
"""
Abstract class to represent a site
"""
SITE_NAME = None
ID_TYPE = None
WIKI_PROPERTY_ID = 'P0undefined0'
WIKI_PROPERTY_ID = "P0undefined0"
DEFAULT_MODEL = None
URL_PATTERNS = [r"\w+://undefined/(\d+)"]
@classmethod
def validate_url(self, url: str):
u = next(iter([re.match(p, url) for p in self.URL_PATTERNS if re.match(p, url)]), None)
u = next(
iter([re.match(p, url) for p in self.URL_PATTERNS if re.match(p, url)]),
None,
)
return u is not None
@classmethod
@ -51,15 +55,18 @@ class AbstractSite:
@classmethod
def id_to_url(self, id_value):
return 'https://undefined/' + id_value
return "https://undefined/" + id_value
@classmethod
def url_to_id(self, url: str):
u = next(iter([re.match(p, url) for p in self.URL_PATTERNS if re.match(p, url)]), None)
u = next(
iter([re.match(p, url) for p in self.URL_PATTERNS if re.match(p, url)]),
None,
)
return u[1] if u else None
def __str__(self):
return f'<{self.__class__.__name__}: {self.url}>'
return f"<{self.__class__.__name__}: {self.url}>"
def __init__(self, url=None):
self.id_value = self.url_to_id(url) if url else None
@ -70,7 +77,9 @@ class AbstractSite:
if not self.resource:
self.resource = ExternalResource.objects.filter(url=self.url).first()
if self.resource is None:
self.resource = ExternalResource(id_type=self.ID_TYPE, id_value=self.id_value, url=self.url)
self.resource = ExternalResource(
id_type=self.ID_TYPE, id_value=self.id_value, url=self.url
)
return self.resource
def scrape(self) -> ResourceContent:
@ -91,11 +100,13 @@ class AbstractSite:
model = self.DEFAULT_MODEL
t, v = model.get_best_lookup_id(p.get_all_lookup_ids())
if t is not None:
p.item = model.objects.filter(primary_lookup_id_type=t, primary_lookup_id_value=v).first()
p.item = model.objects.filter(
primary_lookup_id_type=t, primary_lookup_id_value=v
).first()
if p.item is None:
obj = model.copy_metadata(p.metadata)
obj['primary_lookup_id_type'] = t
obj['primary_lookup_id_value'] = v
obj["primary_lookup_id_type"] = t
obj["primary_lookup_id_value"] = v
p.item = model.objects.create(**obj)
return p.item
@ -103,10 +114,17 @@ class AbstractSite:
def ready(self):
return bool(self.resource and self.resource.ready)
def get_resource_ready(self, auto_save=True, auto_create=True, auto_link=True, preloaded_content=None, ignore_existing_content=False):
def get_resource_ready(
self,
auto_save=True,
auto_create=True,
auto_link=True,
preloaded_content=None,
ignore_existing_content=False,
):
"""
Returns an ExternalResource in scraped state if possible
Parameters
----------
auto_save : bool
@ -137,7 +155,7 @@ class AbstractSite:
resource_content = self.scrape()
p.update_content(resource_content)
if not p.ready:
_logger.error(f'unable to get resource {self.url} ready')
_logger.error(f"unable to get resource {self.url} ready")
return None
if auto_create and p.item is None:
self.get_item()
@ -148,9 +166,12 @@ class AbstractSite:
p.item.save()
if auto_link:
for linked_resource in p.required_resources:
linked_site = SiteManager.get_site_by_url(linked_resource['url'])
linked_site = SiteManager.get_site_by_url(linked_resource["url"])
if linked_site:
linked_site.get_resource_ready(auto_link=False, preloaded_content=linked_resource.get('content'))
linked_site.get_resource_ready(
auto_link=False,
preloaded_content=linked_resource.get("content"),
)
else:
_logger.error(f'unable to get site for {linked_resource["url"]}')
p.item.update_linked_items_from_external_resource(p)
@ -165,7 +186,7 @@ class SiteManager:
def register(target) -> Callable:
id_type = target.ID_TYPE
if id_type in SiteManager.registry:
raise ValueError(f'Site for {id_type} already exists')
raise ValueError(f"Site for {id_type} already exists")
SiteManager.registry[id_type] = target
return target
@ -175,9 +196,17 @@ class SiteManager:
@staticmethod
def get_site_by_url(url: str):
cls = next(filter(lambda p: p.validate_url(url), SiteManager.registry.values()), None)
cls = next(
filter(lambda p: p.validate_url(url), SiteManager.registry.values()), None
)
if cls is None:
cls = next(filter(lambda p: p.validate_url_fallback(url), SiteManager.registry.values()), None)
cls = next(
filter(
lambda p: p.validate_url_fallback(url),
SiteManager.registry.values(),
),
None,
)
return cls(url) if cls else None
@staticmethod
@ -190,5 +219,7 @@ class SiteManager:
return SiteManager.get_site_by_id_type(resource.id_type)
ExternalResource.get_site = lambda resource: SiteManager.get_site_by_id_type(resource.id_type)
ExternalResource.get_site = lambda resource: SiteManager.get_site_by_id_type(
resource.id_type
)
# ExternalResource.get_site = SiteManager.get_site_by_resource

View file

@ -6,9 +6,14 @@ import uuid
_logger = logging.getLogger(__name__)
DEFAULT_ITEM_COVER = 'item/default.svg'
DEFAULT_ITEM_COVER = "item/default.svg"
def item_cover_path(resource, filename):
fn = timezone.now().strftime('%Y/%m/%d/') + str(uuid.uuid4()) + '.' + filename.split('.')[-1]
return 'item/' + resource.id_type + '/' + fn
fn = (
timezone.now().strftime("%Y/%m/%d/")
+ str(uuid.uuid4())
+ "."
+ filename.split(".")[-1]
)
return "item/" + resource.id_type + "/" + fn

View file

@ -5,66 +5,63 @@ from django.db import models
class Game(Item):
category = ItemCategory.Game
url_path = 'game'
demonstrative = _('这个游戏')
url_path = "game"
demonstrative = _("这个游戏")
igdb = PrimaryLookupIdDescriptor(IdType.IGDB)
steam = PrimaryLookupIdDescriptor(IdType.Steam)
douban_game = PrimaryLookupIdDescriptor(IdType.DoubanGame)
METADATA_COPY_LIST = [
'title',
'brief',
'other_title',
'developer',
'publisher',
'release_date',
'genre',
'platform',
'official_site',
"title",
"brief",
"other_title",
"developer",
"publisher",
"release_date",
"genre",
"platform",
"official_site",
]
other_title = jsondata.ArrayField(
models.CharField(blank=True, default='', max_length=500),
models.CharField(blank=True, default="", max_length=500),
null=True,
blank=True,
default=list,
)
developer = jsondata.ArrayField(
models.CharField(blank=True, default='', max_length=500),
models.CharField(blank=True, default="", max_length=500),
null=True,
blank=True,
default=list,
)
publisher = jsondata.ArrayField(
models.CharField(blank=True, default='', max_length=500),
models.CharField(blank=True, default="", max_length=500),
null=True,
blank=True,
default=list,
)
release_date = jsondata.DateField(
auto_now=False,
auto_now_add=False,
null=True,
blank=True
auto_now=False, auto_now_add=False, null=True, blank=True
)
genre = jsondata.ArrayField(
models.CharField(blank=True, default='', max_length=200),
models.CharField(blank=True, default="", max_length=200),
null=True,
blank=True,
default=list,
)
platform = jsondata.ArrayField(
models.CharField(blank=True, default='', max_length=200),
models.CharField(blank=True, default="", max_length=200),
null=True,
blank=True,
default=list,
)
official_site = jsondata.CharField(
default='',
default="",
)

View file

@ -6,8 +6,8 @@ from catalog.models import *
class IGDBTestCase(TestCase):
def test_parse(self):
t_id_type = IdType.IGDB
t_id_value = 'portal-2'
t_url = 'https://www.igdb.com/games/portal-2'
t_id_value = "portal-2"
t_url = "https://www.igdb.com/games/portal-2"
site = SiteManager.get_site_by_id_type(t_id_type)
self.assertIsNotNone(site)
self.assertEqual(site.validate_url(t_url), True)
@ -17,34 +17,39 @@ class IGDBTestCase(TestCase):
@use_local_response
def test_scrape(self):
t_url = 'https://www.igdb.com/games/portal-2'
t_url = "https://www.igdb.com/games/portal-2"
site = SiteManager.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata['title'], 'Portal 2')
self.assertEqual(site.resource.metadata["title"], "Portal 2")
self.assertIsInstance(site.resource.item, Game)
self.assertEqual(site.resource.item.steam, '620')
self.assertEqual(site.resource.item.steam, "620")
@use_local_response
def test_scrape_non_steam(self):
t_url = 'https://www.igdb.com/games/the-legend-of-zelda-breath-of-the-wild'
t_url = "https://www.igdb.com/games/the-legend-of-zelda-breath-of-the-wild"
site = SiteManager.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata['title'], 'The Legend of Zelda: Breath of the Wild')
self.assertEqual(
site.resource.metadata["title"], "The Legend of Zelda: Breath of the Wild"
)
self.assertIsInstance(site.resource.item, Game)
self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IGDB)
self.assertEqual(site.resource.item.primary_lookup_id_value, 'the-legend-of-zelda-breath-of-the-wild')
self.assertEqual(
site.resource.item.primary_lookup_id_value,
"the-legend-of-zelda-breath-of-the-wild",
)
class SteamTestCase(TestCase):
def test_parse(self):
t_id_type = IdType.Steam
t_id_value = '620'
t_url = 'https://store.steampowered.com/app/620/Portal_2/'
t_url2 = 'https://store.steampowered.com/app/620'
t_id_value = "620"
t_url = "https://store.steampowered.com/app/620/Portal_2/"
t_url2 = "https://store.steampowered.com/app/620"
site = SiteManager.get_site_by_id_type(t_id_type)
self.assertIsNotNone(site)
self.assertEqual(site.validate_url(t_url), True)
@ -54,22 +59,24 @@ class SteamTestCase(TestCase):
@use_local_response
def test_scrape(self):
t_url = 'https://store.steampowered.com/app/620/Portal_2/'
t_url = "https://store.steampowered.com/app/620/Portal_2/"
site = SiteManager.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata['title'], 'Portal 2')
self.assertEqual(site.resource.metadata['brief'], '“终身测试计划”现已升级,您可以为您自己或您的好友设计合作谜题!')
self.assertEqual(site.resource.metadata["title"], "Portal 2")
self.assertEqual(
site.resource.metadata["brief"], "“终身测试计划”现已升级,您可以为您自己或您的好友设计合作谜题!"
)
self.assertIsInstance(site.resource.item, Game)
self.assertEqual(site.resource.item.steam, '620')
self.assertEqual(site.resource.item.steam, "620")
class DoubanGameTestCase(TestCase):
def test_parse(self):
t_id_type = IdType.DoubanGame
t_id_value = '10734307'
t_url = 'https://www.douban.com/game/10734307/'
t_id_value = "10734307"
t_url = "https://www.douban.com/game/10734307/"
site = SiteManager.get_site_by_id_type(t_id_type)
self.assertIsNotNone(site)
self.assertEqual(site.validate_url(t_url), True)
@ -79,21 +86,21 @@ class DoubanGameTestCase(TestCase):
@use_local_response
def test_scrape(self):
t_url = 'https://www.douban.com/game/10734307/'
t_url = "https://www.douban.com/game/10734307/"
site = SiteManager.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata['title'], '传送门2 Portal 2')
self.assertEqual(site.resource.metadata["title"], "传送门2 Portal 2")
self.assertIsInstance(site.resource.item, Game)
self.assertEqual(site.resource.item.douban_game, '10734307')
self.assertEqual(site.resource.item.douban_game, "10734307")
class BangumiGameTestCase(TestCase):
def test_parse(self):
t_id_type = IdType.Bangumi
t_id_value = '15912'
t_url = 'https://bgm.tv/subject/15912'
t_id_value = "15912"
t_url = "https://bgm.tv/subject/15912"
site = SiteManager.get_site_by_id_type(t_id_type)
self.assertIsNotNone(site)
self.assertEqual(site.validate_url(t_url), True)
@ -110,8 +117,8 @@ class BangumiGameTestCase(TestCase):
class MultiGameSitesTestCase(TestCase):
@use_local_response
def test_games(self):
url1 = 'https://www.igdb.com/games/portal-2'
url2 = 'https://store.steampowered.com/app/620/Portal_2/'
url1 = "https://www.igdb.com/games/portal-2"
url2 = "https://store.steampowered.com/app/620/Portal_2/"
p1 = SiteManager.get_site_by_url(url1).get_resource_ready()
p2 = SiteManager.get_site_by_url(url2).get_resource_ready()
self.assertEqual(p1.item.id, p2.item.id)

View file

@ -5,24 +5,24 @@ from catalog.sites import *
class Command(BaseCommand):
help = 'Scrape a catalog item from external resource (and save it)'
help = "Scrape a catalog item from external resource (and save it)"
def add_arguments(self, parser):
parser.add_argument('url', type=str, help='URL to scrape')
parser.add_argument("url", type=str, help="URL to scrape")
parser.add_argument(
'--save',
action='store_true',
help='save to database',
"--save",
action="store_true",
help="save to database",
)
def handle(self, *args, **options):
url = str(options['url'])
url = str(options["url"])
site = SiteManager.get_site_by_url(url)
if site is None:
self.stdout.write(self.style.ERROR(f'Unknown site for {url}'))
self.stdout.write(self.style.ERROR(f"Unknown site for {url}"))
return
self.stdout.write(f'Fetching from {site}')
if options['save']:
self.stdout.write(f"Fetching from {site}")
if options["save"]:
resource = site.get_resource_ready()
pprint.pp(resource.metadata)
pprint.pp(site.get_item())
@ -31,4 +31,4 @@ class Command(BaseCommand):
resource = site.scrape()
pprint.pp(resource.metadata)
pprint.pp(resource.lookup_ids)
self.stdout.write(self.style.SUCCESS(f'Done.'))
self.stdout.write(self.style.SUCCESS(f"Done."))

View file

@ -37,7 +37,9 @@ def all_content_types():
if _CONTENT_TYPE_LIST is None:
_CONTENT_TYPE_LIST = {}
for cls in Item.__subclasses__():
_CONTENT_TYPE_LIST[cls] = ContentType.objects.get(app_label='catalog', model=cls.__name__.lower()).id
_CONTENT_TYPE_LIST[cls] = ContentType.objects.get(
app_label="catalog", model=cls.__name__.lower()
).id
return _CONTENT_TYPE_LIST
@ -46,7 +48,7 @@ def all_categories():
if _CATEGORY_LIST is None:
_CATEGORY_LIST = {}
for cls in Item.__subclasses__():
c = getattr(cls, 'category', None)
c = getattr(cls, "category", None)
if c not in _CATEGORY_LIST:
_CATEGORY_LIST[c] = [cls]
else:

View file

@ -5,43 +5,93 @@ from django.db import models
class Movie(Item):
category = ItemCategory.Movie
url_path = 'movie'
url_path = "movie"
imdb = PrimaryLookupIdDescriptor(IdType.IMDB)
tmdb_movie = PrimaryLookupIdDescriptor(IdType.TMDB_Movie)
douban_movie = PrimaryLookupIdDescriptor(IdType.DoubanMovie)
demonstrative = _('这部电影')
demonstrative = _("这部电影")
METADATA_COPY_LIST = [
'title',
'orig_title',
'other_title',
'director',
'playwright',
'actor',
'genre',
'showtime',
'site',
'area',
'language',
'year',
'duration',
'season_number',
'episodes',
'single_episode_length',
'brief',
"title",
"orig_title",
"other_title",
"director",
"playwright",
"actor",
"genre",
"showtime",
"site",
"area",
"language",
"year",
"duration",
"season_number",
"episodes",
"single_episode_length",
"brief",
]
orig_title = jsondata.CharField(_("original title"), blank=True, default='', max_length=500)
other_title = jsondata.ArrayField(models.CharField(_("other title"), blank=True, default='', max_length=500), null=True, blank=True, default=list, )
director = jsondata.ArrayField(models.CharField(_("director"), blank=True, default='', max_length=200), null=True, blank=True, default=list, )
playwright = jsondata.ArrayField(models.CharField(_("playwright"), blank=True, default='', max_length=200), null=True, blank=True, default=list, )
actor = jsondata.ArrayField(models.CharField(_("actor"), blank=True, default='', max_length=200), null=True, blank=True, default=list, )
genre = jsondata.ArrayField(models.CharField(_("genre"), blank=True, default='', max_length=50), null=True, blank=True, default=list, ) # , choices=MovieGenreEnum.choices
showtime = jsondata.ArrayField(null=True, blank=True, default=list, )
site = jsondata.URLField(_('site url'), blank=True, default='', max_length=200)
area = jsondata.ArrayField(models.CharField(_("country or region"), blank=True, default='', max_length=100, ), null=True, blank=True, default=list, )
language = jsondata.ArrayField(models.CharField(blank=True, default='', max_length=100, ), null=True, blank=True, default=list, )
orig_title = jsondata.CharField(
_("original title"), blank=True, default="", max_length=500
)
other_title = jsondata.ArrayField(
models.CharField(_("other title"), blank=True, default="", max_length=500),
null=True,
blank=True,
default=list,
)
director = jsondata.ArrayField(
models.CharField(_("director"), blank=True, default="", max_length=200),
null=True,
blank=True,
default=list,
)
playwright = jsondata.ArrayField(
models.CharField(_("playwright"), blank=True, default="", max_length=200),
null=True,
blank=True,
default=list,
)
actor = jsondata.ArrayField(
models.CharField(_("actor"), blank=True, default="", max_length=200),
null=True,
blank=True,
default=list,
)
genre = jsondata.ArrayField(
models.CharField(_("genre"), blank=True, default="", max_length=50),
null=True,
blank=True,
default=list,
) # , choices=MovieGenreEnum.choices
showtime = jsondata.ArrayField(
null=True,
blank=True,
default=list,
)
site = jsondata.URLField(_("site url"), blank=True, default="", max_length=200)
area = jsondata.ArrayField(
models.CharField(
_("country or region"),
blank=True,
default="",
max_length=100,
),
null=True,
blank=True,
default=list,
)
language = jsondata.ArrayField(
models.CharField(
blank=True,
default="",
max_length=100,
),
null=True,
blank=True,
default=list,
)
year = jsondata.IntegerField(null=True, blank=True)
season_number = jsondata.IntegerField(null=True, blank=True)
episodes = jsondata.IntegerField(null=True, blank=True)
single_episode_length = jsondata.IntegerField(null=True, blank=True)
duration = jsondata.CharField(blank=True, default='', max_length=200)
duration = jsondata.CharField(blank=True, default="", max_length=200)

View file

@ -4,8 +4,8 @@ from catalog.common import *
class DoubanMovieTestCase(TestCase):
def test_parse(self):
t_id = '3541415'
t_url = 'https://movie.douban.com/subject/3541415/'
t_id = "3541415"
t_url = "https://movie.douban.com/subject/3541415/"
p1 = SiteManager.get_site_by_id_type(IdType.DoubanMovie)
self.assertIsNotNone(p1)
self.assertEqual(p1.validate_url(t_url), True)
@ -15,22 +15,24 @@ class DoubanMovieTestCase(TestCase):
@use_local_response
def test_scrape(self):
t_url = 'https://movie.douban.com/subject/3541415/'
t_url = "https://movie.douban.com/subject/3541415/"
site = SiteManager.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.id_value, '3541415')
self.assertEqual(site.id_value, "3541415")
site.get_resource_ready()
self.assertEqual(site.resource.metadata['title'], '盗梦空间')
self.assertEqual(site.resource.metadata["title"], "盗梦空间")
self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.resource.item.__class__.__name__, 'Movie')
self.assertEqual(site.resource.item.imdb, 'tt1375666')
self.assertEqual(site.resource.item.__class__.__name__, "Movie")
self.assertEqual(site.resource.item.imdb, "tt1375666")
class TMDBMovieTestCase(TestCase):
def test_parse(self):
t_id = '293767'
t_url = 'https://www.themoviedb.org/movie/293767-billy-lynn-s-long-halftime-walk'
t_url2 = 'https://www.themoviedb.org/movie/293767'
t_id = "293767"
t_url = (
"https://www.themoviedb.org/movie/293767-billy-lynn-s-long-halftime-walk"
)
t_url2 = "https://www.themoviedb.org/movie/293767"
p1 = SiteManager.get_site_by_id_type(IdType.TMDB_Movie)
self.assertIsNotNone(p1)
self.assertEqual(p1.validate_url(t_url), True)
@ -41,22 +43,22 @@ class TMDBMovieTestCase(TestCase):
@use_local_response
def test_scrape(self):
t_url = 'https://www.themoviedb.org/movie/293767'
t_url = "https://www.themoviedb.org/movie/293767"
site = SiteManager.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.id_value, '293767')
self.assertEqual(site.id_value, "293767")
site.get_resource_ready()
self.assertEqual(site.resource.metadata['title'], '比利·林恩的中场战事')
self.assertEqual(site.resource.metadata["title"], "比利·林恩的中场战事")
self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.resource.item.__class__.__name__, 'Movie')
self.assertEqual(site.resource.item.imdb, 'tt2513074')
self.assertEqual(site.resource.item.__class__.__name__, "Movie")
self.assertEqual(site.resource.item.imdb, "tt2513074")
class IMDBMovieTestCase(TestCase):
def test_parse(self):
t_id = 'tt1375666'
t_url = 'https://www.imdb.com/title/tt1375666/'
t_url2 = 'https://www.imdb.com/title/tt1375666/'
t_id = "tt1375666"
t_url = "https://www.imdb.com/title/tt1375666/"
t_url2 = "https://www.imdb.com/title/tt1375666/"
p1 = SiteManager.get_site_by_id_type(IdType.IMDB)
self.assertIsNotNone(p1)
self.assertEqual(p1.validate_url(t_url), True)
@ -67,22 +69,22 @@ class IMDBMovieTestCase(TestCase):
@use_local_response
def test_scrape(self):
t_url = 'https://www.imdb.com/title/tt1375666/'
t_url = "https://www.imdb.com/title/tt1375666/"
site = SiteManager.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.id_value, 'tt1375666')
self.assertEqual(site.id_value, "tt1375666")
site.get_resource_ready()
self.assertEqual(site.resource.metadata['title'], '盗梦空间')
self.assertEqual(site.resource.metadata["title"], "盗梦空间")
self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.resource.item.imdb, 'tt1375666')
self.assertEqual(site.resource.item.imdb, "tt1375666")
class MultiMovieSitesTestCase(TestCase):
@use_local_response
def test_movies(self):
url1 = 'https://www.themoviedb.org/movie/27205-inception'
url2 = 'https://movie.douban.com/subject/3541415/'
url3 = 'https://www.imdb.com/title/tt1375666/'
url1 = "https://www.themoviedb.org/movie/27205-inception"
url2 = "https://movie.douban.com/subject/3541415/"
url3 = "https://www.imdb.com/title/tt1375666/"
p1 = SiteManager.get_site_by_url(url1).get_resource_ready()
p2 = SiteManager.get_site_by_url(url2).get_resource_ready()
p3 = SiteManager.get_site_by_url(url3).get_resource_ready()

View file

@ -4,35 +4,47 @@ from django.db import models
class Album(Item):
url_path = 'album'
url_path = "album"
category = ItemCategory.Music
demonstrative = _('这张专辑')
demonstrative = _("这张专辑")
barcode = PrimaryLookupIdDescriptor(IdType.GTIN)
douban_music = PrimaryLookupIdDescriptor(IdType.DoubanMusic)
spotify_album = PrimaryLookupIdDescriptor(IdType.Spotify_Album)
METADATA_COPY_LIST = [
'title',
'other_title',
'album_type',
'media',
'disc_count',
'artist',
'genre',
'release_date',
'duration',
'company',
'track_list',
'brief',
'bandcamp_album_id',
"title",
"other_title",
"album_type",
"media",
"disc_count",
"artist",
"genre",
"release_date",
"duration",
"company",
"track_list",
"brief",
"bandcamp_album_id",
]
release_date = jsondata.DateField(_('发行日期'), auto_now=False, auto_now_add=False, null=True, blank=True)
release_date = jsondata.DateField(
_("发行日期"), auto_now=False, auto_now_add=False, null=True, blank=True
)
duration = jsondata.IntegerField(_("时长"), null=True, blank=True)
artist = jsondata.ArrayField(models.CharField(_("artist"), blank=True, default='', max_length=200), null=True, blank=True, default=list)
genre = jsondata.CharField(_("流派"), blank=True, default='', max_length=100)
company = jsondata.ArrayField(models.CharField(blank=True, default='', max_length=500), null=True, blank=True, default=list)
artist = jsondata.ArrayField(
models.CharField(_("artist"), blank=True, default="", max_length=200),
null=True,
blank=True,
default=list,
)
genre = jsondata.CharField(_("流派"), blank=True, default="", max_length=100)
company = jsondata.ArrayField(
models.CharField(blank=True, default="", max_length=500),
null=True,
blank=True,
default=list,
)
track_list = jsondata.TextField(_("曲目"), blank=True, default="")
other_title = jsondata.CharField(blank=True, default='', max_length=500)
album_type = jsondata.CharField(blank=True, default='', max_length=500)
media = jsondata.CharField(blank=True, default='', max_length=500)
bandcamp_album_id = jsondata.CharField(blank=True, default='', max_length=500)
disc_count = jsondata.IntegerField(blank=True, default='', max_length=500)
other_title = jsondata.CharField(blank=True, default="", max_length=500)
album_type = jsondata.CharField(blank=True, default="", max_length=500)
media = jsondata.CharField(blank=True, default="", max_length=500)
bandcamp_album_id = jsondata.CharField(blank=True, default="", max_length=500)
disc_count = jsondata.IntegerField(blank=True, default="", max_length=500)

View file

@ -6,8 +6,8 @@ from catalog.models import *
class SpotifyTestCase(TestCase):
def test_parse(self):
t_id_type = IdType.Spotify_Album
t_id_value = '65KwtzkJXw7oT819NFWmEP'
t_url = 'https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP'
t_id_value = "65KwtzkJXw7oT819NFWmEP"
t_url = "https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP"
site = SiteManager.get_site_by_id_type(t_id_type)
self.assertIsNotNone(site)
self.assertEqual(site.validate_url(t_url), True)
@ -17,21 +17,21 @@ class SpotifyTestCase(TestCase):
@use_local_response
def test_scrape(self):
t_url = 'https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP'
t_url = "https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP"
site = SiteManager.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata['title'], 'The Race For Space')
self.assertEqual(site.resource.metadata["title"], "The Race For Space")
self.assertIsInstance(site.resource.item, Album)
self.assertEqual(site.resource.item.barcode, '3610159662676')
self.assertEqual(site.resource.item.barcode, "3610159662676")
class DoubanMusicTestCase(TestCase):
def test_parse(self):
t_id_type = IdType.DoubanMusic
t_id_value = '33551231'
t_url = 'https://music.douban.com/subject/33551231/'
t_id_value = "33551231"
t_url = "https://music.douban.com/subject/33551231/"
site = SiteManager.get_site_by_id_type(t_id_type)
self.assertIsNotNone(site)
self.assertEqual(site.validate_url(t_url), True)
@ -41,21 +41,21 @@ class DoubanMusicTestCase(TestCase):
@use_local_response
def test_scrape(self):
t_url = 'https://music.douban.com/subject/33551231/'
t_url = "https://music.douban.com/subject/33551231/"
site = SiteManager.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata['title'], 'The Race For Space')
self.assertEqual(site.resource.metadata["title"], "The Race For Space")
self.assertIsInstance(site.resource.item, Album)
self.assertEqual(site.resource.item.barcode, '3610159662676')
self.assertEqual(site.resource.item.barcode, "3610159662676")
class MultiMusicSitesTestCase(TestCase):
@use_local_response
def test_albums(self):
url1 = 'https://music.douban.com/subject/33551231/'
url2 = 'https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP'
url1 = "https://music.douban.com/subject/33551231/"
url2 = "https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP"
p1 = SiteManager.get_site_by_url(url1).get_resource_ready()
p2 = SiteManager.get_site_by_url(url2).get_resource_ready()
self.assertEqual(p1.item.id, p2.item.id)
@ -64,9 +64,9 @@ class MultiMusicSitesTestCase(TestCase):
class BandcampTestCase(TestCase):
def test_parse(self):
t_id_type = IdType.Bandcamp
t_id_value = 'intlanthem.bandcamp.com/album/in-these-times'
t_url = 'https://intlanthem.bandcamp.com/album/in-these-times?from=hpbcw'
t_url2 = 'https://intlanthem.bandcamp.com/album/in-these-times'
t_id_value = "intlanthem.bandcamp.com/album/in-these-times"
t_url = "https://intlanthem.bandcamp.com/album/in-these-times?from=hpbcw"
t_url2 = "https://intlanthem.bandcamp.com/album/in-these-times"
site = SiteManager.get_site_by_id_type(t_id_type)
self.assertIsNotNone(site)
self.assertEqual(site.validate_url(t_url), True)
@ -76,11 +76,11 @@ class BandcampTestCase(TestCase):
@use_local_response
def test_scrape(self):
t_url = 'https://intlanthem.bandcamp.com/album/in-these-times?from=hpbcw'
t_url = "https://intlanthem.bandcamp.com/album/in-these-times?from=hpbcw"
site = SiteManager.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata['title'], 'In These Times')
self.assertEqual(site.resource.metadata['artist'], ['Makaya McCraven'])
self.assertEqual(site.resource.metadata["title"], "In These Times")
self.assertEqual(site.resource.metadata["artist"], ["Makaya McCraven"])
self.assertIsInstance(site.resource.item, Album)

View file

@ -4,12 +4,12 @@ from django.utils.translation import gettext_lazy as _
class Performance(Item):
category = ItemCategory.Performance
url_path = 'performance'
url_path = "performance"
douban_drama = LookupIdDescriptor(IdType.DoubanDrama)
versions = jsondata.ArrayField(_('版本'), null=False, blank=False, default=list)
directors = jsondata.ArrayField(_('导演'), null=False, blank=False, default=list)
playwrights = jsondata.ArrayField(_('编剧'), null=False, blank=False, default=list)
actors = jsondata.ArrayField(_('主演'), null=False, blank=False, default=list)
versions = jsondata.ArrayField(_("版本"), null=False, blank=False, default=list)
directors = jsondata.ArrayField(_("导演"), null=False, blank=False, default=list)
playwrights = jsondata.ArrayField(_("编剧"), null=False, blank=False, default=list)
actors = jsondata.ArrayField(_("主演"), null=False, blank=False, default=list)
class Meta:
proxy = True

View file

@ -7,8 +7,8 @@ class DoubanDramaTestCase(TestCase):
pass
def test_parse(self):
t_id = '24849279'
t_url = 'https://www.douban.com/location/drama/24849279/'
t_id = "24849279"
t_url = "https://www.douban.com/location/drama/24849279/"
p1 = SiteManager.get_site_by_id_type(IdType.DoubanDrama)
self.assertIsNotNone(p1)
p1 = SiteManager.get_site_by_url(t_url)
@ -19,14 +19,14 @@ class DoubanDramaTestCase(TestCase):
@use_local_response
def test_scrape(self):
t_url = 'https://www.douban.com/location/drama/24849279/'
t_url = "https://www.douban.com/location/drama/24849279/"
site = SiteManager.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
resource = site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(resource.metadata['title'], '红花侠')
self.assertEqual(resource.metadata["title"], "红花侠")
item = site.get_item()
self.assertEqual(item.title, '红花侠')
self.assertEqual(item.title, "红花侠")
# self.assertEqual(i.other_titles, ['スカーレットピンパーネル', 'THE SCARLET PIMPERNEL'])
# self.assertEqual(len(i.brief), 545)

View file

@ -3,7 +3,7 @@ from catalog.common import *
class Podcast(Item):
category = ItemCategory.Podcast
url_path = 'podcast'
url_path = "podcast"
feed_url = PrimaryLookupIdDescriptor(IdType.Feed)
apple_podcast = PrimaryLookupIdDescriptor(IdType.ApplePodcast)
# ximalaya = LookupIdDescriptor(IdType.Ximalaya)

View file

@ -8,9 +8,9 @@ class ApplePodcastTestCase(TestCase):
pass
def test_parse(self):
t_id = '657765158'
t_url = 'https://podcasts.apple.com/us/podcast/%E5%A4%A7%E5%86%85%E5%AF%86%E8%B0%88/id657765158'
t_url2 = 'https://podcasts.apple.com/us/podcast/id657765158'
t_id = "657765158"
t_url = "https://podcasts.apple.com/us/podcast/%E5%A4%A7%E5%86%85%E5%AF%86%E8%B0%88/id657765158"
t_url2 = "https://podcasts.apple.com/us/podcast/id657765158"
p1 = SiteManager.get_site_by_id_type(IdType.ApplePodcast)
self.assertIsNotNone(p1)
self.assertEqual(p1.validate_url(t_url), True)
@@ -20,11 +20,14 @@ class ApplePodcastTestCase(TestCase):
@use_local_response
def test_scrape(self):
t_url = 'https://podcasts.apple.com/gb/podcast/the-new-yorker-radio-hour/id1050430296'
t_url = "https://podcasts.apple.com/gb/podcast/the-new-yorker-radio-hour/id1050430296"
site = SiteManager.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.id_value, '1050430296')
self.assertEqual(site.id_value, "1050430296")
site.get_resource_ready()
self.assertEqual(site.resource.metadata['title'], 'The New Yorker Radio Hour')
self.assertEqual(site.resource.metadata["title"], "The New Yorker Radio Hour")
# self.assertEqual(site.resource.metadata['feed_url'], 'http://feeds.wnyc.org/newyorkerradiohour')
self.assertEqual(site.resource.metadata['feed_url'], 'http://feeds.feedburner.com/newyorkerradiohour')
self.assertEqual(
site.resource.metadata["feed_url"],
"http://feeds.feedburner.com/newyorkerradiohour",
)

View file

@@ -11,7 +11,7 @@ class ApplePodcast(AbstractSite):
SITE_NAME = SiteName.ApplePodcast
ID_TYPE = IdType.ApplePodcast
URL_PATTERNS = [r"https://[^.]+.apple.com/\w+/podcast/*[^/?]*/id(\d+)"]
WIKI_PROPERTY_ID = 'P5842'
WIKI_PROPERTY_ID = "P5842"
DEFAULT_MODEL = Podcast
@classmethod
@@ -19,23 +19,27 @@ class ApplePodcast(AbstractSite):
return "https://podcasts.apple.com/us/podcast/id" + id_value
def scrape(self):
api_url = f'https://itunes.apple.com/lookup?id={self.id_value}'
api_url = f"https://itunes.apple.com/lookup?id={self.id_value}"
dl = BasicDownloader(api_url)
resp = dl.download()
r = resp.json()['results'][0]
pd = ResourceContent(metadata={
'title': r['trackName'],
'feed_url': r['feedUrl'],
'hosts': [r['artistName']],
'genres': r['genres'],
'cover_image_url': r['artworkUrl600'],
})
pd.lookup_ids[IdType.Feed] = pd.metadata.get('feed_url')
r = resp.json()["results"][0]
pd = ResourceContent(
metadata={
"title": r["trackName"],
"feed_url": r["feedUrl"],
"hosts": [r["artistName"]],
"genres": r["genres"],
"cover_image_url": r["artworkUrl600"],
}
)
pd.lookup_ids[IdType.Feed] = pd.metadata.get("feed_url")
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd
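For reference, the iTunes lookup call wrapped above can be exercised on its own. A minimal sketch with requests, using only the endpoint and response keys that appear in the scraper above (the podcast id is the example value from the test case, and error handling is omitted):

import requests

podcast_id = "1050430296"  # example id, as used in the test above
resp = requests.get(f"https://itunes.apple.com/lookup?id={podcast_id}", timeout=10)
r = resp.json()["results"][0]
# these are the keys the scraper maps into ResourceContent metadata
print(r["trackName"], r["artistName"], r["feedUrl"], r["genres"], r["artworkUrl600"])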

View file

@@ -14,11 +14,9 @@ _logger = logging.getLogger(__name__)
class Bandcamp(AbstractSite):
SITE_NAME = SiteName.Bandcamp
ID_TYPE = IdType.Bandcamp
URL_PATTERNS = [
r"https://([a-z0-9\-]+.bandcamp.com/album/[^?#/]+)"
]
URL_PATTERNS = [r"https://([a-z0-9\-]+.bandcamp.com/album/[^?#/]+)"]
URL_PATTERN_FALLBACK = r"https://([a-z0-9\-\.]+/album/[^?#/]+)"
WIKI_PROPERTY_ID = ''
WIKI_PROPERTY_ID = ""
DEFAULT_MODEL = Album
@classmethod
@@ -32,16 +30,16 @@ class Bandcamp(AbstractSite):
parsed_url = urllib.parse.urlparse(url)
hostname = parsed_url.netloc
try:
answers = dns.resolver.query(hostname, 'CNAME')
answers = dns.resolver.query(hostname, "CNAME")
for rdata in answers:
if str(rdata.target) == 'dom.bandcamp.com.':
if str(rdata.target) == "dom.bandcamp.com.":
return True
except Exception:
pass
try:
answers = dns.resolver.query(hostname, 'A')
answers = dns.resolver.query(hostname, "A")
for rdata in answers:
if str(rdata.address) == '35.241.62.186':
if str(rdata.address) == "35.241.62.186":
return True
except Exception:
pass
@@ -50,34 +48,45 @@ class Bandcamp(AbstractSite):
content = BasicDownloader(self.url).download().html()
try:
title = content.xpath("//h2[@class='trackTitle']/text()")[0].strip()
artist = [content.xpath("//div[@id='name-section']/h3/span/a/text()")[0].strip()]
artist = [
content.xpath("//div[@id='name-section']/h3/span/a/text()")[0].strip()
]
except IndexError:
raise ValueError("given url contains no valid info")
genre = [] # TODO: parse tags
track_list = []
release_nodes = content.xpath("//div[@class='tralbumData tralbum-credits']/text()")
release_date = dateparser.parse(re.sub(r'releas\w+ ', '', release_nodes[0].strip())).strftime('%Y-%m-%d') if release_nodes else None
release_nodes = content.xpath(
"//div[@class='tralbumData tralbum-credits']/text()"
)
release_date = (
dateparser.parse(
re.sub(r"releas\w+ ", "", release_nodes[0].strip())
).strftime("%Y-%m-%d")
if release_nodes
else None
)
duration = None
company = None
brief_nodes = content.xpath("//div[@class='tralbumData tralbum-about']/text()")
brief = "".join(brief_nodes) if brief_nodes else None
cover_url = content.xpath("//div[@id='tralbumArt']/a/@href")[0].strip()
bandcamp_page_data = json.loads(content.xpath(
"//meta[@name='bc-page-properties']/@content")[0].strip())
bandcamp_album_id = bandcamp_page_data['item_id']
bandcamp_page_data = json.loads(
content.xpath("//meta[@name='bc-page-properties']/@content")[0].strip()
)
bandcamp_album_id = bandcamp_page_data["item_id"]
data = {
'title': title,
'artist': artist,
'genre': genre,
'track_list': track_list,
'release_date': release_date,
'duration': duration,
'company': company,
'brief': brief,
'bandcamp_album_id': bandcamp_album_id,
'cover_image_url': cover_url,
"title": title,
"artist": artist,
"genre": genre,
"track_list": track_list,
"release_date": release_date,
"duration": duration,
"company": company,
"brief": brief,
"bandcamp_album_id": bandcamp_album_id,
"cover_image_url": cover_url,
}
pd = ResourceContent(metadata=data)
if data["cover_image_url"]:
@@ -86,5 +95,7 @@ class Bandcamp(AbstractSite):
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {data["cover_image_url"]}')
_logger.debug(
f'failed to download cover for {self.url} from {data["cover_image_url"]}'
)
return pd
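The custom-domain fallback above decides whether an arbitrary hostname is actually served by Bandcamp by looking at DNS. A rough standalone sketch of the same checks with dnspython, mirroring the CNAME and A lookups in the hunk above (the hostname passed in at the end is a hypothetical example; newer dnspython versions prefer resolver.resolve over the resolver.query call used here and above):

import dns.resolver

def looks_like_bandcamp(hostname: str) -> bool:
    # artist-owned domains either CNAME to dom.bandcamp.com. or resolve to Bandcamp's IP
    try:
        for rdata in dns.resolver.query(hostname, "CNAME"):
            if str(rdata.target) == "dom.bandcamp.com.":
                return True
    except Exception:
        pass
    try:
        for rdata in dns.resolver.query(hostname, "A"):
            if str(rdata.address) == "35.241.62.186":
                return True
    except Exception:
        pass
    return False

print(looks_like_bandcamp("some-artist-store.example.com"))  # hypothetical domain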

View file

@@ -13,7 +13,7 @@ class Bangumi(AbstractSite):
URL_PATTERNS = [
r"https://bgm\.tv/subject/(\d+)",
]
WIKI_PROPERTY_ID = ''
WIKI_PROPERTY_ID = ""
DEFAULT_MODEL = None
@classmethod

View file

@@ -13,14 +13,17 @@ class DoubanDownloader(ProxiedDownloader):
elif response.status_code == 204:
return RESPONSE_CENSORSHIP
elif response.status_code == 200:
content = response.content.decode('utf-8')
if content.find('关于豆瓣') == -1:
content = response.content.decode("utf-8")
if content.find("关于豆瓣") == -1:
# if content.find('你的 IP 发出') == -1:
# error = error + 'Content not authentic' # response is garbage
# else:
# error = error + 'IP banned'
return RESPONSE_NETWORK_ERROR
elif content.find('<title>页面不存在</title>') != -1 or content.find('呃... 你想访问的条目豆瓣不收录。') != -1: # re.search('不存在[^<]+</title>', content, re.MULTILINE):
elif (
content.find("<title>页面不存在</title>") != -1
or content.find("呃... 你想访问的条目豆瓣不收录。") != -1
): # re.search('不存在[^<]+</title>', content, re.MULTILINE):
return RESPONSE_CENSORSHIP
else:
return RESPONSE_OK

View file

@@ -12,8 +12,11 @@ _logger = logging.getLogger(__name__)
class DoubanBook(AbstractSite):
SITE_NAME = SiteName.Douban
ID_TYPE = IdType.DoubanBook
URL_PATTERNS = [r"\w+://book\.douban\.com/subject/(\d+)/{0,1}", r"\w+://m.douban.com/book/subject/(\d+)/{0,1}"]
WIKI_PROPERTY_ID = '?'
URL_PATTERNS = [
r"\w+://book\.douban\.com/subject/(\d+)/{0,1}",
r"\w+://m.douban.com/book/subject/(\d+)/{0,1}",
]
WIKI_PROPERTY_ID = "?"
DEFAULT_MODEL = Edition
@classmethod
@@ -23,31 +26,40 @@ class DoubanBook(AbstractSite):
def scrape(self):
content = DoubanDownloader(self.url).download().html()
isbn_elem = content.xpath("//div[@id='info']//span[text()='ISBN:']/following::text()")
isbn_elem = content.xpath(
"//div[@id='info']//span[text()='ISBN:']/following::text()"
)
isbn = isbn_elem[0].strip() if isbn_elem else None
title_elem = content.xpath("/html/body//h1/span/text()")
title = title_elem[0].strip() if title_elem else f"Unknown Title {self.id_value}"
title = (
title_elem[0].strip() if title_elem else f"Unknown Title {self.id_value}"
)
subtitle_elem = content.xpath(
"//div[@id='info']//span[text()='副标题:']/following::text()")
"//div[@id='info']//span[text()='副标题:']/following::text()"
)
subtitle = subtitle_elem[0].strip()[:500] if subtitle_elem else None
orig_title_elem = content.xpath(
"//div[@id='info']//span[text()='原作名:']/following::text()")
"//div[@id='info']//span[text()='原作名:']/following::text()"
)
orig_title = orig_title_elem[0].strip()[:500] if orig_title_elem else None
language_elem = content.xpath(
"//div[@id='info']//span[text()='语言:']/following::text()")
"//div[@id='info']//span[text()='语言:']/following::text()"
)
language = language_elem[0].strip() if language_elem else None
pub_house_elem = content.xpath(
"//div[@id='info']//span[text()='出版社:']/following::text()")
"//div[@id='info']//span[text()='出版社:']/following::text()"
)
pub_house = pub_house_elem[0].strip() if pub_house_elem else None
pub_date_elem = content.xpath(
"//div[@id='info']//span[text()='出版年:']/following::text()")
pub_date = pub_date_elem[0].strip() if pub_date_elem else ''
"//div[@id='info']//span[text()='出版年:']/following::text()"
)
pub_date = pub_date_elem[0].strip() if pub_date_elem else ""
year_month_day = RE_NUMBERS.findall(pub_date)
if len(year_month_day) in (2, 3):
pub_year = int(year_month_day[0])
@@ -60,45 +72,62 @@ class DoubanBook(AbstractSite):
pub_month = None
if pub_year and pub_month and pub_year < pub_month:
pub_year, pub_month = pub_month, pub_year
pub_year = None if pub_year is not None and pub_year not in range(
0, 3000) else pub_year
pub_month = None if pub_month is not None and pub_month not in range(
1, 12) else pub_month
pub_year = (
None
if pub_year is not None and pub_year not in range(0, 3000)
else pub_year
)
pub_month = (
None
if pub_month is not None and pub_month not in range(1, 12)
else pub_month
)
binding_elem = content.xpath(
"//div[@id='info']//span[text()='装帧:']/following::text()")
"//div[@id='info']//span[text()='装帧:']/following::text()"
)
binding = binding_elem[0].strip() if binding_elem else None
price_elem = content.xpath(
"//div[@id='info']//span[text()='定价:']/following::text()")
"//div[@id='info']//span[text()='定价:']/following::text()"
)
price = price_elem[0].strip() if price_elem else None
pages_elem = content.xpath(
"//div[@id='info']//span[text()='页数:']/following::text()")
"//div[@id='info']//span[text()='页数:']/following::text()"
)
pages = pages_elem[0].strip() if pages_elem else None
if pages is not None:
pages = int(RE_NUMBERS.findall(pages)[
0]) if RE_NUMBERS.findall(pages) else None
pages = (
int(RE_NUMBERS.findall(pages)[0]) if RE_NUMBERS.findall(pages) else None
)
if pages and (pages > 999999 or pages < 1):
pages = None
brief_elem = content.xpath(
"//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()")
brief = '\n'.join(p.strip()
for p in brief_elem) if brief_elem else None
"//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()"
)
brief = "\n".join(p.strip() for p in brief_elem) if brief_elem else None
contents = None
try:
contents_elem = content.xpath(
"//h2/span[text()='目录']/../following-sibling::div[1]")[0]
"//h2/span[text()='目录']/../following-sibling::div[1]"
)[0]
# if the id of the next sibling contains `dir`, that is the full table of contents
if "dir" in contents_elem.getnext().xpath("@id")[0]:
contents_elem = contents_elem.getnext()
contents = '\n'.join(p.strip() for p in contents_elem.xpath(
"text()")[:-2]) if contents_elem is not None else None
contents = (
"\n".join(p.strip() for p in contents_elem.xpath("text()")[:-2])
if contents_elem is not None
else None
)
else:
contents = '\n'.join(p.strip() for p in contents_elem.xpath(
"text()")) if contents_elem is not None else None
contents = (
"\n".join(p.strip() for p in contents_elem.xpath("text()"))
if contents_elem is not None
else None
)
except Exception:
pass
@@ -106,82 +135,97 @@ class DoubanBook(AbstractSite):
img_url = img_url_elem[0].strip() if img_url_elem else None
# there are two html formats for authors and translators
authors_elem = content.xpath("""//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/
preceding-sibling::a[preceding-sibling::span[text()='作者:']]/text()""")
authors_elem = content.xpath(
"""//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/
preceding-sibling::a[preceding-sibling::span[text()='作者:']]/text()"""
)
if not authors_elem:
authors_elem = content.xpath(
"""//div[@id='info']//span[text()=' 作者']/following-sibling::a/text()""")
"""//div[@id='info']//span[text()=' 作者']/following-sibling::a/text()"""
)
if authors_elem:
authors = []
for author in authors_elem:
authors.append(RE_WHITESPACES.sub(' ', author.strip())[:200])
authors.append(RE_WHITESPACES.sub(" ", author.strip())[:200])
else:
authors = None
translators_elem = content.xpath("""//div[@id='info']//span[text()='译者:']/following-sibling::br[1]/
preceding-sibling::a[preceding-sibling::span[text()='译者:']]/text()""")
translators_elem = content.xpath(
"""//div[@id='info']//span[text()='译者:']/following-sibling::br[1]/
preceding-sibling::a[preceding-sibling::span[text()='译者:']]/text()"""
)
if not translators_elem:
translators_elem = content.xpath(
"""//div[@id='info']//span[text()=' 译者']/following-sibling::a/text()""")
"""//div[@id='info']//span[text()=' 译者']/following-sibling::a/text()"""
)
if translators_elem:
translators = []
for translator in translators_elem:
translators.append(RE_WHITESPACES.sub(' ', translator.strip()))
translators.append(RE_WHITESPACES.sub(" ", translator.strip()))
else:
translators = None
cncode_elem = content.xpath(
"//div[@id='info']//span[text()='统一书号:']/following::text()")
"//div[@id='info']//span[text()='统一书号:']/following::text()"
)
cubn = cncode_elem[0].strip() if cncode_elem else None
series_elem = content.xpath(
"//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()")
"//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()"
)
series = series_elem[0].strip() if series_elem else None
imprint_elem = content.xpath(
"//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()")
"//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()"
)
imprint = imprint_elem[0].strip() if imprint_elem else None
data = {
'title': title,
'subtitle': subtitle,
'orig_title': orig_title,
'author': authors,
'translator': translators,
'language': language,
'pub_house': pub_house,
'pub_year': pub_year,
'pub_month': pub_month,
'binding': binding,
'price': price,
'pages': pages,
'isbn': isbn,
'cubn': cubn,
'brief': brief,
'contents': contents,
'series': series,
'imprint': imprint,
'cover_image_url': img_url,
"title": title,
"subtitle": subtitle,
"orig_title": orig_title,
"author": authors,
"translator": translators,
"language": language,
"pub_house": pub_house,
"pub_year": pub_year,
"pub_month": pub_month,
"binding": binding,
"price": price,
"pages": pages,
"isbn": isbn,
"cubn": cubn,
"brief": brief,
"contents": contents,
"series": series,
"imprint": imprint,
"cover_image_url": img_url,
}
works_element = content.xpath('//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href')
works_element = content.xpath(
'//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href'
)
if works_element:
r = re.match(r'\w+://book.douban.com/works/(\d+)', works_element[0])
data['required_resources'] = [{
'model': 'Work',
'id_type': IdType.DoubanBook_Work,
'id_value': r[1] if r else None,
'title': data['title'],
'url': works_element[0],
'content': {'metadata': {'title': data['title']}}
}]
r = re.match(r"\w+://book.douban.com/works/(\d+)", works_element[0])
data["required_resources"] = [
{
"model": "Work",
"id_type": IdType.DoubanBook_Work,
"id_value": r[1] if r else None,
"title": data["title"],
"url": works_element[0],
"content": {"metadata": {"title": data["title"]}},
}
]
pd = ResourceContent(metadata=data)
t, n = detect_isbn_asin(isbn)
if t:
pd.lookup_ids[t] = n
pd.lookup_ids[IdType.CUBN] = cubn
pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(img_url, self.url)
pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(
img_url, self.url
)
return pd
@@ -189,7 +233,7 @@ class DoubanBook(AbstractSite):
class DoubanBook_Work(AbstractSite):
ID_TYPE = IdType.DoubanBook_Work
URL_PATTERNS = [r"\w+://book\.douban\.com/works/(\d+)"]
WIKI_PROPERTY_ID = '?'
WIKI_PROPERTY_ID = "?"
DEFAULT_MODEL = Work
@classmethod
@@ -199,10 +243,12 @@ class DoubanBook_Work(AbstractSite):
def scrape(self):
content = DoubanDownloader(self.url).download().html()
title_elem = content.xpath("//h1/text()")
title = title_elem[0].split('全部版本(')[0].strip() if title_elem else None
title = title_elem[0].split("全部版本(")[0].strip() if title_elem else None
if not title:
raise ParseError(self, 'title')
pd = ResourceContent(metadata={
'title': title,
})
raise ParseError(self, "title")
pd = ResourceContent(
metadata={
"title": title,
}
)
return pd

View file

@@ -12,7 +12,7 @@ class DoubanDrama(AbstractSite):
SITE_NAME = SiteName.Douban
ID_TYPE = IdType.DoubanDrama
URL_PATTERNS = [r"\w+://www.douban.com/location/drama/(\d+)/"]
WIKI_PROPERTY_ID = 'P6443'
WIKI_PROPERTY_ID = "P6443"
DEFAULT_MODEL = Performance
@classmethod
@@ -29,24 +29,51 @@ class DoubanDrama(AbstractSite):
else:
raise ParseError(self, "title")
data['other_titles'] = [s.strip() for s in title_elem[1:]]
other_title_elem = h.xpath("//dl//dt[text()='又名:']/following::dd[@itemprop='name']/text()")
data["other_titles"] = [s.strip() for s in title_elem[1:]]
other_title_elem = h.xpath(
"//dl//dt[text()='又名:']/following::dd[@itemprop='name']/text()"
)
if len(other_title_elem) > 0:
data['other_titles'].append(other_title_elem[0].strip())
data["other_titles"].append(other_title_elem[0].strip())
plot_elem = h.xpath("//div[@id='link-report']/text()")
if len(plot_elem) == 0:
plot_elem = h.xpath("//div[@class='abstract']/text()")
data['brief'] = '\n'.join(plot_elem) if len(plot_elem) > 0 else ''
data["brief"] = "\n".join(plot_elem) if len(plot_elem) > 0 else ""
data['genres'] = [s.strip() for s in h.xpath("//dl//dt[text()='类型:']/following-sibling::dd[@itemprop='genre']/text()")]
data['versions'] = [s.strip() for s in h.xpath("//dl//dt[text()='版本:']/following-sibling::dd[@class='titles']/a//text()")]
data['directors'] = [s.strip() for s in h.xpath("//div[@class='meta']/dl//dt[text()='导演:']/following-sibling::dd/a[@itemprop='director']//text()")]
data['playwrights'] = [s.strip() for s in h.xpath("//div[@class='meta']/dl//dt[text()='编剧:']/following-sibling::dd/a[@itemprop='author']//text()")]
data['actors'] = [s.strip() for s in h.xpath("//div[@class='meta']/dl//dt[text()='主演:']/following-sibling::dd/a[@itemprop='actor']//text()")]
data["genres"] = [
s.strip()
for s in h.xpath(
"//dl//dt[text()='类型:']/following-sibling::dd[@itemprop='genre']/text()"
)
]
data["versions"] = [
s.strip()
for s in h.xpath(
"//dl//dt[text()='版本:']/following-sibling::dd[@class='titles']/a//text()"
)
]
data["directors"] = [
s.strip()
for s in h.xpath(
"//div[@class='meta']/dl//dt[text()='导演:']/following-sibling::dd/a[@itemprop='director']//text()"
)
]
data["playwrights"] = [
s.strip()
for s in h.xpath(
"//div[@class='meta']/dl//dt[text()='编剧:']/following-sibling::dd/a[@itemprop='author']//text()"
)
]
data["actors"] = [
s.strip()
for s in h.xpath(
"//div[@class='meta']/dl//dt[text()='主演:']/following-sibling::dd/a[@itemprop='actor']//text()"
)
]
img_url_elem = h.xpath("//img[@itemprop='image']/@src")
data['cover_image_url'] = img_url_elem[0].strip() if img_url_elem else None
data["cover_image_url"] = img_url_elem[0].strip() if img_url_elem else None
pd = ResourceContent(metadata=data)
if pd.metadata["cover_image_url"]:
@@ -55,5 +82,7 @@ class DoubanDrama(AbstractSite):
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd

View file

@@ -12,8 +12,11 @@ _logger = logging.getLogger(__name__)
class DoubanGame(AbstractSite):
SITE_NAME = SiteName.Douban
ID_TYPE = IdType.DoubanGame
URL_PATTERNS = [r"\w+://www\.douban\.com/game/(\d+)/{0,1}", r"\w+://m.douban.com/game/subject/(\d+)/{0,1}"]
WIKI_PROPERTY_ID = ''
URL_PATTERNS = [
r"\w+://www\.douban\.com/game/(\d+)/{0,1}",
r"\w+://m.douban.com/game/subject/(\d+)/{0,1}",
]
WIKI_PROPERTY_ID = ""
DEFAULT_MODEL = Game
@classmethod
@@ -29,49 +32,69 @@ class DoubanGame(AbstractSite):
raise ParseError(self, "title")
other_title_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='别名:']/following-sibling::dd[1]/text()")
other_title = other_title_elem[0].strip().split(' / ') if other_title_elem else None
"//dl[@class='game-attr']//dt[text()='别名:']/following-sibling::dd[1]/text()"
)
other_title = (
other_title_elem[0].strip().split(" / ") if other_title_elem else None
)
developer_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='开发商:']/following-sibling::dd[1]/text()")
developer = developer_elem[0].strip().split(' / ') if developer_elem else None
"//dl[@class='game-attr']//dt[text()='开发商:']/following-sibling::dd[1]/text()"
)
developer = developer_elem[0].strip().split(" / ") if developer_elem else None
publisher_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='发行商:']/following-sibling::dd[1]/text()")
publisher = publisher_elem[0].strip().split(' / ') if publisher_elem else None
"//dl[@class='game-attr']//dt[text()='发行商:']/following-sibling::dd[1]/text()"
)
publisher = publisher_elem[0].strip().split(" / ") if publisher_elem else None
platform_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='平台:']/following-sibling::dd[1]/a/text()")
"//dl[@class='game-attr']//dt[text()='平台:']/following-sibling::dd[1]/a/text()"
)
platform = platform_elem if platform_elem else None
genre_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='类型:']/following-sibling::dd[1]/a/text()")
"//dl[@class='game-attr']//dt[text()='类型:']/following-sibling::dd[1]/a/text()"
)
genre = None
if genre_elem:
genre = [g for g in genre_elem if g != '游戏']
genre = [g for g in genre_elem if g != "游戏"]
date_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='发行日期:']/following-sibling::dd[1]/text()")
release_date = dateparser.parse(date_elem[0].strip()).strftime('%Y-%m-%d') if date_elem else None
"//dl[@class='game-attr']//dt[text()='发行日期:']/following-sibling::dd[1]/text()"
)
release_date = (
dateparser.parse(date_elem[0].strip()).strftime("%Y-%m-%d")
if date_elem
else None
)
brief_elem = content.xpath("//div[@class='mod item-desc']/p/text()")
brief = '\n'.join(brief_elem) if brief_elem else None
brief = "\n".join(brief_elem) if brief_elem else None
img_url_elem = content.xpath(
"//div[@class='item-subject-info']/div[@class='pic']//img/@src")
"//div[@class='item-subject-info']/div[@class='pic']//img/@src"
)
img_url = img_url_elem[0].strip() if img_url_elem else None
pd = ResourceContent(metadata={
'title': title,
'other_title': other_title,
'developer': developer,
'publisher': publisher,
'release_date': release_date,
'genre': genre,
'platform': platform,
'brief': brief,
'cover_image_url': img_url
})
pd = ResourceContent(
metadata={
"title": title,
"other_title": other_title,
"developer": developer,
"publisher": publisher,
"release_date": release_date,
"genre": genre,
"platform": platform,
"brief": brief,
"cover_image_url": img_url,
}
)
if pd.metadata["cover_image_url"]:
pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(pd.metadata['cover_image_url'], self.url)
(
pd.cover_image,
pd.cover_image_extention,
) = BasicImageDownloader.download_image(
pd.metadata["cover_image_url"], self.url
)
return pd

View file

@@ -15,8 +15,11 @@ _logger = logging.getLogger(__name__)
class DoubanMovie(AbstractSite):
SITE_NAME = SiteName.Douban
ID_TYPE = IdType.DoubanMovie
URL_PATTERNS = [r"\w+://movie\.douban\.com/subject/(\d+)/{0,1}", r"\w+://m.douban.com/movie/subject/(\d+)/{0,1}"]
WIKI_PROPERTY_ID = '?'
URL_PATTERNS = [
r"\w+://movie\.douban\.com/subject/(\d+)/{0,1}",
r"\w+://m.douban.com/movie/subject/(\d+)/{0,1}",
]
WIKI_PROPERTY_ID = "?"
# no DEFAULT_MODEL as it may be either a TV season or a movie
@classmethod
@@ -27,16 +30,16 @@ class DoubanMovie(AbstractSite):
content = DoubanDownloader(self.url).download().html()
try:
raw_title = content.xpath(
"//span[@property='v:itemreviewed']/text()")[0].strip()
raw_title = content.xpath("//span[@property='v:itemreviewed']/text()")[
0
].strip()
except IndexError:
raise ParseError(self, 'title')
raise ParseError(self, "title")
orig_title = content.xpath(
"//img[@rel='v:image']/@alt")[0].strip()
orig_title = content.xpath("//img[@rel='v:image']/@alt")[0].strip()
title = raw_title.split(orig_title)[0].strip()
# if has no chinese title
if title == '':
if title == "":
title = orig_title
if title == orig_title:
@@ -44,107 +47,134 @@ class DoubanMovie(AbstractSite):
# there are two html formats for authors and translators
other_title_elem = content.xpath(
"//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]")
other_title = other_title_elem[0].strip().split(
' / ') if other_title_elem else None
"//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]"
)
other_title = (
other_title_elem[0].strip().split(" / ") if other_title_elem else None
)
imdb_elem = content.xpath(
"//div[@id='info']//span[text()='IMDb链接:']/following-sibling::a[1]/text()")
"//div[@id='info']//span[text()='IMDb链接:']/following-sibling::a[1]/text()"
)
if not imdb_elem:
imdb_elem = content.xpath(
"//div[@id='info']//span[text()='IMDb:']/following-sibling::text()[1]")
"//div[@id='info']//span[text()='IMDb:']/following-sibling::text()[1]"
)
imdb_code = imdb_elem[0].strip() if imdb_elem else None
director_elem = content.xpath(
"//div[@id='info']//span[text()='导演']/following-sibling::span[1]/a/text()")
"//div[@id='info']//span[text()='导演']/following-sibling::span[1]/a/text()"
)
director = director_elem if director_elem else None
playwright_elem = content.xpath(
"//div[@id='info']//span[text()='编剧']/following-sibling::span[1]/a/text()")
playwright = list(map(lambda a: a[:200], playwright_elem)) if playwright_elem else None
"//div[@id='info']//span[text()='编剧']/following-sibling::span[1]/a/text()"
)
playwright = (
list(map(lambda a: a[:200], playwright_elem)) if playwright_elem else None
)
actor_elem = content.xpath(
"//div[@id='info']//span[text()='主演']/following-sibling::span[1]/a/text()")
"//div[@id='info']//span[text()='主演']/following-sibling::span[1]/a/text()"
)
actor = list(map(lambda a: a[:200], actor_elem)) if actor_elem else None
genre_elem = content.xpath("//span[@property='v:genre']/text()")
genre = []
if genre_elem:
for g in genre_elem:
g = g.split(' ')[0]
if g == '紀錄片': # likely some original data on douban was corrupted
g = '纪录片'
elif g == '鬼怪':
g = '惊悚'
g = g.split(" ")[0]
if g == "紀錄片": # likely some original data on douban was corrupted
g = "纪录片"
elif g == "鬼怪":
g = "惊悚"
genre.append(g)
showtime_elem = content.xpath(
"//span[@property='v:initialReleaseDate']/text()")
showtime_elem = content.xpath("//span[@property='v:initialReleaseDate']/text()")
if showtime_elem:
showtime = []
for st in showtime_elem:
parts = st.split('(')
parts = st.split("(")
if len(parts) == 1:
time = st.split('(')[0]
region = ''
time = st.split("(")[0]
region = ""
else:
time = st.split('(')[0]
region = st.split('(')[1][0:-1]
time = st.split("(")[0]
region = st.split("(")[1][0:-1]
showtime.append({time: region})
else:
showtime = None
site_elem = content.xpath(
"//div[@id='info']//span[text()='官方网站:']/following-sibling::a[1]/@href")
"//div[@id='info']//span[text()='官方网站:']/following-sibling::a[1]/@href"
)
site = site_elem[0].strip()[:200] if site_elem else None
if site and not re.match(r'http.+', site):
if site and not re.match(r"http.+", site):
site = None
area_elem = content.xpath(
"//div[@id='info']//span[text()='制片国家/地区:']/following-sibling::text()[1]")
"//div[@id='info']//span[text()='制片国家/地区:']/following-sibling::text()[1]"
)
if area_elem:
area = [a.strip()[:100] for a in area_elem[0].split('/')]
area = [a.strip()[:100] for a in area_elem[0].split("/")]
else:
area = None
language_elem = content.xpath(
"//div[@id='info']//span[text()='语言:']/following-sibling::text()[1]")
"//div[@id='info']//span[text()='语言:']/following-sibling::text()[1]"
)
if language_elem:
language = [a.strip() for a in language_elem[0].split(' / ')]
language = [a.strip() for a in language_elem[0].split(" / ")]
else:
language = None
year_elem = content.xpath("//span[@class='year']/text()")
year = int(re.search(r'\d+', year_elem[0])[0]) if year_elem and re.search(r'\d+', year_elem[0]) else None
year = (
int(re.search(r"\d+", year_elem[0])[0])
if year_elem and re.search(r"\d+", year_elem[0])
else None
)
duration_elem = content.xpath("//span[@property='v:runtime']/text()")
other_duration_elem = content.xpath(
"//span[@property='v:runtime']/following-sibling::text()[1]")
"//span[@property='v:runtime']/following-sibling::text()[1]"
)
if duration_elem:
duration = duration_elem[0].strip()
if other_duration_elem:
duration += other_duration_elem[0].rstrip()
duration = duration.split('/')[0].strip()
duration = duration.split("/")[0].strip()
else:
duration = None
season_elem = content.xpath(
"//*[@id='season']/option[@selected='selected']/text()")
"//*[@id='season']/option[@selected='selected']/text()"
)
if not season_elem:
season_elem = content.xpath(
"//div[@id='info']//span[text()='季数:']/following-sibling::text()[1]")
"//div[@id='info']//span[text()='季数:']/following-sibling::text()[1]"
)
season = int(season_elem[0].strip()) if season_elem else None
else:
season = int(season_elem[0].strip())
episodes_elem = content.xpath(
"//div[@id='info']//span[text()='集数:']/following-sibling::text()[1]")
episodes = int(episodes_elem[0].strip()) if episodes_elem and episodes_elem[0].strip().isdigit() else None
"//div[@id='info']//span[text()='集数:']/following-sibling::text()[1]"
)
episodes = (
int(episodes_elem[0].strip())
if episodes_elem and episodes_elem[0].strip().isdigit()
else None
)
single_episode_length_elem = content.xpath(
"//div[@id='info']//span[text()='单集片长:']/following-sibling::text()[1]")
single_episode_length = single_episode_length_elem[0].strip(
)[:100] if single_episode_length_elem else None
"//div[@id='info']//span[text()='单集片长:']/following-sibling::text()[1]"
)
single_episode_length = (
single_episode_length_elem[0].strip()[:100]
if single_episode_length_elem
else None
)
# if the `episodes` field is not None, this must be a series
is_series = True if episodes else False
@@ -152,64 +182,87 @@ class DoubanMovie(AbstractSite):
brief_elem = content.xpath("//span[@class='all hidden']")
if not brief_elem:
brief_elem = content.xpath("//span[@property='v:summary']")
brief = '\n'.join([e.strip() for e in brief_elem[0].xpath(
'./text()')]) if brief_elem else None
brief = (
"\n".join([e.strip() for e in brief_elem[0].xpath("./text()")])
if brief_elem
else None
)
img_url_elem = content.xpath("//img[@rel='v:image']/@src")
img_url = img_url_elem[0].strip() if img_url_elem else None
pd = ResourceContent(metadata={
'title': title,
'orig_title': orig_title,
'other_title': other_title,
'imdb_code': imdb_code,
'director': director,
'playwright': playwright,
'actor': actor,
'genre': genre,
'showtime': showtime,
'site': site,
'area': area,
'language': language,
'year': year,
'duration': duration,
'season_number': season,
'episode_count': episodes,
'single_episode_length': single_episode_length,
'brief': brief,
'is_series': is_series,
'cover_image_url': img_url,
})
pd.metadata['preferred_model'] = ('TVSeason' if season else 'TVShow') if is_series else 'Movie'
pd = ResourceContent(
metadata={
"title": title,
"orig_title": orig_title,
"other_title": other_title,
"imdb_code": imdb_code,
"director": director,
"playwright": playwright,
"actor": actor,
"genre": genre,
"showtime": showtime,
"site": site,
"area": area,
"language": language,
"year": year,
"duration": duration,
"season_number": season,
"episode_count": episodes,
"single_episode_length": single_episode_length,
"brief": brief,
"is_series": is_series,
"cover_image_url": img_url,
}
)
pd.metadata["preferred_model"] = (
("TVSeason" if season else "TVShow") if is_series else "Movie"
)
if imdb_code:
res_data = search_tmdb_by_imdb_id(imdb_code)
tmdb_show_id = None
if 'movie_results' in res_data and len(res_data['movie_results']) > 0:
pd.metadata['preferred_model'] = 'Movie'
elif 'tv_results' in res_data and len(res_data['tv_results']) > 0:
pd.metadata['preferred_model'] = 'TVShow'
elif 'tv_season_results' in res_data and len(res_data['tv_season_results']) > 0:
pd.metadata['preferred_model'] = 'TVSeason'
tmdb_show_id = res_data['tv_season_results'][0]['show_id']
elif 'tv_episode_results' in res_data and len(res_data['tv_episode_results']) > 0:
pd.metadata['preferred_model'] = 'TVSeason'
tmdb_show_id = res_data['tv_episode_results'][0]['show_id']
if res_data['tv_episode_results'][0]['episode_number'] != 1:
_logger.warning(f'Douban Movie {self.url} mapping to unexpected imdb episode {imdb_code}')
resp = query_tmdb_tv_episode(tmdb_show_id, res_data['tv_episode_results'][0]['season_number'], 1)
imdb_code = resp['external_ids']['imdb_id']
_logger.warning(f'Douban Movie {self.url} re-mapped to imdb episode {imdb_code}')
if "movie_results" in res_data and len(res_data["movie_results"]) > 0:
pd.metadata["preferred_model"] = "Movie"
elif "tv_results" in res_data and len(res_data["tv_results"]) > 0:
pd.metadata["preferred_model"] = "TVShow"
elif (
"tv_season_results" in res_data
and len(res_data["tv_season_results"]) > 0
):
pd.metadata["preferred_model"] = "TVSeason"
tmdb_show_id = res_data["tv_season_results"][0]["show_id"]
elif (
"tv_episode_results" in res_data
and len(res_data["tv_episode_results"]) > 0
):
pd.metadata["preferred_model"] = "TVSeason"
tmdb_show_id = res_data["tv_episode_results"][0]["show_id"]
if res_data["tv_episode_results"][0]["episode_number"] != 1:
_logger.warning(
f"Douban Movie {self.url} mapping to unexpected imdb episode {imdb_code}"
)
resp = query_tmdb_tv_episode(
tmdb_show_id,
res_data["tv_episode_results"][0]["season_number"],
1,
)
imdb_code = resp["external_ids"]["imdb_id"]
_logger.warning(
f"Douban Movie {self.url} re-mapped to imdb episode {imdb_code}"
)
pd.lookup_ids[IdType.IMDB] = imdb_code
if tmdb_show_id:
pd.metadata['required_resources'] = [{
'model': 'TVShow',
'id_type': IdType.TMDB_TV,
'id_value': tmdb_show_id,
'title': title,
'url': TMDB_TV.id_to_url(tmdb_show_id),
}]
pd.metadata["required_resources"] = [
{
"model": "TVShow",
"id_type": IdType.TMDB_TV,
"id_value": tmdb_show_id,
"title": title,
"url": TMDB_TV.id_to_url(tmdb_show_id),
}
]
# TODO parse sister seasons
# pd.metadata['related_resources'] = []
if pd.metadata["cover_image_url"]:
@@ -218,5 +271,7 @@ class DoubanMovie(AbstractSite):
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd

View file

@@ -12,8 +12,11 @@ _logger = logging.getLogger(__name__)
class DoubanMusic(AbstractSite):
SITE_NAME = SiteName.Douban
ID_TYPE = IdType.DoubanMusic
URL_PATTERNS = [r"\w+://music\.douban\.com/subject/(\d+)/{0,1}", r"\w+://m.douban.com/music/subject/(\d+)/{0,1}"]
WIKI_PROPERTY_ID = ''
URL_PATTERNS = [
r"\w+://music\.douban\.com/subject/(\d+)/{0,1}",
r"\w+://m.douban.com/music/subject/(\d+)/{0,1}",
]
WIKI_PROPERTY_ID = ""
DEFAULT_MODEL = Album
@classmethod
@@ -28,75 +31,95 @@ class DoubanMusic(AbstractSite):
if not title:
raise ParseError(self, "title")
artists_elem = content.xpath("//div[@id='info']/span/span[@class='pl']/a/text()")
artist = None if not artists_elem else list(map(lambda a: a[:200], artists_elem))
artists_elem = content.xpath(
"//div[@id='info']/span/span[@class='pl']/a/text()"
)
artist = (
None if not artists_elem else list(map(lambda a: a[:200], artists_elem))
)
genre_elem = content.xpath(
"//div[@id='info']//span[text()='流派:']/following::text()[1]")
"//div[@id='info']//span[text()='流派:']/following::text()[1]"
)
genre = genre_elem[0].strip() if genre_elem else None
date_elem = content.xpath(
"//div[@id='info']//span[text()='发行时间:']/following::text()[1]")
release_date = dateparser.parse(date_elem[0].strip()).strftime('%Y-%m-%d') if date_elem else None
"//div[@id='info']//span[text()='发行时间:']/following::text()[1]"
)
release_date = (
dateparser.parse(date_elem[0].strip()).strftime("%Y-%m-%d")
if date_elem
else None
)
company_elem = content.xpath(
"//div[@id='info']//span[text()='出版者:']/following::text()[1]")
"//div[@id='info']//span[text()='出版者:']/following::text()[1]"
)
company = company_elem[0].strip() if company_elem else None
track_list_elem = content.xpath(
"//div[@class='track-list']/div[@class='indent']/div/text()"
)
if track_list_elem:
track_list = '\n'.join([track.strip() for track in track_list_elem])
track_list = "\n".join([track.strip() for track in track_list_elem])
else:
track_list = None
brief_elem = content.xpath("//span[@class='all hidden']")
if not brief_elem:
brief_elem = content.xpath("//span[@property='v:summary']")
brief = '\n'.join([e.strip() for e in brief_elem[0].xpath(
'./text()')]) if brief_elem else None
brief = (
"\n".join([e.strip() for e in brief_elem[0].xpath("./text()")])
if brief_elem
else None
)
img_url_elem = content.xpath("//div[@id='mainpic']//img/@src")
img_url = img_url_elem[0].strip() if img_url_elem else None
data = {
'title': title,
'artist': artist,
'genre': genre,
'release_date': release_date,
'duration': None,
'company': [company],
'track_list': track_list,
'brief': brief,
'cover_image_url': img_url
"title": title,
"artist": artist,
"genre": genre,
"release_date": release_date,
"duration": None,
"company": [company],
"track_list": track_list,
"brief": brief,
"cover_image_url": img_url,
}
gtin = None
isrc = None
other_elem = content.xpath(
"//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]")
"//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]"
)
if other_elem:
data['other_title'] = other_elem[0].strip()
data["other_title"] = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='专辑类型:']/following-sibling::text()[1]")
"//div[@id='info']//span[text()='专辑类型:']/following-sibling::text()[1]"
)
if other_elem:
data['album_type'] = other_elem[0].strip()
data["album_type"] = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='介质:']/following-sibling::text()[1]")
"//div[@id='info']//span[text()='介质:']/following-sibling::text()[1]"
)
if other_elem:
data['media'] = other_elem[0].strip()
data["media"] = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='ISRC:']/following-sibling::text()[1]")
"//div[@id='info']//span[text()='ISRC:']/following-sibling::text()[1]"
)
if other_elem:
isrc = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='条形码:']/following-sibling::text()[1]")
"//div[@id='info']//span[text()='条形码:']/following-sibling::text()[1]"
)
if other_elem:
gtin = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='碟片数:']/following-sibling::text()[1]")
"//div[@id='info']//span[text()='碟片数:']/following-sibling::text()[1]"
)
if other_elem:
data['disc_count'] = other_elem[0].strip()
data["disc_count"] = other_elem[0].strip()
pd = ResourceContent(metadata=data)
if gtin:
@@ -109,5 +132,7 @@ class DoubanMusic(AbstractSite):
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd

View file

@@ -14,7 +14,7 @@ class GoodreadsDownloader(RetryDownloader):
if response is None:
return RESPONSE_NETWORK_ERROR
elif response.status_code == 200:
if response.text.find('__NEXT_DATA__') != -1:
if response.text.find("__NEXT_DATA__") != -1:
return RESPONSE_OK
else:
# Goodreads may return legacy version for a/b testing
@@ -28,9 +28,12 @@ class GoodreadsDownloader(RetryDownloader):
class Goodreads(AbstractSite):
SITE_NAME = SiteName.Goodreads
ID_TYPE = IdType.Goodreads
WIKI_PROPERTY_ID = 'P2968'
WIKI_PROPERTY_ID = "P2968"
DEFAULT_MODEL = Edition
URL_PATTERNS = [r".+goodreads.com/.*book/show/(\d+)", r".+goodreads.com/.*book/(\d+)"]
URL_PATTERNS = [
r".+goodreads.com/.*book/show/(\d+)",
r".+goodreads.com/.*book/(\d+)",
]
@classmethod
def id_to_url(self, id_value):
@@ -48,39 +51,41 @@ class Goodreads(AbstractSite):
elem = h.xpath('//script[@id="__NEXT_DATA__"]/text()')
src = elem[0].strip() if elem else None
if not src:
raise ParseError(self, '__NEXT_DATA__ element')
d = json.loads(src)['props']['pageProps']['apolloState']
o = {'Book': [], 'Work': [], 'Series': [], 'Contributor': []}
raise ParseError(self, "__NEXT_DATA__ element")
d = json.loads(src)["props"]["pageProps"]["apolloState"]
o = {"Book": [], "Work": [], "Series": [], "Contributor": []}
for v in d.values():
t = v.get('__typename')
t = v.get("__typename")
if t and t in o:
o[t].append(v)
b = next(filter(lambda x: x.get('title'), o['Book']), None)
b = next(filter(lambda x: x.get("title"), o["Book"]), None)
if not b:
# Goodreads may return an empty page template when an internal service times out
raise ParseError(self, 'Book in __NEXT_DATA__ json')
data['title'] = b['title']
data['brief'] = b['description']
raise ParseError(self, "Book in __NEXT_DATA__ json")
data["title"] = b["title"]
data["brief"] = b["description"]
ids = {}
t, n = detect_isbn_asin(b['details'].get('asin'))
t, n = detect_isbn_asin(b["details"].get("asin"))
if t:
ids[t] = n
t, n = detect_isbn_asin(b['details'].get('isbn13'))
t, n = detect_isbn_asin(b["details"].get("isbn13"))
if t:
ids[t] = n
# Amazon is known to reuse another book's ISBN as the ASIN,
# so we always overwrite the ASIN-converted ISBN with the real ISBN
data['pages'] = b['details'].get('numPages')
data['cover_image_url'] = b['imageUrl']
w = next(filter(lambda x: x.get('details'), o['Work']), None)
data["pages"] = b["details"].get("numPages")
data["cover_image_url"] = b["imageUrl"]
w = next(filter(lambda x: x.get("details"), o["Work"]), None)
if w:
data['required_resources'] = [{
'model': 'Work',
'id_type': IdType.Goodreads_Work,
'id_value': str(w['legacyId']),
'title': w['details']['originalTitle'],
'url': w['editions']['webUrl'],
}]
data["required_resources"] = [
{
"model": "Work",
"id_type": IdType.Goodreads_Work,
"id_value": str(w["legacyId"]),
"title": w["details"]["originalTitle"],
"url": w["editions"]["webUrl"],
}
]
pd = ResourceContent(metadata=data)
pd.lookup_ids[IdType.ISBN] = ids.get(IdType.ISBN)
pd.lookup_ids[IdType.ASIN] = ids.get(IdType.ASIN)
@@ -90,7 +95,9 @@ class Goodreads(AbstractSite):
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {data["cover_image_url"]}')
_logger.debug(
f'failed to download cover for {self.url} from {data["cover_image_url"]}'
)
return pd
@@ -98,7 +105,7 @@ class Goodreads(AbstractSite):
class Goodreads_Work(AbstractSite):
SITE_NAME = SiteName.Goodreads
ID_TYPE = IdType.Goodreads_Work
WIKI_PROPERTY_ID = ''
WIKI_PROPERTY_ID = ""
DEFAULT_MODEL = Work
URL_PATTERNS = [r".+goodreads.com/work/editions/(\d+)"]
@@ -111,14 +118,18 @@ class Goodreads_Work(AbstractSite):
title_elem = content.xpath("//h1/a/text()")
title = title_elem[0].strip() if title_elem else None
if not title:
raise ParseError(self, 'title')
raise ParseError(self, "title")
author_elem = content.xpath("//h2/a/text()")
author = author_elem[0].strip() if author_elem else None
first_published_elem = content.xpath("//h2/span/text()")
first_published = first_published_elem[0].strip() if first_published_elem else None
pd = ResourceContent(metadata={
'title': title,
'author': author,
'first_published': first_published
})
first_published = (
first_published_elem[0].strip() if first_published_elem else None
)
pd = ResourceContent(
metadata={
"title": title,
"author": author,
"first_published": first_published,
}
)
return pd
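The Goodreads scraper above reads the embedded __NEXT_DATA__ JSON instead of the visible HTML. A rough standalone sketch of just that extraction step, using the same XPath and JSON path as the hunk above (it assumes the page text has already been fetched; the real downloader above adds retries and legacy-page detection):

import json
from lxml import html

def next_data_books(page_text: str) -> list:
    """Return the Book records embedded in a Goodreads page's __NEXT_DATA__ blob."""
    h = html.fromstring(page_text)
    src = h.xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
    state = json.loads(src)["props"]["pageProps"]["apolloState"]
    # records are tagged with __typename; the scraper also collects Work/Series/Contributor
    return [v for v in state.values() if v.get("__typename") == "Book"]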

View file

@@ -16,7 +16,7 @@ class GoogleBooks(AbstractSite):
r"https://www\.google\.co[^/]+/books/edition/[^/]+/([^&#?]+)",
r"https://books\.google\.co[^/]+/books/about/[^?]+?id=([^&#?]+)",
]
WIKI_PROPERTY_ID = ''
WIKI_PROPERTY_ID = ""
DEFAULT_MODEL = Edition
@classmethod
@@ -24,57 +24,76 @@ class GoogleBooks(AbstractSite):
return "https://books.google.com/books?id=" + id_value
def scrape(self):
api_url = f'https://www.googleapis.com/books/v1/volumes/{self.id_value}'
api_url = f"https://www.googleapis.com/books/v1/volumes/{self.id_value}"
b = BasicDownloader(api_url).download().json()
other = {}
title = b['volumeInfo']['title']
subtitle = b['volumeInfo']['subtitle'] if 'subtitle' in b['volumeInfo'] else None
title = b["volumeInfo"]["title"]
subtitle = (
b["volumeInfo"]["subtitle"] if "subtitle" in b["volumeInfo"] else None
)
pub_year = None
pub_month = None
if 'publishedDate' in b['volumeInfo']:
pub_date = b['volumeInfo']['publishedDate'].split('-')
if "publishedDate" in b["volumeInfo"]:
pub_date = b["volumeInfo"]["publishedDate"].split("-")
pub_year = pub_date[0]
pub_month = pub_date[1] if len(pub_date) > 1 else None
pub_house = b['volumeInfo']['publisher'] if 'publisher' in b['volumeInfo'] else None
language = b['volumeInfo']['language'] if 'language' in b['volumeInfo'] else None
pages = b['volumeInfo']['pageCount'] if 'pageCount' in b['volumeInfo'] else None
if 'mainCategory' in b['volumeInfo']:
other['分类'] = b['volumeInfo']['mainCategory']
authors = b['volumeInfo']['authors'] if 'authors' in b['volumeInfo'] else None
if 'description' in b['volumeInfo']:
brief = b['volumeInfo']['description']
elif 'textSnippet' in b['volumeInfo']:
pub_house = (
b["volumeInfo"]["publisher"] if "publisher" in b["volumeInfo"] else None
)
language = (
b["volumeInfo"]["language"] if "language" in b["volumeInfo"] else None
)
pages = b["volumeInfo"]["pageCount"] if "pageCount" in b["volumeInfo"] else None
if "mainCategory" in b["volumeInfo"]:
other["分类"] = b["volumeInfo"]["mainCategory"]
authors = b["volumeInfo"]["authors"] if "authors" in b["volumeInfo"] else None
if "description" in b["volumeInfo"]:
brief = b["volumeInfo"]["description"]
elif "textSnippet" in b["volumeInfo"]:
brief = b["volumeInfo"]["textSnippet"]["searchInfo"]
else:
brief = ''
brief = re.sub(r'<.*?>', '', brief.replace('<br', '\n<br'))
img_url = b['volumeInfo']['imageLinks']['thumbnail'] if 'imageLinks' in b['volumeInfo'] else None
brief = ""
brief = re.sub(r"<.*?>", "", brief.replace("<br", "\n<br"))
img_url = (
b["volumeInfo"]["imageLinks"]["thumbnail"]
if "imageLinks" in b["volumeInfo"]
else None
)
isbn10 = None
isbn13 = None
for iid in b['volumeInfo']['industryIdentifiers'] if 'industryIdentifiers' in b['volumeInfo'] else []:
if iid['type'] == 'ISBN_10':
isbn10 = iid['identifier']
if iid['type'] == 'ISBN_13':
isbn13 = iid['identifier']
for iid in (
b["volumeInfo"]["industryIdentifiers"]
if "industryIdentifiers" in b["volumeInfo"]
else []
):
if iid["type"] == "ISBN_10":
isbn10 = iid["identifier"]
if iid["type"] == "ISBN_13":
isbn13 = iid["identifier"]
isbn = isbn13 # if isbn13 is not None else isbn10
raw_img, ext = BasicImageDownloader.download_image(img_url, self.url)
data = {
'title': title,
'subtitle': subtitle,
'orig_title': None,
'author': authors,
'translator': None,
'language': language,
'pub_house': pub_house,
'pub_year': pub_year,
'pub_month': pub_month,
'binding': None,
'pages': pages,
'isbn': isbn,
'brief': brief,
'contents': None,
'other_info': other,
'cover_image_url': img_url,
"title": title,
"subtitle": subtitle,
"orig_title": None,
"author": authors,
"translator": None,
"language": language,
"pub_house": pub_house,
"pub_year": pub_year,
"pub_month": pub_month,
"binding": None,
"pages": pages,
"isbn": isbn,
"brief": brief,
"contents": None,
"other_info": other,
"cover_image_url": img_url,
}
return ResourceContent(metadata=data, cover_image=raw_img, cover_image_extention=ext, lookup_ids={IdType.ISBN: isbn13})
return ResourceContent(
metadata=data,
cover_image=raw_img,
cover_image_extention=ext,
lookup_ids={IdType.ISBN: isbn13},
)

View file

@@ -19,10 +19,12 @@ _logger = logging.getLogger(__name__)
def _igdb_access_token():
try:
token = requests.post(f'https://id.twitch.tv/oauth2/token?client_id={settings.IGDB_CLIENT_ID}&client_secret={settings.IGDB_CLIENT_SECRET}&grant_type=client_credentials').json()['access_token']
token = requests.post(
f"https://id.twitch.tv/oauth2/token?client_id={settings.IGDB_CLIENT_ID}&client_secret={settings.IGDB_CLIENT_SECRET}&grant_type=client_credentials"
).json()["access_token"]
except Exception:
_logger.error('unable to obtain IGDB token')
token = '<invalid>'
_logger.error("unable to obtain IGDB token")
token = "<invalid>"
return token
@@ -30,11 +32,11 @@ _wrapper = IGDBWrapper(settings.IGDB_CLIENT_ID, _igdb_access_token())
def search_igdb_by_3p_url(steam_url):
r = IGDB.api_query('websites', f'fields *, game.*; where url = "{steam_url}";')
r = IGDB.api_query("websites", f'fields *, game.*; where url = "{steam_url}";')
if not r:
return None
r = sorted(r, key=lambda w: w['game']['id'])
return IGDB(url=r[0]['game']['url'])
r = sorted(r, key=lambda w: w["game"]["id"])
return IGDB(url=r[0]["game"]["url"])
@SiteManager.register
@@ -42,7 +44,7 @@ class IGDB(AbstractSite):
SITE_NAME = SiteName.IGDB
ID_TYPE = IdType.IGDB
URL_PATTERNS = [r"\w+://www\.igdb\.com/games/([a-zA-Z0-9\-_]+)"]
WIKI_PROPERTY_ID = '?'
WIKI_PROPERTY_ID = "?"
DEFAULT_MODEL = Game
@classmethod
@@ -51,64 +53,97 @@ class IGDB(AbstractSite):
@classmethod
def api_query(cls, p, q):
key = 'igdb:' + p + '/' + q
key = "igdb:" + p + "/" + q
if get_mock_mode():
r = BasicDownloader(key).download().json()
else:
r = json.loads(_wrapper.api_request(p, q))
if settings.DOWNLOADER_SAVEDIR:
with open(settings.DOWNLOADER_SAVEDIR + '/' + get_mock_file(key), 'w', encoding='utf-8') as fp:
with open(
settings.DOWNLOADER_SAVEDIR + "/" + get_mock_file(key),
"w",
encoding="utf-8",
) as fp:
fp.write(json.dumps(r))
return r
def scrape(self):
fields = '*, cover.url, genres.name, platforms.name, involved_companies.*, involved_companies.company.name'
r = self.api_query('games', f'fields {fields}; where url = "{self.url}";')[0]
brief = r['summary'] if 'summary' in r else ''
brief += "\n\n" + r['storyline'] if 'storyline' in r else ''
fields = "*, cover.url, genres.name, platforms.name, involved_companies.*, involved_companies.company.name"
r = self.api_query("games", f'fields {fields}; where url = "{self.url}";')[0]
brief = r["summary"] if "summary" in r else ""
brief += "\n\n" + r["storyline"] if "storyline" in r else ""
developer = None
publisher = None
release_date = None
genre = None
platform = None
if 'involved_companies' in r:
developer = next(iter([c['company']['name'] for c in r['involved_companies'] if c['developer']]), None)
publisher = next(iter([c['company']['name'] for c in r['involved_companies'] if c['publisher']]), None)
if 'platforms' in r:
ps = sorted(r['platforms'], key=lambda p: p['id'])
platform = [(p['name'] if p['id'] != 6 else 'Windows') for p in ps]
if 'first_release_date' in r:
release_date = datetime.datetime.fromtimestamp(r['first_release_date'], datetime.timezone.utc).strftime('%Y-%m-%d')
if 'genres' in r:
genre = [g['name'] for g in r['genres']]
websites = self.api_query('websites', f'fields *; where game.url = "{self.url}";')
if "involved_companies" in r:
developer = next(
iter(
[
c["company"]["name"]
for c in r["involved_companies"]
if c["developer"]
]
),
None,
)
publisher = next(
iter(
[
c["company"]["name"]
for c in r["involved_companies"]
if c["publisher"]
]
),
None,
)
if "platforms" in r:
ps = sorted(r["platforms"], key=lambda p: p["id"])
platform = [(p["name"] if p["id"] != 6 else "Windows") for p in ps]
if "first_release_date" in r:
release_date = datetime.datetime.fromtimestamp(
r["first_release_date"], datetime.timezone.utc
).strftime("%Y-%m-%d")
if "genres" in r:
genre = [g["name"] for g in r["genres"]]
websites = self.api_query(
"websites", f'fields *; where game.url = "{self.url}";'
)
steam_url = None
official_site = None
for website in websites:
if website['category'] == 1:
official_site = website['url']
elif website['category'] == 13:
steam_url = website['url']
pd = ResourceContent(metadata={
'title': r['name'],
'other_title': [],
'developer': [developer],
'publisher': [publisher],
'release_date': release_date,
'genre': genre,
'platform': platform,
'brief': brief,
'official_site': official_site,
'igdb_id': r['id'],
'cover_image_url': 'https:' + r['cover']['url'].replace('t_thumb', 't_cover_big'),
})
if website["category"] == 1:
official_site = website["url"]
elif website["category"] == 13:
steam_url = website["url"]
pd = ResourceContent(
metadata={
"title": r["name"],
"other_title": [],
"developer": [developer],
"publisher": [publisher],
"release_date": release_date,
"genre": genre,
"platform": platform,
"brief": brief,
"official_site": official_site,
"igdb_id": r["id"],
"cover_image_url": "https:"
+ r["cover"]["url"].replace("t_thumb", "t_cover_big"),
}
)
if steam_url:
pd.lookup_ids[IdType.Steam] = SiteManager.get_site_by_id_type(IdType.Steam).url_to_id(steam_url)
pd.lookup_ids[IdType.Steam] = SiteManager.get_site_by_id_type(
IdType.Steam
).url_to_id(steam_url)
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd
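The IGDB lookups above go through IGDBWrapper with a Twitch OAuth token and APIcalypse query strings. A minimal sketch of the same call outside the site class (the import path assumes the igdb-api-v4 package; the client id and token are placeholders, and the field list is trimmed from the one used above):

import json
from igdb.wrapper import IGDBWrapper

wrapper = IGDBWrapper("your-client-id", "your-access-token")  # placeholders
raw = wrapper.api_request(
    "games",
    'fields name, summary, cover.url, genres.name, platforms.name; '
    'where url = "https://www.igdb.com/games/example-game";',  # hypothetical slug
)
game = json.loads(raw)[0]
print(game["name"])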

View file

@@ -12,8 +12,8 @@ _logger = logging.getLogger(__name__)
class IMDB(AbstractSite):
SITE_NAME = SiteName.IMDB
ID_TYPE = IdType.IMDB
URL_PATTERNS = [r'\w+://www.imdb.com/title/(tt\d+)']
WIKI_PROPERTY_ID = '?'
URL_PATTERNS = [r"\w+://www.imdb.com/title/(tt\d+)"]
WIKI_PROPERTY_ID = "?"
@classmethod
def id_to_url(self, id_value):
@@ -22,28 +22,35 @@ class IMDB(AbstractSite):
def scrape(self):
self.scraped = False
res_data = search_tmdb_by_imdb_id(self.id_value)
if 'movie_results' in res_data and len(res_data['movie_results']) > 0:
url = f"https://www.themoviedb.org/movie/{res_data['movie_results'][0]['id']}"
elif 'tv_results' in res_data and len(res_data['tv_results']) > 0:
if "movie_results" in res_data and len(res_data["movie_results"]) > 0:
url = (
f"https://www.themoviedb.org/movie/{res_data['movie_results'][0]['id']}"
)
elif "tv_results" in res_data and len(res_data["tv_results"]) > 0:
url = f"https://www.themoviedb.org/tv/{res_data['tv_results'][0]['id']}"
elif 'tv_season_results' in res_data and len(res_data['tv_season_results']) > 0:
elif "tv_season_results" in res_data and len(res_data["tv_season_results"]) > 0:
# this should not happen given IMDB only has ids for either show or episode
tv_id = res_data['tv_season_results'][0]['show_id']
season_number = res_data['tv_season_results'][0]['season_number']
tv_id = res_data["tv_season_results"][0]["show_id"]
season_number = res_data["tv_season_results"][0]["season_number"]
url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}/episode/{episode_number}"
elif 'tv_episode_results' in res_data and len(res_data['tv_episode_results']) > 0:
tv_id = res_data['tv_episode_results'][0]['show_id']
season_number = res_data['tv_episode_results'][0]['season_number']
episode_number = res_data['tv_episode_results'][0]['episode_number']
elif (
"tv_episode_results" in res_data and len(res_data["tv_episode_results"]) > 0
):
tv_id = res_data["tv_episode_results"][0]["show_id"]
season_number = res_data["tv_episode_results"][0]["season_number"]
episode_number = res_data["tv_episode_results"][0]["episode_number"]
if season_number == 0:
url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}/episode/{episode_number}"
elif episode_number == 1:
url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}"
else:
raise ParseError(self, "IMDB id matching TMDB but not first episode, this is not supported")
raise ParseError(
self,
"IMDB id matching TMDB but not first episode, this is not supported",
)
else:
raise ParseError(self, "IMDB id not found in TMDB")
tmdb = SiteManager.get_site_by_url(url)
pd = tmdb.scrape()
pd.metadata['preferred_model'] = tmdb.DEFAULT_MODEL.__name__
pd.metadata["preferred_model"] = tmdb.DEFAULT_MODEL.__name__
return pd
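The lookup above relies on search_tmdb_by_imdb_id, which is not shown in this diff; it presumably wraps TMDB's /find endpoint. A hedged sketch under that assumption (endpoint path and parameters follow TMDB's public v3 API; the function name is illustrative):

# Hedged sketch of an IMDB-to-TMDB lookup; the project's real helper lives elsewhere.
import requests


def find_tmdb_by_imdb_id(imdb_id, api_key):
    r = requests.get(
        f"https://api.themoviedb.org/3/find/{imdb_id}",
        params={"api_key": api_key, "external_source": "imdb_id"},
    )
    r.raise_for_status()
    # the payload groups matches into movie_results, tv_results,
    # tv_season_results and tv_episode_results -- the keys checked above
    return r.json()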

View file

@@ -23,8 +23,8 @@ spotify_token_expire_time = time.time()
class Spotify(AbstractSite):
SITE_NAME = SiteName.Spotify
ID_TYPE = IdType.Spotify_Album
URL_PATTERNS = [r'\w+://open\.spotify\.com/album/([a-zA-Z0-9]+)']
WIKI_PROPERTY_ID = '?'
URL_PATTERNS = [r"\w+://open\.spotify\.com/album/([a-zA-Z0-9]+)"]
WIKI_PROPERTY_ID = "?"
DEFAULT_MODEL = Album
@classmethod
@@ -33,58 +33,63 @@ class Spotify(AbstractSite):
def scrape(self):
api_url = "https://api.spotify.com/v1/albums/" + self.id_value
headers = {
'Authorization': f"Bearer {get_spotify_token()}"
}
headers = {"Authorization": f"Bearer {get_spotify_token()}"}
res_data = BasicDownloader(api_url, headers=headers).download().json()
artist = []
for artist_dict in res_data['artists']:
artist.append(artist_dict['name'])
for artist_dict in res_data["artists"]:
artist.append(artist_dict["name"])
title = res_data['name']
title = res_data["name"]
genre = ', '.join(res_data['genres'])
genre = ", ".join(res_data["genres"])
company = []
for com in res_data['copyrights']:
company.append(com['text'])
for com in res_data["copyrights"]:
company.append(com["text"])
duration = 0
track_list = []
track_urls = []
for track in res_data['tracks']['items']:
track_urls.append(track['external_urls']['spotify'])
duration += track['duration_ms']
if res_data['tracks']['items'][-1]['disc_number'] > 1:
for track in res_data["tracks"]["items"]:
track_urls.append(track["external_urls"]["spotify"])
duration += track["duration_ms"]
if res_data["tracks"]["items"][-1]["disc_number"] > 1:
# more than one disc
track_list.append(str(
track['disc_number']) + '-' + str(track['track_number']) + '. ' + track['name'])
track_list.append(
str(track["disc_number"])
+ "-"
+ str(track["track_number"])
+ ". "
+ track["name"]
)
else:
track_list.append(str(track['track_number']) + '. ' + track['name'])
track_list = '\n'.join(track_list)
track_list.append(str(track["track_number"]) + ". " + track["name"])
track_list = "\n".join(track_list)
release_date = dateparser.parse(res_data['release_date']).strftime('%Y-%m-%d')
release_date = dateparser.parse(res_data["release_date"]).strftime("%Y-%m-%d")
gtin = None
if res_data['external_ids'].get('upc'):
gtin = res_data['external_ids'].get('upc')
if res_data['external_ids'].get('ean'):
gtin = res_data['external_ids'].get('ean')
if res_data["external_ids"].get("upc"):
gtin = res_data["external_ids"].get("upc")
if res_data["external_ids"].get("ean"):
gtin = res_data["external_ids"].get("ean")
isrc = None
if res_data['external_ids'].get('isrc'):
isrc = res_data['external_ids'].get('isrc')
if res_data["external_ids"].get("isrc"):
isrc = res_data["external_ids"].get("isrc")
pd = ResourceContent(metadata={
'title': title,
'artist': artist,
'genre': genre,
'track_list': track_list,
'release_date': release_date,
'duration': duration,
'company': company,
'brief': None,
'cover_image_url': res_data['images'][0]['url']
})
pd = ResourceContent(
metadata={
"title": title,
"artist": artist,
"genre": genre,
"track_list": track_list,
"release_date": release_date,
"duration": duration,
"company": company,
"brief": None,
"cover_image_url": res_data["images"][0]["url"],
}
)
if gtin:
pd.lookup_ids[IdType.GTIN] = gtin
if isrc:
@@ -95,14 +100,16 @@ class Spotify(AbstractSite):
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd
def get_spotify_token():
global spotify_token, spotify_token_expire_time
if get_mock_mode():
return 'mocked'
return "mocked"
if spotify_token is None or is_spotify_token_expired():
invoke_spotify_token()
return spotify_token
@@ -117,12 +124,8 @@ def invoke_spotify_token():
global spotify_token, spotify_token_expire_time
r = requests.post(
"https://accounts.spotify.com/api/token",
data={
"grant_type": "client_credentials"
},
headers={
"Authorization": f"Basic {settings.SPOTIFY_CREDENTIAL}"
}
data={"grant_type": "client_credentials"},
headers={"Authorization": f"Basic {settings.SPOTIFY_CREDENTIAL}"},
)
data = r.json()
if r.status_code == 401:
@@ -131,16 +134,12 @@ def invoke_spotify_token():
# for example debugging using a http client
r = requests.post(
"https://accounts.spotify.com/api/token",
data={
"grant_type": "client_credentials"
},
headers={
"Authorization": f"Basic {settings.SPOTIFY_CREDENTIAL}"
}
data={"grant_type": "client_credentials"},
headers={"Authorization": f"Basic {settings.SPOTIFY_CREDENTIAL}"},
)
data = r.json()
elif r.status_code != 200:
raise Exception(f"Request to spotify API fails. Reason: {r.reason}")
# subtract 2 seconds to allow for execution time
spotify_token_expire_time = int(data['expires_in']) + time.time() - 2
spotify_token = data['access_token']
spotify_token_expire_time = int(data["expires_in"]) + time.time() - 2
spotify_token = data["access_token"]
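For reference, the two hunks above implement a cached client-credentials token that is refreshed shortly before expiry. A self-contained sketch of the same flow (module-level names are illustrative; basic_credential stands for the base64-encoded client id and secret kept in settings.SPOTIFY_CREDENTIAL):

# Hedged sketch of the cached app-token flow used by the Spotify site class above.
import time

import requests

_token = None
_token_expires_at = 0.0


def get_app_token(basic_credential):
    """Return a cached Spotify app token, refreshing it shortly before expiry."""
    global _token, _token_expires_at
    if _token is None or time.time() >= _token_expires_at:
        r = requests.post(
            "https://accounts.spotify.com/api/token",
            data={"grant_type": "client_credentials"},
            headers={"Authorization": f"Basic {basic_credential}"},
        )
        r.raise_for_status()
        data = r.json()
        # keep a 2-second margin, mirroring the "- 2" in the code above
        _token_expires_at = time.time() + int(data["expires_in"]) - 2
        _token = data["access_token"]
    return _token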

View file

@@ -13,7 +13,7 @@ class Steam(AbstractSite):
SITE_NAME = SiteName.Steam
ID_TYPE = IdType.Steam
URL_PATTERNS = [r"\w+://store\.steampowered\.com/app/(\d+)"]
WIKI_PROPERTY_ID = '?'
WIKI_PROPERTY_ID = "?"
DEFAULT_MODEL = Game
@classmethod
@@ -25,41 +25,58 @@
pd = i.scrape() if i else ResourceContent()
headers = BasicDownloader.headers.copy()
headers['Host'] = 'store.steampowered.com'
headers['Cookie'] = "wants_mature_content=1; birthtime=754700401;"
headers["Host"] = "store.steampowered.com"
headers["Cookie"] = "wants_mature_content=1; birthtime=754700401;"
content = BasicDownloader(self.url, headers=headers).download().html()
title = content.xpath("//div[@class='apphub_AppName']/text()")[0]
developer = content.xpath("//div[@id='developers_list']/a/text()")
publisher = content.xpath("//div[@class='glance_ctn']//div[@class='dev_row'][2]//a/text()")
publisher = content.xpath(
"//div[@class='glance_ctn']//div[@class='dev_row'][2]//a/text()"
)
release_date = dateparser.parse(
content.xpath(
"//div[@class='release_date']/div[@class='date']/text()")[0]
).strftime('%Y-%m-%d')
content.xpath("//div[@class='release_date']/div[@class='date']/text()")[0]
).strftime("%Y-%m-%d")
genre = content.xpath(
"//div[@class='details_block']/b[2]/following-sibling::a/text()")
platform = ['PC']
brief = content.xpath(
"//div[@class='game_description_snippet']/text()")[0].strip()
"//div[@class='details_block']/b[2]/following-sibling::a/text()"
)
platform = ["PC"]
brief = content.xpath("//div[@class='game_description_snippet']/text()")[
0
].strip()
# try Steam images if no image from IGDB
if pd.cover_image is None:
pd.metadata['cover_image_url'] = content.xpath("//img[@class='game_header_image_full']/@src")[0].replace("header.jpg", "library_600x900.jpg")
pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(pd.metadata['cover_image_url'], self.url)
pd.metadata["cover_image_url"] = content.xpath(
"//img[@class='game_header_image_full']/@src"
)[0].replace("header.jpg", "library_600x900.jpg")
(
pd.cover_image,
pd.cover_image_extention,
) = BasicImageDownloader.download_image(
pd.metadata["cover_image_url"], self.url
)
if pd.cover_image is None:
pd.metadata['cover_image_url'] = content.xpath("//img[@class='game_header_image_full']/@src")[0]
pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(pd.metadata['cover_image_url'], self.url)
pd.metadata["cover_image_url"] = content.xpath(
"//img[@class='game_header_image_full']/@src"
)[0]
(
pd.cover_image,
pd.cover_image_extention,
) = BasicImageDownloader.download_image(
pd.metadata["cover_image_url"], self.url
)
# merge data from IGDB, use localized Steam data if available
d = {
'developer': developer,
'publisher': publisher,
'release_date': release_date,
'genre': genre,
'platform': platform,
"developer": developer,
"publisher": publisher,
"release_date": release_date,
"genre": genre,
"platform": platform,
}
d.update(pd.metadata)
pd.metadata = d
if title:
pd.metadata['title'] = title
pd.metadata["title"] = title
if brief:
pd.metadata['brief'] = brief
pd.metadata["brief"] = brief
return pd
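The cover logic above prefers Steam's vertical library art and falls back to the plain header image when the larger file cannot be downloaded. A tiny illustrative helper (function name and sample URL are made up):

# Illustrative only: candidate cover URLs in the order the scraper above tries them.
def steam_cover_candidates(header_url):
    # most store pages expose a 600x900 "library" image next to header.jpg,
    # but older titles may only have the header image
    return [header_url.replace("header.jpg", "library_600x900.jpg"), header_url]


print(
    steam_cover_candidates(
        "https://cdn.cloudflare.steamstatic.com/steam/apps/570/header.jpg"
    )
)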

View file

@@ -37,8 +37,8 @@ def _copy_dict(s, key_map):
class TMDB_Movie(AbstractSite):
SITE_NAME = SiteName.TMDB
ID_TYPE = IdType.TMDB_Movie
URL_PATTERNS = [r'\w+://www.themoviedb.org/movie/(\d+)']
WIKI_PROPERTY_ID = '?'
URL_PATTERNS = [r"\w+://www.themoviedb.org/movie/(\d+)"]
WIKI_PROPERTY_ID = "?"
DEFAULT_MODEL = Movie
@classmethod
@@ -55,37 +55,59 @@ class TMDB_Movie(AbstractSite):
res_data = BasicDownloader(api_url).download().json()
if is_series:
title = res_data['name']
orig_title = res_data['original_name']
year = int(res_data['first_air_date'].split(
'-')[0]) if res_data['first_air_date'] else None
imdb_code = res_data['external_ids']['imdb_id']
showtime = [{res_data['first_air_date']: "首播日期"}
] if res_data['first_air_date'] else None
title = res_data["name"]
orig_title = res_data["original_name"]
year = (
int(res_data["first_air_date"].split("-")[0])
if res_data["first_air_date"]
else None
)
imdb_code = res_data["external_ids"]["imdb_id"]
showtime = (
[{res_data["first_air_date"]: "首播日期"}]
if res_data["first_air_date"]
else None
)
duration = None
else:
title = res_data['title']
orig_title = res_data['original_title']
year = int(res_data['release_date'].split('-')
[0]) if res_data['release_date'] else None
showtime = [{res_data['release_date']: "发布日期"}
] if res_data['release_date'] else None
imdb_code = res_data['imdb_id']
title = res_data["title"]
orig_title = res_data["original_title"]
year = (
int(res_data["release_date"].split("-")[0])
if res_data["release_date"]
else None
)
showtime = (
[{res_data["release_date"]: "发布日期"}]
if res_data["release_date"]
else None
)
imdb_code = res_data["imdb_id"]
# in minutes
duration = res_data['runtime'] if res_data['runtime'] else None
duration = res_data["runtime"] if res_data["runtime"] else None
genre = [x['name'] for x in res_data['genres']]
language = list(map(lambda x: x['name'], res_data['spoken_languages']))
brief = res_data['overview']
genre = [x["name"] for x in res_data["genres"]]
language = list(map(lambda x: x["name"], res_data["spoken_languages"]))
brief = res_data["overview"]
if is_series:
director = list(map(lambda x: x['name'], res_data['created_by']))
director = list(map(lambda x: x["name"], res_data["created_by"]))
else:
director = list(map(lambda x: x['name'], filter(
lambda c: c['job'] == 'Director', res_data['credits']['crew'])))
playwright = list(map(lambda x: x['name'], filter(
lambda c: c['job'] == 'Screenplay', res_data['credits']['crew'])))
actor = list(map(lambda x: x['name'], res_data['credits']['cast']))
director = list(
map(
lambda x: x["name"],
filter(
lambda c: c["job"] == "Director", res_data["credits"]["crew"]
),
)
)
playwright = list(
map(
lambda x: x["name"],
filter(lambda c: c["job"] == "Screenplay", res_data["credits"]["crew"]),
)
)
actor = list(map(lambda x: x["name"], res_data["credits"]["cast"]))
area = []
other_info = {}
@@ -95,33 +117,39 @@ class TMDB_Movie(AbstractSite):
# other_info['奖项'] = res_data['awards']
# other_info['TMDB_ID'] = id
if is_series:
other_info['Seasons'] = res_data['number_of_seasons']
other_info['Episodes'] = res_data['number_of_episodes']
other_info["Seasons"] = res_data["number_of_seasons"]
other_info["Episodes"] = res_data["number_of_episodes"]
# TODO: use GET /configuration to get base url
img_url = ('https://image.tmdb.org/t/p/original/' + res_data['poster_path']) if res_data['poster_path'] is not None else None
img_url = (
("https://image.tmdb.org/t/p/original/" + res_data["poster_path"])
if res_data["poster_path"] is not None
else None
)
pd = ResourceContent(metadata={
'title': title,
'orig_title': orig_title,
'other_title': None,
'imdb_code': imdb_code,
'director': director,
'playwright': playwright,
'actor': actor,
'genre': genre,
'showtime': showtime,
'site': None,
'area': area,
'language': language,
'year': year,
'duration': duration,
'season': None,
'episodes': None,
'single_episode_length': None,
'brief': brief,
'cover_image_url': img_url,
})
pd = ResourceContent(
metadata={
"title": title,
"orig_title": orig_title,
"other_title": None,
"imdb_code": imdb_code,
"director": director,
"playwright": playwright,
"actor": actor,
"genre": genre,
"showtime": showtime,
"site": None,
"area": area,
"language": language,
"year": year,
"duration": duration,
"season": None,
"episodes": None,
"single_episode_length": None,
"brief": brief,
"cover_image_url": img_url,
}
)
if imdb_code:
pd.lookup_ids[IdType.IMDB] = imdb_code
if pd.metadata["cover_image_url"]:
@@ -130,7 +158,9 @@ class TMDB_Movie(AbstractSite):
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd
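On the "# TODO: use GET /configuration to get base url" note above: TMDB's configuration endpoint reports the image base URL and the available poster sizes, so the hard-coded "https://image.tmdb.org/t/p/original/" prefix could be derived at runtime. A hedged sketch (field names follow TMDB's public API; error handling omitted):

# Hedged sketch of deriving the poster base URL from TMDB's /configuration endpoint.
import requests


def tmdb_poster_base(api_key, size="original"):
    conf = requests.get(
        "https://api.themoviedb.org/3/configuration", params={"api_key": api_key}
    ).json()
    images = conf["images"]
    # fall back to the always-available "original" preset
    chosen = size if size in images.get("poster_sizes", []) else "original"
    return images["secure_base_url"] + chosen + "/"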
@@ -138,8 +168,11 @@ class TMDB_Movie(AbstractSite):
class TMDB_TV(AbstractSite):
SITE_NAME = SiteName.TMDB
ID_TYPE = IdType.TMDB_TV
URL_PATTERNS = [r'\w+://www.themoviedb.org/tv/(\d+)[^/]*$', r'\w+://www.themoviedb.org/tv/(\d+)[^/]*/seasons']
WIKI_PROPERTY_ID = '?'
URL_PATTERNS = [
r"\w+://www.themoviedb.org/tv/(\d+)[^/]*$",
r"\w+://www.themoviedb.org/tv/(\d+)[^/]*/seasons",
]
WIKI_PROPERTY_ID = "?"
DEFAULT_MODEL = TVShow
@classmethod
@@ -156,38 +189,60 @@ class TMDB_TV(AbstractSite):
res_data = BasicDownloader(api_url).download().json()
if is_series:
title = res_data['name']
orig_title = res_data['original_name']
year = int(res_data['first_air_date'].split(
'-')[0]) if res_data['first_air_date'] else None
imdb_code = res_data['external_ids']['imdb_id']
showtime = [{res_data['first_air_date']: "首播日期"}
] if res_data['first_air_date'] else None
title = res_data["name"]
orig_title = res_data["original_name"]
year = (
int(res_data["first_air_date"].split("-")[0])
if res_data["first_air_date"]
else None
)
imdb_code = res_data["external_ids"]["imdb_id"]
showtime = (
[{res_data["first_air_date"]: "首播日期"}]
if res_data["first_air_date"]
else None
)
duration = None
else:
title = res_data['title']
orig_title = res_data['original_title']
year = int(res_data['release_date'].split('-')
[0]) if res_data['release_date'] else None
showtime = [{res_data['release_date']: "发布日期"}
] if res_data['release_date'] else None
imdb_code = res_data['imdb_id']
title = res_data["title"]
orig_title = res_data["original_title"]
year = (
int(res_data["release_date"].split("-")[0])
if res_data["release_date"]
else None
)
showtime = (
[{res_data["release_date"]: "发布日期"}]
if res_data["release_date"]
else None
)
imdb_code = res_data["imdb_id"]
# in minutes
duration = res_data['runtime'] if res_data['runtime'] else None
duration = res_data["runtime"] if res_data["runtime"] else None
genre = [x['name'] for x in res_data['genres']]
genre = [x["name"] for x in res_data["genres"]]
language = list(map(lambda x: x['name'], res_data['spoken_languages']))
brief = res_data['overview']
language = list(map(lambda x: x["name"], res_data["spoken_languages"]))
brief = res_data["overview"]
if is_series:
director = list(map(lambda x: x['name'], res_data['created_by']))
director = list(map(lambda x: x["name"], res_data["created_by"]))
else:
director = list(map(lambda x: x['name'], filter(
lambda c: c['job'] == 'Director', res_data['credits']['crew'])))
playwright = list(map(lambda x: x['name'], filter(
lambda c: c['job'] == 'Screenplay', res_data['credits']['crew'])))
actor = list(map(lambda x: x['name'], res_data['credits']['cast']))
director = list(
map(
lambda x: x["name"],
filter(
lambda c: c["job"] == "Director", res_data["credits"]["crew"]
),
)
)
playwright = list(
map(
lambda x: x["name"],
filter(lambda c: c["job"] == "Screenplay", res_data["credits"]["crew"]),
)
)
actor = list(map(lambda x: x["name"], res_data["credits"]["cast"]))
area = []
other_info = {}
@@ -197,41 +252,53 @@ class TMDB_TV(AbstractSite):
# other_info['奖项'] = res_data['awards']
# other_info['TMDB_ID'] = id
if is_series:
other_info['Seasons'] = res_data['number_of_seasons']
other_info['Episodes'] = res_data['number_of_episodes']
other_info["Seasons"] = res_data["number_of_seasons"]
other_info["Episodes"] = res_data["number_of_episodes"]
# TODO: use GET /configuration to get base url
img_url = ('https://image.tmdb.org/t/p/original/' + res_data['poster_path']) if res_data['poster_path'] is not None else None
img_url = (
("https://image.tmdb.org/t/p/original/" + res_data["poster_path"])
if res_data["poster_path"] is not None
else None
)
season_links = list(map(lambda s: {
'model': 'TVSeason',
'id_type': IdType.TMDB_TVSeason,
'id_value': f'{self.id_value}-{s["season_number"]}',
'title': s['name'],
'url': f'{self.url}/season/{s["season_number"]}'}, res_data['seasons']))
pd = ResourceContent(metadata={
'title': title,
'orig_title': orig_title,
'other_title': None,
'imdb_code': imdb_code,
'director': director,
'playwright': playwright,
'actor': actor,
'genre': genre,
'showtime': showtime,
'site': None,
'area': area,
'language': language,
'year': year,
'duration': duration,
'season_count': res_data['number_of_seasons'],
'season': None,
'episodes': None,
'single_episode_length': None,
'brief': brief,
'cover_image_url': img_url,
'related_resources': season_links,
})
season_links = list(
map(
lambda s: {
"model": "TVSeason",
"id_type": IdType.TMDB_TVSeason,
"id_value": f'{self.id_value}-{s["season_number"]}',
"title": s["name"],
"url": f'{self.url}/season/{s["season_number"]}',
},
res_data["seasons"],
)
)
pd = ResourceContent(
metadata={
"title": title,
"orig_title": orig_title,
"other_title": None,
"imdb_code": imdb_code,
"director": director,
"playwright": playwright,
"actor": actor,
"genre": genre,
"showtime": showtime,
"site": None,
"area": area,
"language": language,
"year": year,
"duration": duration,
"season_count": res_data["number_of_seasons"],
"season": None,
"episodes": None,
"single_episode_length": None,
"brief": brief,
"cover_image_url": img_url,
"related_resources": season_links,
}
)
if imdb_code:
pd.lookup_ids[IdType.IMDB] = imdb_code
@@ -241,7 +308,9 @@ class TMDB_TV(AbstractSite):
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
return pd
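The related_resources entries above give each season a composite "{show_id}-{season_number}" id that TMDB_TVSeason later splits apart again. A small worked example of the resulting shape (values illustrative; IdType.TMDB_TVSeason is replaced by a plain string for the sketch):

# Illustrative only: what the season_links mapping above evaluates to for one show.
tv_id = "57243"
show_url = f"https://www.themoviedb.org/tv/{tv_id}"
seasons = [
    {"season_number": 1, "name": "Series 1"},
    {"season_number": 2, "name": "Series 2"},
]
season_links = [
    {
        "model": "TVSeason",
        "id_type": "TMDB_TVSeason",  # IdType.TMDB_TVSeason in the real code
        "id_value": f"{tv_id}-{s['season_number']}",
        "title": s["name"],
        "url": f"{show_url}/season/{s['season_number']}",
    }
    for s in seasons
]
print(season_links[0]["id_value"])  # -> 57243-1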
@@ -249,58 +318,87 @@ class TMDB_TV(AbstractSite):
class TMDB_TVSeason(AbstractSite):
SITE_NAME = SiteName.TMDB
ID_TYPE = IdType.TMDB_TVSeason
URL_PATTERNS = [r'\w+://www.themoviedb.org/tv/(\d+)[^/]*/season/(\d+)[^/]*$']
WIKI_PROPERTY_ID = '?'
URL_PATTERNS = [r"\w+://www.themoviedb.org/tv/(\d+)[^/]*/season/(\d+)[^/]*$"]
WIKI_PROPERTY_ID = "?"
DEFAULT_MODEL = TVSeason
ID_PATTERN = r'^(\d+)-(\d+)$'
ID_PATTERN = r"^(\d+)-(\d+)$"
@classmethod
def url_to_id(cls, url: str):
u = next(iter([re.match(p, url) for p in cls.URL_PATTERNS if re.match(p, url)]), None)
return u[1] + '-' + u[2] if u else None
u = next(
iter([re.match(p, url) for p in cls.URL_PATTERNS if re.match(p, url)]), None
)
return u[1] + "-" + u[2] if u else None
@classmethod
def id_to_url(cls, id_value):
v = id_value.split('-')
v = id_value.split("-")
return f"https://www.themoviedb.org/tv/{v[0]}/season/{v[1]}"
def scrape(self):
v = self.id_value.split('-')
v = self.id_value.split("-")
api_url = f"https://api.themoviedb.org/3/tv/{v[0]}/season/{v[1]}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
d = BasicDownloader(api_url).download().json()
if not d.get('id'):
raise ParseError('id')
pd = ResourceContent(metadata=_copy_dict(d, {'name': 'title', 'overview': 'brief', 'air_date': 'air_date', 'season_number': 0, 'external_ids': []}))
pd.metadata['required_resources'] = [{
'model': 'TVShow',
'id_type': IdType.TMDB_TV,
'id_value': v[0],
'title': f'TMDB TV Show {v[0]}',
'url': f"https://www.themoviedb.org/tv/{v[0]}",
}]
pd.lookup_ids[IdType.IMDB] = d['external_ids'].get('imdb_id')
pd.metadata['cover_image_url'] = ('https://image.tmdb.org/t/p/original/' + d['poster_path']) if d['poster_path'] else None
pd.metadata['title'] = pd.metadata['title'] if pd.metadata['title'] else f'Season {d["season_number"]}'
pd.metadata['episode_number_list'] = list(map(lambda ep: ep['episode_number'], d['episodes']))
pd.metadata['episode_count'] = len(pd.metadata['episode_number_list'])
if not d.get("id"):
raise ParseError("id")
pd = ResourceContent(
metadata=_copy_dict(
d,
{
"name": "title",
"overview": "brief",
"air_date": "air_date",
"season_number": 0,
"external_ids": [],
},
)
)
pd.metadata["required_resources"] = [
{
"model": "TVShow",
"id_type": IdType.TMDB_TV,
"id_value": v[0],
"title": f"TMDB TV Show {v[0]}",
"url": f"https://www.themoviedb.org/tv/{v[0]}",
}
]
pd.lookup_ids[IdType.IMDB] = d["external_ids"].get("imdb_id")
pd.metadata["cover_image_url"] = (
("https://image.tmdb.org/t/p/original/" + d["poster_path"])
if d["poster_path"]
else None
)
pd.metadata["title"] = (
pd.metadata["title"]
if pd.metadata["title"]
else f'Season {d["season_number"]}'
)
pd.metadata["episode_number_list"] = list(
map(lambda ep: ep["episode_number"], d["episodes"])
)
pd.metadata["episode_count"] = len(pd.metadata["episode_number_list"])
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
_logger.debug(
f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}'
)
# get external id from 1st episode
if pd.lookup_ids[IdType.IMDB]:
_logger.warning("Unexpected IMDB id for TMDB tv season")
elif len(pd.metadata['episode_number_list']) == 0:
_logger.warning("Unable to lookup IMDB id for TMDB tv season with zero episodes")
elif len(pd.metadata["episode_number_list"]) == 0:
_logger.warning(
"Unable to lookup IMDB id for TMDB tv season with zero episodes"
)
else:
ep = pd.metadata['episode_number_list'][0]
ep = pd.metadata["episode_number_list"][0]
api_url2 = f"https://api.themoviedb.org/3/tv/{v[0]}/season/{v[1]}/episode/{ep}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
d2 = BasicDownloader(api_url2).download().json()
if not d2.get('id'):
raise ParseError('episode id for season')
pd.lookup_ids[IdType.IMDB] = d2['external_ids'].get('imdb_id')
if not d2.get("id"):
raise ParseError("episode id for season")
pd.lookup_ids[IdType.IMDB] = d2["external_ids"].get("imdb_id")
return pd
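The tail of this hunk fills in a season's IMDB id from its first episode when the season itself carries none. A compact sketch of that lookup (endpoint shape taken from the api_url2 string above; the helper name is illustrative):

# Hedged sketch of the "IMDB id from the first episode" fallback used above.
import requests


def season_imdb_id(tv_id, season_number, first_episode, api_key):
    r = requests.get(
        f"https://api.themoviedb.org/3/tv/{tv_id}/season/{season_number}"
        f"/episode/{first_episode}",
        params={"api_key": api_key, "append_to_response": "external_ids"},
    )
    r.raise_for_status()
    # external_ids.imdb_id may still be None if TMDB has no mapping
    return r.json().get("external_ids", {}).get("imdb_id")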

View file

@@ -31,8 +31,8 @@ from django.utils.translation import gettext_lazy as _
class TVShow(Item):
category = ItemCategory.TV
url_path = 'tv'
demonstrative = _('这部剧集')
url_path = "tv"
demonstrative = _("这部剧集")
imdb = PrimaryLookupIdDescriptor(IdType.IMDB)
tmdb_tv = PrimaryLookupIdDescriptor(IdType.TMDB_TV)
imdb = PrimaryLookupIdDescriptor(IdType.IMDB)
@@ -40,100 +40,208 @@ class TVShow(Item):
episode_count = models.PositiveIntegerField(null=True)
METADATA_COPY_LIST = [
'title',
'season_count',
'orig_title',
'other_title',
'director',
'playwright',
'actor',
'genre',
'showtime',
'site',
'area',
'language',
'year',
'duration',
'season_count',
'episode_count',
'single_episode_length',
'brief',
"title",
"season_count",
"orig_title",
"other_title",
"director",
"playwright",
"actor",
"genre",
"showtime",
"site",
"area",
"language",
"year",
"duration",
"season_count",
"episode_count",
"single_episode_length",
"brief",
]
orig_title = jsondata.CharField(_("original title"), blank=True, default='', max_length=500)
other_title = jsondata.ArrayField(models.CharField(_("other title"), blank=True, default='', max_length=500), null=True, blank=True, default=list, )
director = jsondata.ArrayField(models.CharField(_("director"), blank=True, default='', max_length=200), null=True, blank=True, default=list, )
playwright = jsondata.ArrayField(models.CharField(_("playwright"), blank=True, default='', max_length=200), null=True, blank=True, default=list, )
actor = jsondata.ArrayField(models.CharField(_("actor"), blank=True, default='', max_length=200), null=True, blank=True, default=list, )
genre = jsondata.ArrayField(models.CharField(_("genre"), blank=True, default='', max_length=50), null=True, blank=True, default=list, ) # , choices=MovieGenreEnum.choices
showtime = jsondata.ArrayField(null=True, blank=True, default=list, )
site = jsondata.URLField(_('site url'), blank=True, default='', max_length=200)
area = jsondata.ArrayField(models.CharField(_("country or region"), blank=True, default='', max_length=100, ), null=True, blank=True, default=list, )
language = jsondata.ArrayField(models.CharField(blank=True, default='', max_length=100, ), null=True, blank=True, default=list, )
orig_title = jsondata.CharField(
_("original title"), blank=True, default="", max_length=500
)
other_title = jsondata.ArrayField(
models.CharField(_("other title"), blank=True, default="", max_length=500),
null=True,
blank=True,
default=list,
)
director = jsondata.ArrayField(
models.CharField(_("director"), blank=True, default="", max_length=200),
null=True,
blank=True,
default=list,
)
playwright = jsondata.ArrayField(
models.CharField(_("playwright"), blank=True, default="", max_length=200),
null=True,
blank=True,
default=list,
)
actor = jsondata.ArrayField(
models.CharField(_("actor"), blank=True, default="", max_length=200),
null=True,
blank=True,
default=list,
)
genre = jsondata.ArrayField(
models.CharField(_("genre"), blank=True, default="", max_length=50),
null=True,
blank=True,
default=list,
) # , choices=MovieGenreEnum.choices
showtime = jsondata.ArrayField(
null=True,
blank=True,
default=list,
)
site = jsondata.URLField(_("site url"), blank=True, default="", max_length=200)
area = jsondata.ArrayField(
models.CharField(
_("country or region"),
blank=True,
default="",
max_length=100,
),
null=True,
blank=True,
default=list,
)
language = jsondata.ArrayField(
models.CharField(
blank=True,
default="",
max_length=100,
),
null=True,
blank=True,
default=list,
)
year = jsondata.IntegerField(null=True, blank=True)
season_number = jsondata.IntegerField(null=True, blank=True)
single_episode_length = jsondata.IntegerField(null=True, blank=True)
duration = jsondata.CharField(blank=True, default='', max_length=200)
duration = jsondata.CharField(blank=True, default="", max_length=200)
class TVSeason(Item):
category = ItemCategory.TV
url_path = 'tv/season'
demonstrative = _('这部剧集')
url_path = "tv/season"
demonstrative = _("这部剧集")
douban_movie = PrimaryLookupIdDescriptor(IdType.DoubanMovie)
imdb = PrimaryLookupIdDescriptor(IdType.IMDB)
tmdb_tvseason = PrimaryLookupIdDescriptor(IdType.TMDB_TVSeason)
show = models.ForeignKey(TVShow, null=True, on_delete=models.SET_NULL, related_name='seasons')
show = models.ForeignKey(
TVShow, null=True, on_delete=models.SET_NULL, related_name="seasons"
)
season_number = models.PositiveIntegerField(null=True)
episode_count = models.PositiveIntegerField(null=True)
METADATA_COPY_LIST = [
'title',
'orig_title',
'other_title',
'director',
'playwright',
'actor',
'genre',
'showtime',
'site',
'area',
'language',
'year',
'duration',
'season_number',
'episode_count',
'single_episode_length',
'brief',
"title",
"orig_title",
"other_title",
"director",
"playwright",
"actor",
"genre",
"showtime",
"site",
"area",
"language",
"year",
"duration",
"season_number",
"episode_count",
"single_episode_length",
"brief",
]
orig_title = jsondata.CharField(_("original title"), blank=True, default='', max_length=500)
other_title = jsondata.ArrayField(models.CharField(_("other title"), blank=True, default='', max_length=500), null=True, blank=True, default=list, )
director = jsondata.ArrayField(models.CharField(_("director"), blank=True, default='', max_length=200), null=True, blank=True, default=list, )
playwright = jsondata.ArrayField(models.CharField(_("playwright"), blank=True, default='', max_length=200), null=True, blank=True, default=list, )
actor = jsondata.ArrayField(models.CharField(_("actor"), blank=True, default='', max_length=200), null=True, blank=True, default=list, )
genre = jsondata.ArrayField(models.CharField(_("genre"), blank=True, default='', max_length=50), null=True, blank=True, default=list, ) # , choices=MovieGenreEnum.choices
showtime = jsondata.ArrayField(null=True, blank=True, default=list, )
site = jsondata.URLField(_('site url'), blank=True, default='', max_length=200)
area = jsondata.ArrayField(models.CharField(_("country or region"), blank=True, default='', max_length=100, ), null=True, blank=True, default=list, )
language = jsondata.ArrayField(models.CharField(blank=True, default='', max_length=100, ), null=True, blank=True, default=list, )
orig_title = jsondata.CharField(
_("original title"), blank=True, default="", max_length=500
)
other_title = jsondata.ArrayField(
models.CharField(_("other title"), blank=True, default="", max_length=500),
null=True,
blank=True,
default=list,
)
director = jsondata.ArrayField(
models.CharField(_("director"), blank=True, default="", max_length=200),
null=True,
blank=True,
default=list,
)
playwright = jsondata.ArrayField(
models.CharField(_("playwright"), blank=True, default="", max_length=200),
null=True,
blank=True,
default=list,
)
actor = jsondata.ArrayField(
models.CharField(_("actor"), blank=True, default="", max_length=200),
null=True,
blank=True,
default=list,
)
genre = jsondata.ArrayField(
models.CharField(_("genre"), blank=True, default="", max_length=50),
null=True,
blank=True,
default=list,
) # , choices=MovieGenreEnum.choices
showtime = jsondata.ArrayField(
null=True,
blank=True,
default=list,
)
site = jsondata.URLField(_("site url"), blank=True, default="", max_length=200)
area = jsondata.ArrayField(
models.CharField(
_("country or region"),
blank=True,
default="",
max_length=100,
),
null=True,
blank=True,
default=list,
)
language = jsondata.ArrayField(
models.CharField(
blank=True,
default="",
max_length=100,
),
null=True,
blank=True,
default=list,
)
year = jsondata.IntegerField(null=True, blank=True)
single_episode_length = jsondata.IntegerField(null=True, blank=True)
duration = jsondata.CharField(blank=True, default='', max_length=200)
duration = jsondata.CharField(blank=True, default="", max_length=200)
def update_linked_items_from_external_resource(self, resource):
"""add Work from resource.metadata['work'] if not yet"""
links = resource.required_resources + resource.related_resources
for w in links:
if w['model'] == 'TVShow':
p = ExternalResource.objects.filter(id_type=w['id_type'], id_value=w['id_value']).first()
if w["model"] == "TVShow":
p = ExternalResource.objects.filter(
id_type=w["id_type"], id_value=w["id_value"]
).first()
if p and p.item and self.show != p.item:
self.show = p.item
class TVEpisode(Item):
category = ItemCategory.TV
url_path = 'tv/episode'
show = models.ForeignKey(TVShow, null=True, on_delete=models.SET_NULL, related_name='episodes')
season = models.ForeignKey(TVSeason, null=True, on_delete=models.SET_NULL, related_name='episodes')
url_path = "tv/episode"
show = models.ForeignKey(
TVShow, null=True, on_delete=models.SET_NULL, related_name="episodes"
)
season = models.ForeignKey(
TVSeason, null=True, on_delete=models.SET_NULL, related_name="episodes"
)
episode_number = models.PositiveIntegerField(null=True)
imdb = PrimaryLookupIdDescriptor(IdType.IMDB)
METADATA_COPY_LIST = ['title', 'brief', 'episode_number']
METADATA_COPY_LIST = ["title", "brief", "episode_number"]

View file

@@ -5,10 +5,10 @@ from catalog.tv.models import *
class TMDBTVTestCase(TestCase):
def test_parse(self):
t_id = '57243'
t_url = 'https://www.themoviedb.org/tv/57243-doctor-who'
t_url1 = 'https://www.themoviedb.org/tv/57243-doctor-who/seasons'
t_url2 = 'https://www.themoviedb.org/tv/57243'
t_id = "57243"
t_url = "https://www.themoviedb.org/tv/57243-doctor-who"
t_url1 = "https://www.themoviedb.org/tv/57243-doctor-who/seasons"
t_url2 = "https://www.themoviedb.org/tv/57243"
p1 = SiteManager.get_site_by_id_type(IdType.TMDB_TV)
self.assertIsNotNone(p1)
self.assertEqual(p1.validate_url(t_url), True)
@@ -17,29 +17,29 @@ class TMDBTVTestCase(TestCase):
p2 = SiteManager.get_site_by_url(t_url)
self.assertEqual(p1.id_to_url(t_id), t_url2)
self.assertEqual(p2.url_to_id(t_url), t_id)
wrong_url = 'https://www.themoviedb.org/tv/57243-doctor-who/season/13'
wrong_url = "https://www.themoviedb.org/tv/57243-doctor-who/season/13"
s1 = SiteManager.get_site_by_url(wrong_url)
self.assertNotIsInstance(s1, TVShow)
@use_local_response
def test_scrape(self):
t_url = 'https://www.themoviedb.org/tv/57243-doctor-who'
t_url = "https://www.themoviedb.org/tv/57243-doctor-who"
site = SiteManager.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.id_value, '57243')
self.assertEqual(site.id_value, "57243")
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata['title'], '神秘博士')
self.assertEqual(site.resource.metadata["title"], "神秘博士")
self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.resource.item.__class__.__name__, 'TVShow')
self.assertEqual(site.resource.item.imdb, 'tt0436992')
self.assertEqual(site.resource.item.__class__.__name__, "TVShow")
self.assertEqual(site.resource.item.imdb, "tt0436992")
class TMDBTVSeasonTestCase(TestCase):
def test_parse(self):
t_id = '57243-11'
t_url = 'https://www.themoviedb.org/tv/57243-doctor-who/season/11'
t_url_unique = 'https://www.themoviedb.org/tv/57243/season/11'
t_id = "57243-11"
t_url = "https://www.themoviedb.org/tv/57243-doctor-who/season/11"
t_url_unique = "https://www.themoviedb.org/tv/57243/season/11"
p1 = SiteManager.get_site_by_id_type(IdType.TMDB_TVSeason)
self.assertIsNotNone(p1)
self.assertEqual(p1.validate_url(t_url), True)
@@ -50,48 +50,48 @@ class TMDBTVSeasonTestCase(TestCase):
@use_local_response
def test_scrape(self):
t_url = 'https://www.themoviedb.org/tv/57243-doctor-who/season/4'
t_url = "https://www.themoviedb.org/tv/57243-doctor-who/season/4"
site = SiteManager.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.id_value, '57243-4')
self.assertEqual(site.id_value, "57243-4")
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata['title'], '第 4 季')
self.assertEqual(site.resource.metadata["title"], "第 4 季")
self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.resource.item.__class__.__name__, 'TVSeason')
self.assertEqual(site.resource.item.imdb, 'tt1159991')
self.assertEqual(site.resource.item.__class__.__name__, "TVSeason")
self.assertEqual(site.resource.item.imdb, "tt1159991")
self.assertIsNotNone(site.resource.item.show)
self.assertEqual(site.resource.item.show.imdb, 'tt0436992')
self.assertEqual(site.resource.item.show.imdb, "tt0436992")
class DoubanMovieTVTestCase(TestCase):
@use_local_response
def test_scrape(self):
url3 = 'https://movie.douban.com/subject/3627919/'
url3 = "https://movie.douban.com/subject/3627919/"
p3 = SiteManager.get_site_by_url(url3).get_resource_ready()
self.assertEqual(p3.item.__class__.__name__, 'TVSeason')
self.assertEqual(p3.item.__class__.__name__, "TVSeason")
self.assertIsNotNone(p3.item.show)
self.assertEqual(p3.item.show.imdb, 'tt0436992')
self.assertEqual(p3.item.show.imdb, "tt0436992")
@use_local_response
def test_scrape_singleseason(self):
url3 = 'https://movie.douban.com/subject/26895436/'
url3 = "https://movie.douban.com/subject/26895436/"
p3 = SiteManager.get_site_by_url(url3).get_resource_ready()
self.assertEqual(p3.item.__class__.__name__, 'TVShow')
self.assertEqual(p3.item.__class__.__name__, "TVShow")
@use_local_response
def test_scrape_fix_imdb(self):
url = 'https://movie.douban.com/subject/35597581/'
url = "https://movie.douban.com/subject/35597581/"
item = SiteManager.get_site_by_url(url).get_resource_ready().item
# this douban links to S6E3, we'll reset it to S6E1 to keep consistent
self.assertEqual(item.imdb, 'tt21599650')
self.assertEqual(item.imdb, "tt21599650")
class MultiTVSitesTestCase(TestCase):
@use_local_response
def test_tvshows(self):
url1 = 'https://www.themoviedb.org/tv/57243-doctor-who'
url2 = 'https://www.imdb.com/title/tt0436992/'
url1 = "https://www.themoviedb.org/tv/57243-doctor-who"
url2 = "https://www.imdb.com/title/tt0436992/"
# url3 = 'https://movie.douban.com/subject/3541415/'
p1 = SiteManager.get_site_by_url(url1).get_resource_ready()
p2 = SiteManager.get_site_by_url(url2).get_resource_ready()
@@ -101,9 +101,9 @@ class MultiTVSitesTestCase(TestCase):
@use_local_response
def test_tvseasons(self):
url1 = 'https://www.themoviedb.org/tv/57243-doctor-who/season/4'
url2 = 'https://www.imdb.com/title/tt1159991/'
url3 = 'https://movie.douban.com/subject/3627919/'
url1 = "https://www.themoviedb.org/tv/57243-doctor-who/season/4"
url2 = "https://www.imdb.com/title/tt1159991/"
url3 = "https://movie.douban.com/subject/3627919/"
p1 = SiteManager.get_site_by_url(url1).get_resource_ready()
p2 = SiteManager.get_site_by_url(url2).get_resource_ready()
p3 = SiteManager.get_site_by_url(url3).get_resource_ready()
@@ -114,18 +114,18 @@ class MultiTVSitesTestCase(TestCase):
@use_local_response
def test_miniseries(self):
url1 = 'https://www.themoviedb.org/tv/86941-the-north-water'
url3 = 'https://movie.douban.com/subject/26895436/'
url1 = "https://www.themoviedb.org/tv/86941-the-north-water"
url3 = "https://movie.douban.com/subject/26895436/"
p1 = SiteManager.get_site_by_url(url1).get_resource_ready()
p3 = SiteManager.get_site_by_url(url3).get_resource_ready()
self.assertEqual(p3.item.__class__.__name__, 'TVShow')
self.assertEqual(p3.item.__class__.__name__, "TVShow")
self.assertEqual(p1.item.id, p3.item.id)
@use_local_response
def test_tvspecial(self):
url1 = 'https://www.themoviedb.org/movie/282758-doctor-who-the-runaway-bride'
url2 = 'hhttps://www.imdb.com/title/tt0827573/'
url3 = 'https://movie.douban.com/subject/4296866/'
url1 = "https://www.themoviedb.org/movie/282758-doctor-who-the-runaway-bride"
url2 = "hhttps://www.imdb.com/title/tt0827573/"
url3 = "https://movie.douban.com/subject/4296866/"
p1 = SiteManager.get_site_by_url(url1).get_resource_ready()
p2 = SiteManager.get_site_by_url(url2).get_resource_ready()
p3 = SiteManager.get_site_by_url(url3).get_resource_ready()

View file

@@ -3,13 +3,13 @@ from .api import api
from .views import *
from .models import *
app_name = 'catalog'
app_name = "catalog"
def _get_all_url_paths():
paths = ['item']
paths = ["item"]
for cls in Item.__subclasses__():
p = getattr(cls, 'url_path', None)
p = getattr(cls, "url_path", None)
if p:
paths.append(p)
res = "|".join(paths)
@@ -17,9 +17,31 @@ def _get_all_url_paths():
urlpatterns = [
re_path(r'^item/(?P<item_uid>[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12})?$', retrieve_by_uuid, name='retrieve_by_uuid'),
re_path(r'^(?P<item_path>' + _get_all_url_paths() + ')/(?P<item_uuid>[A-Za-z0-9]{21,22})$', retrieve, name='retrieve'),
re_path(r'^(?P<item_path>' + _get_all_url_paths() + ')/(?P<item_uuid>[A-Za-z0-9]{21,22})/reviews', review_list, name='review_list'),
re_path(r'^(?P<item_path>' + _get_all_url_paths() + ')/(?P<item_uuid>[A-Za-z0-9]{21,22})/marks(?:/(?P<following_only>\\w+))?', mark_list, name='mark_list'),
re_path(
r"^item/(?P<item_uid>[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12})?$",
retrieve_by_uuid,
name="retrieve_by_uuid",
),
re_path(
r"^(?P<item_path>"
+ _get_all_url_paths()
+ ")/(?P<item_uuid>[A-Za-z0-9]{21,22})$",
retrieve,
name="retrieve",
),
re_path(
r"^(?P<item_path>"
+ _get_all_url_paths()
+ ")/(?P<item_uuid>[A-Za-z0-9]{21,22})/reviews",
review_list,
name="review_list",
),
re_path(
r"^(?P<item_path>"
+ _get_all_url_paths()
+ ")/(?P<item_uuid>[A-Za-z0-9]{21,22})/marks(?:/(?P<following_only>\\w+))?",
mark_list,
name="mark_list",
),
path("api/", api.urls),
]
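For orientation: _get_all_url_paths() joins the registered url_path values into a regex alternation, so each pattern above matches an item category followed by a 21-22 character base62 uuid. A small illustration with a hand-picked path list (the real list is collected from Item.__subclasses__()):

# Illustrative only: how the item_path alternation built above matches URLs.
import re

paths = ["item", "book", "tv", "tv/season"]  # stand-in for _get_all_url_paths()
pattern = (
    r"^(?P<item_path>" + "|".join(paths) + r")/(?P<item_uuid>[A-Za-z0-9]{21,22})$"
)

m = re.match(pattern, "tv/season/7Kd0jXGYQYQcxhneGcySvW")
print(m.group("item_path"), m.group("item_uuid"))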

View file

@@ -34,9 +34,9 @@ def retrieve_by_uuid(request, item_uid):
def retrieve(request, item_path, item_uuid):
if request.method == 'GET':
if request.method == "GET":
item = get_object_or_404(Item, uid=base62.decode(item_uuid))
item_url = f'/{item_path}/{item_uuid}'
item_url = f"/{item_path}/{item_uuid}"
if item.url != item_url:
return redirect(item.url)
mark = None
@@ -44,26 +44,46 @@ def retrieve(request, item_path, item_uuid):
mark_list = None
review_list = None
collection_list = []
shelf_types = [(n[1], n[2]) for n in iter(ShelfTypeNames) if n[0] == item.category]
shelf_types = [
(n[1], n[2]) for n in iter(ShelfTypeNames) if n[0] == item.category
]
if request.user.is_authenticated:
visible = query_visible(request.user)
mark = Mark(request.user, item)
_logger.info(mark.rating)
review = mark.review
collection_list = item.collections.all().filter(visible).annotate(like_counts=Count('likes')).order_by('-like_counts')
mark_query = ShelfMember.objects.filter(item=item).filter(visible).order_by('-created_time')
mark_list = [member.mark for member in mark_query[:NUM_REVIEWS_ON_ITEM_PAGE]]
review_list = Review.objects.filter(item=item).filter(visible).order_by('-created_time')[:NUM_REVIEWS_ON_ITEM_PAGE]
collection_list = (
item.collections.all()
.filter(visible)
.annotate(like_counts=Count("likes"))
.order_by("-like_counts")
)
mark_query = (
ShelfMember.objects.filter(item=item)
.filter(visible)
.order_by("-created_time")
)
mark_list = [
member.mark for member in mark_query[:NUM_REVIEWS_ON_ITEM_PAGE]
]
review_list = (
Review.objects.filter(item=item)
.filter(visible)
.order_by("-created_time")[:NUM_REVIEWS_ON_ITEM_PAGE]
)
return render(request, item.class_name + '.html', {
'item': item,
'mark': mark,
'review': review,
'mark_list': mark_list,
'review_list': review_list,
'collection_list': collection_list,
'shelf_types': shelf_types,
}
return render(
request,
item.class_name + ".html",
{
"item": item,
"mark": mark,
"review": review,
"mark_list": mark_list,
"review_list": review_list,
"collection_list": collection_list,
"shelf_types": shelf_types,
},
)
else:
return HttpResponseBadRequest()
@@ -73,23 +93,24 @@ def mark_list(request, item_path, item_uuid, following_only=False):
item = get_object_or_404(Item, uid=base62.decode(item_uuid))
if not item:
return HttpResponseNotFound("item not found")
queryset = ShelfMember.objects.filter(item=item).order_by('-created_time')
queryset = ShelfMember.objects.filter(item=item).order_by("-created_time")
if following_only:
queryset = queryset.filter(query_following(request.user))
else:
queryset = queryset.filter(query_visible(request.user))
paginator = Paginator(queryset, NUM_REVIEWS_ON_LIST_PAGE)
page_number = request.GET.get('page', default=1)
page_number = request.GET.get("page", default=1)
marks = paginator.get_page(page_number)
marks.pagination = PageLinksGenerator(
PAGE_LINK_NUMBER, page_number, paginator.num_pages)
PAGE_LINK_NUMBER, page_number, paginator.num_pages
)
return render(
request,
'item_mark_list.html',
"item_mark_list.html",
{
'marks': marks,
'item': item,
}
"marks": marks,
"item": item,
},
)
@@ -97,18 +118,19 @@ def review_list(request, item_path, item_uuid):
item = get_object_or_404(Item, uid=base62.decode(item_uuid))
if not item:
return HttpResponseNotFound("item not found")
queryset = Review.objects.filter(item=item).order_by('-created_time')
queryset = Review.objects.filter(item=item).order_by("-created_time")
queryset = queryset.filter(query_visible(request.user))
paginator = Paginator(queryset, NUM_REVIEWS_ON_LIST_PAGE)
page_number = request.GET.get('page', default=1)
page_number = request.GET.get("page", default=1)
reviews = paginator.get_page(page_number)
reviews.pagination = PageLinksGenerator(
PAGE_LINK_NUMBER, page_number, paginator.num_pages)
PAGE_LINK_NUMBER, page_number, paginator.num_pages
)
return render(
request,
'item_review_list.html',
"item_review_list.html",
{
'reviews': reviews,
'item': item,
}
"reviews": reviews,
"item": item,
},
)
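mark_list and review_list above share the same pagination steps; a minimal restatement using only Django's stock Paginator (PageLinksGenerator is project-specific and its internals are not shown in this diff, so it is left out):

# Hedged sketch of the shared pagination step in the list views above.
from django.core.paginator import Paginator


def paginate(queryset, page_number, per_page=20):
    # per_page stands in for NUM_REVIEWS_ON_LIST_PAGE;
    # get_page() clamps out-of-range or non-numeric page numbers instead of raising
    paginator = Paginator(queryset, per_page)
    return paginator.get_page(page_number)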

View file

@@ -2,5 +2,5 @@ from django.apps import AppConfig
class JournalConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'journal'
default_auto_field = "django.db.models.BigAutoField"
name = "journal"

View file

@@ -12,27 +12,23 @@ from common.forms import PreviewImageInput
class ReviewForm(forms.ModelForm):
class Meta:
model = Review
fields = [
'id',
'item',
'title',
'body',
'visibility'
]
fields = ["id", "item", "title", "body", "visibility"]
widgets = {
'item': forms.TextInput(attrs={"hidden": ""}),
"item": forms.TextInput(attrs={"hidden": ""}),
}
title = forms.CharField(label=_("评论标题"))
body = MarkdownxFormField(label=_("评论正文 (Markdown)"))
share_to_mastodon = forms.BooleanField(
label=_("分享到联邦网络"), initial=True, required=False)
label=_("分享到联邦网络"), initial=True, required=False
)
id = forms.IntegerField(required=False, widget=forms.HiddenInput())
visibility = forms.TypedChoiceField(
label=_("可见性"),
initial=0,
coerce=int,
choices=VisibilityType.choices,
widget=forms.RadioSelect
widget=forms.RadioSelect,
)
@@ -52,26 +48,26 @@ class CollectionForm(forms.ModelForm):
initial=0,
coerce=int,
choices=VisibilityType.choices,
widget=forms.RadioSelect
widget=forms.RadioSelect,
)
collaborative = forms.TypedChoiceField(
label=_("协作整理权限"),
initial=0,
coerce=int,
choices=COLLABORATIVE_CHOICES,
widget=forms.RadioSelect
widget=forms.RadioSelect,
)
class Meta:
model = Collection
fields = [
'title',
'cover',
'visibility',
'collaborative',
'brief',
"title",
"cover",
"visibility",
"collaborative",
"brief",
]
widgets = {
'cover': PreviewImageInput(),
"cover": PreviewImageInput(),
}

View file

@@ -17,7 +17,11 @@ class UserOwnedObjectMixin:
return False
if self.visibility == 2:
return False
if viewer.is_blocking(owner) or owner.is_blocking(viewer) or viewer.is_muting(owner):
if (
viewer.is_blocking(owner)
or owner.is_blocking(viewer)
or viewer.is_muting(owner)
):
return False
if self.visibility == 1:
return viewer.is_following(owner)
@@ -25,12 +29,26 @@ class UserOwnedObjectMixin:
return True
def is_editable_by(self, viewer):
return viewer.is_authenticated and (viewer.is_staff or viewer.is_superuser or viewer == self.owner)
return viewer.is_authenticated and (
viewer.is_staff or viewer.is_superuser or viewer == self.owner
)
@classmethod
def get_available(cls, entity, request_user, following_only=False):
# e.g. SongMark.get_available(song, request.user)
query_kwargs = {entity.__class__.__name__.lower(): entity}
all_entities = cls.objects.filter(**query_kwargs).order_by("-created_time") # get all marks for song
visible_entities = list(filter(lambda _entity: _entity.is_visible_to(request_user) and (_entity.owner.mastodon_username in request_user.mastodon_following if following_only else True), all_entities))
all_entities = cls.objects.filter(**query_kwargs).order_by(
"-created_time"
) # get all marks for song
visible_entities = list(
filter(
lambda _entity: _entity.is_visible_to(request_user)
and (
_entity.owner.mastodon_username in request_user.mastodon_following
if following_only
else True
),
all_entities,
)
)
return visible_entities
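The mixin above encodes the project's three visibility levels. Restated as a framework-free sketch for readability (relationship checks are passed in as booleans, so this is not the actual mixin method):

# Hedged restatement of the visibility rules in UserOwnedObjectMixin above:
# 0 = public, 1 = followers only, 2 = owner only.
def is_visible(visibility, is_owner, is_follower, blocked_or_muted):
    if is_owner:
        return True
    if visibility == 2 or blocked_or_muted:
        return False
    if visibility == 1:
        return is_follower
    return True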

View file

@@ -7,21 +7,21 @@ register = template.Library()
@register.simple_tag(takes_context=True)
def wish_item_action(context, item):
user = context['request'].user
user = context["request"].user
if user and user.is_authenticated:
action = {
'taken': user.shelf_manager.locate_item(item) is not None,
'url': reverse("journal:wish", args=[item.uuid]),
"taken": user.shelf_manager.locate_item(item) is not None,
"url": reverse("journal:wish", args=[item.uuid]),
}
return action
@register.simple_tag(takes_context=True)
def like_piece_action(context, piece):
user = context['request'].user
user = context["request"].user
if user and user.is_authenticated:
action = {
'taken': Like.objects.filter(target=piece, owner=user).first() is not None,
'url': reverse("journal:like", args=[piece.uuid]),
"taken": Like.objects.filter(target=piece, owner=user).first() is not None,
"url": reverse("journal:like", args=[piece.uuid]),
}
return action

View file

@@ -2,8 +2,8 @@ from django.apps import AppConfig
class SocialConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'social'
default_auto_field = "django.db.models.BigAutoField"
name = "social"
def ready(self):
# load key modules in proper order, make sure class inject and signal works as expected

View file

@@ -21,23 +21,27 @@ _logger = logging.getLogger(__name__)
class ActivityTemplate(models.TextChoices):
"""
"""
MarkItem = 'mark_item'
ReviewItem = 'review_item'
CreateCollection = 'create_collection'
LikeCollection = 'like_collection'
""" """
MarkItem = "mark_item"
ReviewItem = "review_item"
CreateCollection = "create_collection"
LikeCollection = "like_collection"
class LocalActivity(models.Model, UserOwnedObjectMixin):
owner = models.ForeignKey(User, on_delete=models.CASCADE)
visibility = models.PositiveSmallIntegerField(default=0) # 0: Public / 1: Follower only / 2: Self only
template = models.CharField(blank=False, choices=ActivityTemplate.choices, max_length=50)
visibility = models.PositiveSmallIntegerField(
default=0
) # 0: Public / 1: Follower only / 2: Self only
template = models.CharField(
blank=False, choices=ActivityTemplate.choices, max_length=50
)
action_object = models.ForeignKey(Piece, on_delete=models.CASCADE)
created_time = models.DateTimeField(default=timezone.now, db_index=True)
def __str__(self):
return f'Activity [{self.owner}:{self.template}:{self.action_object}]'
return f"Activity [{self.owner}:{self.template}:{self.action_object}]"
class ActivityManager:
@@ -48,7 +52,11 @@ class ActivityManager:
q = Q(owner_id__in=self.owner.following, visibility__lt=2) | Q(owner=self.owner)
if before_time:
q = q & Q(created_time__lt=before_time)
return LocalActivity.objects.filter(q).order_by('-created_time').prefetch_related('action_object') # .select_related() https://github.com/django-polymorphic/django-polymorphic/pull/531
return (
LocalActivity.objects.filter(q)
.order_by("-created_time")
.prefetch_related("action_object")
) # .select_related() https://github.com/django-polymorphic/django-polymorphic/pull/531
@staticmethod
def get_manager_for_user(user):
@@ -56,7 +64,7 @@ class ActivityManager:
User.activity_manager = cached_property(ActivityManager.get_manager_for_user)
User.activity_manager.__set_name__(User, 'activity_manager')
User.activity_manager.__set_name__(User, "activity_manager")
class DataSignalManager:
@@ -68,9 +76,9 @@ class DataSignalManager:
if processor_class:
processor = processor_class(instance)
if created:
if hasattr(processor, 'created'):
if hasattr(processor, "created"):
processor.created()
elif hasattr(processor, 'updated'):
elif hasattr(processor, "updated"):
processor.updated()
@staticmethod
@@ -78,7 +86,7 @@ class DataSignalManager:
processor_class = DataSignalManager.processors.get(instance.__class__)
if processor_class:
processor = processor_class(instance)
if hasattr(processor, 'deleted'):
if hasattr(processor, "deleted"):
processor.deleted()
@staticmethod
@@ -103,15 +111,17 @@ class DefaultActivityProcessor:
def created(self):
params = {
'owner': self.action_object.owner,
'visibility': self.action_object.visibility,
'template': self.template,
'action_object': self.action_object,
"owner": self.action_object.owner,
"visibility": self.action_object.visibility,
"template": self.template,
"action_object": self.action_object,
}
LocalActivity.objects.create(**params)
def updated(self):
activity = LocalActivity.objects.filter(action_object=self.action_object).first()
activity = LocalActivity.objects.filter(
action_object=self.action_object
).first()
if not activity:
self.created()
elif activity.visibility != self.action_object.visibility:
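The update and delete handlers above follow Django's post_save/post_delete receiver shape; the actual registration is outside this hunk, but it would typically be wired up along these lines (a generic sketch, not the project's code):

# Hedged sketch only: connecting DataSignalManager-style handlers to Django signals.
from django.db.models.signals import post_delete, post_save


def connect_signals(model, update_handler, delete_handler):
    # post_save passes sender, instance and created, matching update_handler above
    post_save.connect(update_handler, sender=model)
    post_delete.connect(delete_handler, sender=model)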

View file

@@ -2,8 +2,8 @@ from django.urls import path, re_path
from .views import *
app_name = 'social'
app_name = "social"
urlpatterns = [
path('', feed, name='feed'),
path('data', data, name='data'),
path("", feed, name="feed"),
path("data", data, name="data"),
]

View file

@@ -23,31 +23,35 @@ PAGE_SIZE = 10
@login_required
def feed(request):
if request.method != 'GET':
if request.method != "GET":
return
user = request.user
unread = Announcement.objects.filter(pk__gt=user.read_announcement_index).order_by('-pk')
unread = Announcement.objects.filter(pk__gt=user.read_announcement_index).order_by(
"-pk"
)
if unread:
user.read_announcement_index = Announcement.objects.latest('pk').pk
user.save(update_fields=['read_announcement_index'])
user.read_announcement_index = Announcement.objects.latest("pk").pk
user.save(update_fields=["read_announcement_index"])
return render(
request,
'feed.html',
"feed.html",
{
'top_tags': user.tag_manager.all_tags[:10],
'unread_announcements': unread,
}
"top_tags": user.tag_manager.all_tags[:10],
"unread_announcements": unread,
},
)
@login_required
def data(request):
if request.method != 'GET':
if request.method != "GET":
return
return render(
request,
'feed_data.html',
"feed_data.html",
{
'activities': ActivityManager(request.user).get_timeline(before_time=request.GET.get('last'))[:PAGE_SIZE],
}
"activities": ActivityManager(request.user).get_timeline(
before_time=request.GET.get("last")
)[:PAGE_SIZE],
},
)