merge from main

Te Llamas 2022-12-09 23:42:05 +00:00
commit b5c849d6b0
102 changed files with 48009 additions and 14 deletions

2
.gitignore vendored

@ -1,3 +1,5 @@
.DS_Store
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

0
catalog/__init__.py Normal file

3
catalog/admin.py Normal file

@ -0,0 +1,3 @@
from django.contrib import admin
# Register your models here.

11
catalog/api.py Normal file

@ -0,0 +1,11 @@
from ninja import NinjaAPI
from .models import Podcast
from django.conf import settings
api = NinjaAPI(title=settings.SITE_INFO['site_name'], version="1.0.0", description=settings.SITE_INFO['site_name'])
@api.get("/podcasts/{item_id}")
def get_item(request, item_id: int):
return Podcast.objects.filter(pk=item_id).first()
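Note: the NinjaAPI router above still has to be mounted in the project URLconf, which is not part of this commit; a minimal sketch, assuming a root urls.py and an api/ prefix (both are assumptions for illustration):

from django.urls import path
from catalog.api import api

urlpatterns = [
    path("api/", api.urls),  # exposes GET /api/podcasts/{item_id}
]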

6
catalog/apps.py Normal file

@ -0,0 +1,6 @@
from django.apps import AppConfig
class CatalogConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'catalog'

77
catalog/book/models.py Normal file

@ -0,0 +1,77 @@
"""
Models for Book
Series -> Work -> Edition
Series is not fully implemented at the moment
Goodreads:
Famous works have many editions
Google Books:
only has Edition level ("volume") data
Douban:
old editions have only a CUBN (Chinese Unified Book Number)
work data seems asymmetric (a book links to a work, but may not be listed in that work as one of its editions)
"""
from django.db import models
from django.utils.translation import gettext_lazy as _
from catalog.common import *
from .utils import *
class Edition(Item):
isbn = PrimaryLookupIdDescriptor(IdType.ISBN)
asin = PrimaryLookupIdDescriptor(IdType.ASIN)
cubn = PrimaryLookupIdDescriptor(IdType.CUBN)
# douban_book = LookupIdDescriptor(IdType.DoubanBook)
# goodreads = LookupIdDescriptor(IdType.Goodreads)
languages = jsondata.ArrayField(_("语言"), null=True, blank=True, default=list)
publish_year = jsondata.IntegerField(_("发表年份"), null=True, blank=True)
publish_month = jsondata.IntegerField(_("发表月份"), null=True, blank=True)
pages = jsondata.IntegerField(blank=True, default=None)
authors = jsondata.ArrayField(_('作者'), null=False, blank=False, default=list)
translaters = jsondata.ArrayField(_('译者'), null=True, blank=True, default=list)
publishers = jsondata.ArrayField(_('出版方'), null=True, blank=True, default=list)
@property
def isbn10(self):
return isbn_13_to_10(self.isbn)
@isbn10.setter
def isbn10(self, value):
self.isbn = isbn_10_to_13(value)
def update_linked_items_from_external_resource(self, resource):
"""add Work from resource.metadata['work'] if not yet"""
links = resource.required_resources + resource.related_resources
for w in links:
if w['model'] == 'Work':
work = Work.objects.filter(primary_lookup_id_type=w['id_type'], primary_lookup_id_value=w['id_value']).first()
if work and work not in self.works.all():
self.works.add(work)
# if not work:
# _logger.info(f'Unable to find link for {w["url"]}')
class Work(Item):
# douban_work = PrimaryLookupIdDescriptor(IdType.DoubanBook_Work)
# goodreads_work = PrimaryLookupIdDescriptor(IdType.Goodreads_Work)
editions = models.ManyToManyField(Edition, related_name='works') # , through='WorkEdition'
# def __str__(self):
# return self.title
# class Meta:
# proxy = True
class Series(Item):
# douban_serie = LookupIdDescriptor(IdType.DoubanBook_Serie)
# goodreads_serie = LookupIdDescriptor(IdType.Goodreads_Serie)
class Meta:
proxy = True
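A short usage sketch for the models above (it mirrors BookTestCase.test_isbn and test_work in catalog/book/tests.py below and would run inside such a TestCase; it is not part of this commit):

from catalog.book.models import Edition, Work

edition = Edition.objects.create(title="Hyperion")
edition.isbn10 = '0553283685'            # setter converts to ISBN-13 and stores it as the primary lookup id
edition.save()
work = Work.objects.create(title="Hyperion")
work.editions.add(edition)               # plain ManyToMany; reverse accessor is edition.works
assert edition.isbn == '9780553283686'
assert work in edition.works.all()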

237
catalog/book/tests.py Normal file

@ -0,0 +1,237 @@
from django.test import TestCase
from catalog.book.models import *
from catalog.common import *
class BookTestCase(TestCase):
def setUp(self):
hyperion = Edition.objects.create(title="Hyperion")
hyperion.pages = 500
hyperion.isbn = '9780553283686'
hyperion.save()
# hyperion.isbn10 = '0553283685'
def test_properties(self):
hyperion = Edition.objects.get(title="Hyperion")
self.assertEqual(hyperion.title, "Hyperion")
self.assertEqual(hyperion.pages, 500)
self.assertEqual(hyperion.primary_lookup_id_type, IdType.ISBN)
self.assertEqual(hyperion.primary_lookup_id_value, '9780553283686')
andymion = Edition(title="Andymion", pages=42)
self.assertEqual(andymion.pages, 42)
def test_lookupids(self):
hyperion = Edition.objects.get(title="Hyperion")
hyperion.asin = 'B004G60EHS'
self.assertEqual(hyperion.primary_lookup_id_type, IdType.ASIN)
self.assertEqual(hyperion.primary_lookup_id_value, 'B004G60EHS')
self.assertEqual(hyperion.isbn, None)
self.assertEqual(hyperion.isbn10, None)
def test_isbn(self):
hyperion = Edition.objects.get(title="Hyperion")
self.assertEqual(hyperion.isbn, '9780553283686')
self.assertEqual(hyperion.isbn10, '0553283685')
hyperion.isbn10 = '0575099437'
self.assertEqual(hyperion.isbn, '9780575099432')
self.assertEqual(hyperion.isbn10, '0575099437')
def test_work(self):
hyperion_print = Edition.objects.get(title="Hyperion")
hyperion_ebook = Edition(title="Hyperion")
hyperion_ebook.save()
hyperion_ebook.asin = 'B0043M6780'
hyperion = Work(title="Hyperion")
hyperion.save()
hyperion.editions.add(hyperion_print)
hyperion.editions.add(hyperion_ebook)
# andymion = Edition(title="Andymion", pages=42)
# serie = Serie(title="Hyperion Cantos")
class GoodreadsTestCase(TestCase):
def setUp(self):
pass
def test_parse(self):
t_type = IdType.Goodreads
t_id = '77566'
t_url = 'https://www.goodreads.com/zh/book/show/77566.Hyperion'
t_url2 = 'https://www.goodreads.com/book/show/77566'
p1 = SiteList.get_site_by_id_type(t_type)
p2 = SiteList.get_site_by_url(t_url)
self.assertEqual(p1.id_to_url(t_id), t_url2)
self.assertEqual(p2.url_to_id(t_url), t_id)
@use_local_response
def test_scrape(self):
t_url = 'https://www.goodreads.com/book/show/77566.Hyperion'
t_url2 = 'https://www.goodreads.com/book/show/77566'
isbn = '9780553283686'
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.url, t_url2)
site.get_resource()
self.assertEqual(site.ready, False)
self.assertIsNotNone(site.resource)
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata.get('title'), 'Hyperion')
self.assertEqual(site.resource.metadata.get('isbn'), isbn)
self.assertEqual(site.resource.required_resources[0]['id_value'], '1383900')
edition = Edition.objects.get(primary_lookup_id_type=IdType.ISBN, primary_lookup_id_value=isbn)
resource = edition.external_resources.all().first()
self.assertEqual(resource.id_type, IdType.Goodreads)
self.assertEqual(resource.id_value, '77566')
self.assertNotEqual(resource.cover, '/media/item/default.svg')
self.assertEqual(edition.isbn, '9780553283686')
self.assertEqual(edition.title, 'Hyperion')
edition.delete()
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.url, t_url2)
site.get_resource()
self.assertEqual(site.ready, True, 'previous resource should still exist with data')
@use_local_response
def test_asin(self):
t_url = 'https://www.goodreads.com/book/show/45064996-hyperion'
site = SiteList.get_site_by_url(t_url)
site.get_resource_ready()
self.assertEqual(site.resource.item.title, 'Hyperion')
self.assertEqual(site.resource.item.asin, 'B004G60EHS')
@use_local_response
def test_work(self):
url = 'https://www.goodreads.com/work/editions/153313'
p = SiteList.get_site_by_url(url).get_resource_ready()
self.assertEqual(p.item.title, '1984')
url1 = 'https://www.goodreads.com/book/show/3597767-rok-1984'
url2 = 'https://www.goodreads.com/book/show/40961427-1984'
p1 = SiteList.get_site_by_url(url1).get_resource_ready()
p2 = SiteList.get_site_by_url(url2).get_resource_ready()
w1 = p1.item.works.all().first()
w2 = p2.item.works.all().first()
self.assertEqual(w1, w2)
class GoogleBooksTestCase(TestCase):
def test_parse(self):
t_type = IdType.GoogleBooks
t_id = 'hV--zQEACAAJ'
t_url = 'https://books.google.com.bn/books?id=hV--zQEACAAJ&hl=ms'
t_url2 = 'https://books.google.com/books?id=hV--zQEACAAJ'
p1 = SiteList.get_site_by_url(t_url)
p2 = SiteList.get_site_by_url(t_url2)
self.assertIsNotNone(p1)
self.assertEqual(p1.url, t_url2)
self.assertEqual(p1.ID_TYPE, t_type)
self.assertEqual(p1.id_value, t_id)
self.assertEqual(p2.url, t_url2)
@use_local_response
def test_scrape(self):
t_url = 'https://books.google.com.bn/books?id=hV--zQEACAAJ'
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata.get('title'), '1984 Nineteen Eighty-Four')
self.assertEqual(site.resource.metadata.get('isbn'), '9781847498571')
self.assertEqual(site.resource.id_type, IdType.GoogleBooks)
self.assertEqual(site.resource.id_value, 'hV--zQEACAAJ')
self.assertEqual(site.resource.item.isbn, '9781847498571')
self.assertEqual(site.resource.item.title, '1984 Nineteen Eighty-Four')
class DoubanBookTestCase(TestCase):
def setUp(self):
pass
def test_parse(self):
t_type = IdType.DoubanBook
t_id = '35902899'
t_url = 'https://m.douban.com/book/subject/35902899/'
t_url2 = 'https://book.douban.com/subject/35902899/'
p1 = SiteList.get_site_by_url(t_url)
p2 = SiteList.get_site_by_url(t_url2)
self.assertEqual(p1.url, t_url2)
self.assertEqual(p1.ID_TYPE, t_type)
self.assertEqual(p1.id_value, t_id)
self.assertEqual(p2.url, t_url2)
@use_local_response
def test_scrape(self):
t_url = 'https://book.douban.com/subject/35902899/'
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata.get('title'), '1984 Nineteen Eighty-Four')
self.assertEqual(site.resource.metadata.get('isbn'), '9781847498571')
self.assertEqual(site.resource.id_type, IdType.DoubanBook)
self.assertEqual(site.resource.id_value, '35902899')
self.assertEqual(site.resource.item.isbn, '9781847498571')
self.assertEqual(site.resource.item.title, '1984 Nineteen Eighty-Four')
@use_local_response
def test_work(self):
# url = 'https://www.goodreads.com/work/editions/153313'
url1 = 'https://book.douban.com/subject/1089243/'
url2 = 'https://book.douban.com/subject/2037260/'
p1 = SiteList.get_site_by_url(url1).get_resource_ready()
p2 = SiteList.get_site_by_url(url2).get_resource_ready()
w1 = p1.item.works.all().first()
w2 = p2.item.works.all().first()
self.assertEqual(w1.title, '黄金时代')
self.assertEqual(w2.title, '黄金时代')
self.assertEqual(w1, w2)
editions = w1.editions.all().order_by('title')
self.assertEqual(editions.count(), 2)
self.assertEqual(editions[0].title, 'Wang in Love and Bondage')
self.assertEqual(editions[1].title, '黄金时代')
class MultiBookSitesTestCase(TestCase):
@use_local_response
def test_editions(self):
# isbn = '9781847498571'
url1 = 'https://www.goodreads.com/book/show/56821625-1984'
url2 = 'https://book.douban.com/subject/35902899/'
url3 = 'https://books.google.com/books?id=hV--zQEACAAJ'
p1 = SiteList.get_site_by_url(url1).get_resource_ready()
p2 = SiteList.get_site_by_url(url2).get_resource_ready()
p3 = SiteList.get_site_by_url(url3).get_resource_ready()
self.assertEqual(p1.item.id, p2.item.id)
self.assertEqual(p2.item.id, p3.item.id)
@use_local_response
def test_works(self):
# url1 and url4 have the same ISBN, so they share the same Edition instance, which belongs to 2 Work instances
url1 = 'https://book.douban.com/subject/1089243/'
url2 = 'https://book.douban.com/subject/2037260/'
url3 = 'https://www.goodreads.com/book/show/59952545-golden-age'
url4 = 'https://www.goodreads.com/book/show/11798823'
p1 = SiteList.get_site_by_url(url1).get_resource_ready() # lxml bug may break this
w1 = p1.item.works.all().first()
p2 = SiteList.get_site_by_url(url2).get_resource_ready()
w2 = p2.item.works.all().first()
self.assertEqual(w1, w2)
self.assertEqual(p1.item.works.all().count(), 1)
p3 = SiteList.get_site_by_url(url3).get_resource_ready()
w3 = p3.item.works.all().first()
self.assertNotEqual(w3, w2)
p4 = SiteList.get_site_by_url(url4).get_resource_ready()
self.assertEqual(p4.item.works.all().count(), 2)
self.assertEqual(p1.item.works.all().count(), 2)
w2e = w2.editions.all().order_by('title')
self.assertEqual(w2e.count(), 2)
self.assertEqual(w2e[0].title, 'Wang in Love and Bondage')
self.assertEqual(w2e[1].title, '黄金时代')
w3e = w3.editions.all().order_by('title')
self.assertEqual(w3e.count(), 2)
self.assertEqual(w3e[0].title, 'Golden Age: A Novel')
self.assertEqual(w3e[1].title, '黄金时代')
e = Edition.objects.get(primary_lookup_id_value=9781662601217)
self.assertEqual(e.title, 'Golden Age: A Novel')

45
catalog/book/utils.py Normal file

@ -0,0 +1,45 @@
def check_digit_10(isbn):
assert len(isbn) == 9
sum = 0
for i in range(len(isbn)):
c = int(isbn[i])
w = i + 1
sum += w * c
r = sum % 11
return 'X' if r == 10 else str(r)
def check_digit_13(isbn):
assert len(isbn) == 12
sum = 0
for i in range(len(isbn)):
c = int(isbn[i])
w = 3 if i % 2 else 1
sum += w * c
r = 10 - (sum % 10)
return '0' if r == 10 else str(r)
def isbn_10_to_13(isbn):
if not isbn or len(isbn) != 10:
return None
return '978' + isbn[:-1] + check_digit_13('978' + isbn[:-1])
def isbn_13_to_10(isbn):
if not isbn or len(isbn) != 13 or isbn[:3] != '978':
return None
else:
return isbn[3:12] + check_digit_10(isbn[3:12])
def is_isbn_13(isbn):
return len(isbn) == 13
def is_isbn_10(isbn):
return len(isbn) == 10 and isbn[0] >= '0' and isbn[0] <= '9'
def is_asin(asin):
return len(asin) == 10 and asin[0].lower() == 'b'
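Worked examples for the helpers above, using the same Hyperion ISBNs as catalog/book/tests.py earlier in this commit (illustrative, not part of the diff):

from catalog.book.utils import isbn_10_to_13, isbn_13_to_10

assert isbn_10_to_13('0553283685') == '9780553283686'   # '978' + first nine digits + EAN-13 check digit 6
assert isbn_13_to_10('9780553283686') == '0553283685'   # nine digits after '978' + mod-11 check digit 5
assert isbn_13_to_10('9790000000000') is None            # only the 978 prefix can be mapped back to ISBN-10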

View file

@ -0,0 +1,8 @@
from .models import *
from .sites import *
from .downloaders import *
from .scrapers import *
from . import jsondata
__all__ = ('IdType', 'Item', 'ExternalResource', 'ResourceContent', 'ParseError', 'AbstractSite', 'SiteList', 'jsondata', 'PrimaryLookupIdDescriptor', 'LookupIdDescriptor', 'get_mock_mode', 'get_mock_file', 'use_local_response', 'RetryDownloader', 'BasicDownloader', 'ProxiedDownloader', 'BasicImageDownloader', 'RESPONSE_OK', 'RESPONSE_NETWORK_ERROR', 'RESPONSE_INVALID_CONTENT', 'RESPONSE_CENSORSHIP')

View file

@ -0,0 +1,245 @@
import requests
import filetype
from PIL import Image
from io import BytesIO
from requests.exceptions import RequestException
from django.conf import settings
from pathlib import Path
import json
from io import StringIO
import re
import time
import logging
from lxml import html
_logger = logging.getLogger(__name__)
RESPONSE_OK = 0 # response is ready for parsing
RESPONSE_INVALID_CONTENT = -1 # content not valid but no need to retry
RESPONSE_NETWORK_ERROR = -2 # network error, retry next proxied url
RESPONSE_CENSORSHIP = -3 # censored, try sth special if possible
_mock_mode = False
def use_local_response(func):
def _func(args):
set_mock_mode(True)
func(args)
set_mock_mode(False)
return _func
def set_mock_mode(enabled):
global _mock_mode
_mock_mode = enabled
def get_mock_mode():
global _mock_mode
return _mock_mode
def get_mock_file(url):
fn = re.sub(r'[^\w]', '_', url)
return re.sub(r'_key_[A-Za-z0-9]+', '_key_19890604', fn)
class DownloadError(Exception):
def __init__(self, downloader, msg=None):
self.url = downloader.url
self.logs = downloader.logs
if downloader.response_type == RESPONSE_INVALID_CONTENT:
error = "Invalid Response"
elif downloader.response_type == RESPONSE_NETWORK_ERROR:
error = "Network Error"
elif downloader.response_type == RESPONSE_CENSORSHIP:
error = "Censored Content"
else:
error = "Unknown Error"
self.message = f"Download Failed: {error}{', ' + msg if msg else ''}, url: {self.url}"
super().__init__(self.message)
class BasicDownloader:
headers = {
# 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0',
'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
'DNT': '1',
'Upgrade-Insecure-Requests': '1',
'Cache-Control': 'no-cache',
}
def __init__(self, url, headers=None):
self.url = url
self.response_type = RESPONSE_OK
self.logs = []
if headers:
self.headers = headers
def get_timeout(self):
return settings.SCRAPING_TIMEOUT
def validate_response(self, response):
if response is None:
return RESPONSE_NETWORK_ERROR
elif response.status_code == 200:
return RESPONSE_OK
else:
return RESPONSE_INVALID_CONTENT
def _download(self, url):
try:
if not _mock_mode:
# TODO cache = get/set from redis
resp = requests.get(url, headers=self.headers, timeout=self.get_timeout())
if settings.DOWNLOADER_SAVEDIR:
with open(settings.DOWNLOADER_SAVEDIR + '/' + get_mock_file(url), 'w', encoding='utf-8') as fp:
fp.write(resp.text)
else:
resp = MockResponse(self.url)
response_type = self.validate_response(resp)
self.logs.append({'response_type': response_type, 'url': url, 'exception': None})
return resp, response_type
except RequestException as e:
self.logs.append({'response_type': RESPONSE_NETWORK_ERROR, 'url': url, 'exception': e})
return None, RESPONSE_NETWORK_ERROR
def download(self):
resp, self.response_type = self._download(self.url)
if self.response_type == RESPONSE_OK:
return resp
else:
raise DownloadError(self)
class ProxiedDownloader(BasicDownloader):
def get_proxied_urls(self):
urls = []
if settings.PROXYCRAWL_KEY is not None:
urls.append(f'https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={self.url}')
if settings.SCRAPESTACK_KEY is not None:
# urls.append(f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={self.url}')
urls.append(f'http://api.scrapestack.com/scrape?keep_headers=1&access_key={settings.SCRAPESTACK_KEY}&url={self.url}')
if settings.SCRAPERAPI_KEY is not None:
urls.append(f'http://api.scraperapi.com/?api_key={settings.SCRAPERAPI_KEY}&url={self.url}')
return urls
def get_special_proxied_url(self):
return f'{settings.LOCAL_PROXY}?url={self.url}' if settings.LOCAL_PROXY is not None else None
def download(self):
urls = self.get_proxied_urls()
last_try = False
url = urls.pop(0) if len(urls) else None
resp = None
while url:
resp, resp_type = self._download(url)
if resp_type == RESPONSE_OK or resp_type == RESPONSE_INVALID_CONTENT or last_try:
url = None
elif resp_type == RESPONSE_CENSORSHIP:
url = self.get_special_proxied_url()
last_try = True
else: # resp_type == RESPONSE_NETWORK_ERROR:
url = urls.pop(0) if len(urls) else None
self.response_type = resp_type
if self.response_type == RESPONSE_OK:
return resp
else:
raise DownloadError(self)
class RetryDownloader(BasicDownloader):
def download(self):
retries = settings.DOWNLOADER_RETRIES
while retries:
retries -= 1
resp, self.response_type = self._download(self.url)
if self.response_type == RESPONSE_OK:
return resp
elif self.response_type != RESPONSE_NETWORK_ERROR and retries == 0:
raise DownloadError(self)
elif retries > 0:
_logger.debug('Retry ' + self.url)
time.sleep((settings.DOWNLOADER_RETRIES - retries) * 0.5)
raise DownloadError(self, 'max out of retries')
class ImageDownloaderMixin:
def __init__(self, url, referer=None):
if referer is not None:
self.headers['Referer'] = referer
super().__init__(url)
def validate_response(self, response):
if response and response.status_code == 200:
try:
raw_img = response.content
img = Image.open(BytesIO(raw_img))
img.load() # corrupted image will trigger exception
content_type = response.headers.get('Content-Type')
self.extention = filetype.get_type(mime=content_type.partition(';')[0].strip()).extension
return RESPONSE_OK
except Exception:
return RESPONSE_NETWORK_ERROR
if response and response.status_code >= 400 and response.status_code < 500:
return RESPONSE_INVALID_CONTENT
else:
return RESPONSE_NETWORK_ERROR
class BasicImageDownloader(ImageDownloaderMixin, BasicDownloader):
@classmethod
def download_image(cls, image_url, page_url):
imgdl = cls(image_url, page_url)
try:
image = imgdl.download().content
image_extention = imgdl.extention
return image, image_extention
except Exception:
return None, None
class ProxiedImageDownloader(ImageDownloaderMixin, ProxiedDownloader):
pass
_local_response_path = str(Path(__file__).parent.parent.parent.absolute()) + '/test_data/'
class MockResponse:
def __init__(self, url):
self.url = url
fn = _local_response_path + get_mock_file(url)
try:
self.content = Path(fn).read_bytes()
self.status_code = 200
_logger.debug(f"use local response for {url} from {fn}")
except Exception:
self.content = b'Error: response file not found'
self.status_code = 404
_logger.debug(f"local response not found for {url} at {fn}")
@property
def text(self):
return self.content.decode('utf-8')
def json(self):
return json.load(StringIO(self.text))
def html(self):
return html.fromstring(self.text) # may throw exception unexpectedly due to OS bug
@property
def headers(self):
return {'Content-Type': 'image/jpeg' if self.url.endswith('jpg') else 'text/html'}
requests.Response.html = MockResponse.html
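How the mock machinery above is used by the tests in this commit: under @use_local_response, _download() returns a MockResponse that reads a fixture from test_data/, keyed by get_mock_file(url). A small illustration (the second URL is made up only to show the key masking; not part of the diff):

from catalog.common import get_mock_file

assert get_mock_file('https://book.douban.com/subject/35902899/') == \
    'https___book_douban_com_subject_35902899_'
# api keys are rewritten so recorded fixtures never embed real credentials
assert get_mock_file('http://api.scraperapi.com/?api_key=SECRET123&url=x') == \
    'http___api_scraperapi_com__api_key_19890604_url_x'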

201
catalog/common/jsondata.py Normal file

@ -0,0 +1,201 @@
import copy
from datetime import date, datetime
from importlib import import_module
import django
from django.conf import settings
from django.core.exceptions import FieldError
from django.db.models import fields
from django.utils import dateparse, timezone
from functools import partialmethod
from django.db.models import JSONField
__all__ = ('BooleanField', 'CharField', 'DateField', 'DateTimeField', 'DecimalField', 'EmailField', 'FloatField', 'IntegerField', 'IPAddressField', 'GenericIPAddressField', 'NullBooleanField', 'TextField', 'TimeField', 'URLField', 'ArrayField')
class JSONFieldDescriptor(object):
def __init__(self, field):
self.field = field
def __get__(self, instance, cls=None):
if instance is None:
return self
json_value = getattr(instance, self.field.json_field_name)
if isinstance(json_value, dict):
if self.field.attname in json_value or not self.field.has_default():
value = json_value.get(self.field.attname, None)
if hasattr(self.field, 'from_json'):
value = self.field.from_json(value)
return value
else:
default = self.field.get_default()
if hasattr(self.field, 'to_json'):
json_value[self.field.attname] = self.field.to_json(default)
else:
json_value[self.field.attname] = default
return default
return None
def __set__(self, instance, value):
json_value = getattr(instance, self.field.json_field_name)
if json_value:
assert isinstance(json_value, dict)
else:
json_value = {}
if hasattr(self.field, 'to_json'):
value = self.field.to_json(value)
if not value and self.field.blank and not self.field.null:
try:
del json_value[self.field.attname]
except KeyError:
pass
else:
json_value[self.field.attname] = value
setattr(instance, self.field.json_field_name, json_value)
class JSONFieldMixin(object):
"""
Override django.db.model.fields.Field.contribute_to_class
to make a field always private, and register custom access descriptor
"""
def __init__(self, *args, **kwargs):
self.json_field_name = kwargs.pop('json_field_name', 'metadata')
super(JSONFieldMixin, self).__init__(*args, **kwargs)
def contribute_to_class(self, cls, name, private_only=False):
self.set_attributes_from_name(name)
self.model = cls
self.concrete = False
self.column = self.json_field_name
cls._meta.add_field(self, private=True)
if not getattr(cls, self.attname, None):
descriptor = JSONFieldDescriptor(self)
setattr(cls, self.attname, descriptor)
if self.choices is not None:
setattr(cls, 'get_%s_display' % self.name,
partialmethod(cls._get_FIELD_display, field=self))
def get_lookup(self, lookup_name):
# Always return None, so that get_transform gets called
return None
def get_transform(self, name):
class TransformFactoryWrapper:
def __init__(self, json_field, transform, original_lookup):
self.json_field = json_field
self.transform = transform
self.original_lookup = original_lookup
def __call__(self, lhs, **kwargs):
lhs = copy.copy(lhs)
lhs.target = self.json_field
lhs.output_field = self.json_field
transform = self.transform(lhs, **kwargs)
transform._original_get_lookup = transform.get_lookup
transform.get_lookup = lambda name: transform._original_get_lookup(self.original_lookup)
return transform
json_field = self.model._meta.get_field(self.json_field_name)
transform = json_field.get_transform(self.name)
if transform is None:
raise FieldError(
"JSONField '%s' has no support for key '%s' %s lookup" %
(self.json_field_name, self.name, name)
)
return TransformFactoryWrapper(json_field, transform, name)
class BooleanField(JSONFieldMixin, fields.BooleanField):
def __init__(self, *args, **kwargs):
super(BooleanField, self).__init__(*args, **kwargs)
if django.VERSION < (2, ):
self.blank = False
class CharField(JSONFieldMixin, fields.CharField):
pass
class DateField(JSONFieldMixin, fields.DateField):
def to_json(self, value):
if value:
assert isinstance(value, (datetime, date))
return value.strftime('%Y-%m-%d')
def from_json(self, value):
if value is not None:
return dateparse.parse_date(value)
class DateTimeField(JSONFieldMixin, fields.DateTimeField):
def to_json(self, value):
if value:
if not timezone.is_aware(value):
value = timezone.make_aware(value)
return value.isoformat()
def from_json(self, value):
if value:
return dateparse.parse_datetime(value)
class DecimalField(JSONFieldMixin, fields.DecimalField):
pass
class EmailField(JSONFieldMixin, fields.EmailField):
pass
class FloatField(JSONFieldMixin, fields.FloatField):
pass
class IntegerField(JSONFieldMixin, fields.IntegerField):
pass
class IPAddressField(JSONFieldMixin, fields.IPAddressField):
pass
class GenericIPAddressField(JSONFieldMixin, fields.GenericIPAddressField):
pass
class NullBooleanField(JSONFieldMixin, fields.NullBooleanField):
pass
class TextField(JSONFieldMixin, fields.TextField):
pass
class TimeField(JSONFieldMixin, fields.TimeField):
def to_json(self, value):
if value:
if not timezone.is_aware(value):
value = timezone.make_aware(value)
return value.isoformat()
def from_json(self, value):
if value:
return dateparse.parse_time(value)
class URLField(JSONFieldMixin, fields.URLField):
pass
class ArrayField(JSONFieldMixin, JSONField):
pass
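Illustration of what the descriptors above do at runtime: jsondata fields are not real columns, each value is read from and written into the model's metadata JSONField (json_field_name defaults to 'metadata', which Item defines in catalog/common/models.py below). A sketch using Edition.pages from catalog/book/models.py earlier in this commit:

from catalog.book.models import Edition

e = Edition(title="Hyperion")          # unsaved instance is enough
e.pages = 500                          # JSONFieldDescriptor.__set__
assert e.metadata['pages'] == 500      # the value lives in the JSON column
assert e.pages == 500                  # __get__ reads it back from the same dict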

268
catalog/common/models.py Normal file

@ -0,0 +1,268 @@
from polymorphic.models import PolymorphicModel
from django.db import models
from catalog.common import jsondata
from django.utils.translation import gettext_lazy as _
from django.utils import timezone
from django.core.files.uploadedfile import SimpleUploadedFile
from django.contrib.contenttypes.models import ContentType
import uuid
from .utils import DEFAULT_ITEM_COVER, item_cover_path
# from django.conf import settings
class IdType(models.TextChoices):
WikiData = 'wikidata', _('维基数据')
ISBN10 = 'isbn10', _('ISBN10')
ISBN = 'isbn', _('ISBN') # ISBN 13
ASIN = 'asin', _('ASIN')
ISSN = 'issn', _('ISSN')
CUBN = 'cubn', _('统一书号')
ISRC = 'isrc', _('ISRC') # only for songs
GTIN = 'gtin', _('GTIN UPC EAN码') # ISBN is separate
Feed = 'feed', _('Feed URL')
IMDB = 'imdb', _('IMDb')
TMDB_TV = 'tmdb_tv', _('TMDB剧集')
TMDB_TVSeason = 'tmdb_tvseason', _('TMDB剧集')
TMDB_TVEpisode = 'tmdb_tvepisode', _('TMDB剧集')
TMDB_Movie = 'tmdb_movie', _('TMDB电影')
Goodreads = 'goodreads', _('Goodreads')
Goodreads_Work = 'goodreads_work', _('Goodreads著作')
GoogleBooks = 'googlebooks', _('谷歌图书')
DoubanBook = 'doubanbook', _('豆瓣读书')
DoubanBook_Work = 'doubanbook_work', _('豆瓣读书著作')
DoubanMovie = 'doubanmovie', _('豆瓣电影')
DoubanMusic = 'doubanmusic', _('豆瓣音乐')
DoubanGame = 'doubangame', _('豆瓣游戏')
DoubanDrama = 'doubandrama', _('豆瓣舞台剧')
Bandcamp = 'bandcamp', _('Bandcamp')
Spotify_Album = 'spotify_album', _('Spotify专辑')
Spotify_Show = 'spotify_show', _('Spotify播客')
Discogs_Release = 'discogs_release', ('Discogs Release')
Discogs_Master = 'discogs_master', ('Discogs Master')
MusicBrainz = 'musicbrainz', ('MusicBrainz ID')
DoubanBook_Author = 'doubanbook_author', _('豆瓣读书作者')
DoubanCelebrity = 'doubanmovie_celebrity', _('豆瓣电影影人')
Goodreads_Author = 'goodreads_author', _('Goodreads作者')
Spotify_Artist = 'spotify_artist', _('Spotify艺术家')
TMDB_Person = 'tmdb_person', _('TMDB影人')
IGDB = 'igdb', _('IGDB游戏')
Steam = 'steam', _('Steam游戏')
Bangumi = 'bangumi', _('Bangumi')
ApplePodcast = 'apple_podcast', _('苹果播客')
class ItemType(models.TextChoices):
Book = 'book', _('书')
TV = 'tv', _('剧集')
TVSeason = 'tvseason', _('剧集分季')
TVEpisode = 'tvepisode', _('剧集分集')
Movie = 'movie', _('电影')
Music = 'music', _('音乐')
Game = 'game', _('游戏')
Boardgame = 'boardgame', _('桌游')
Podcast = 'podcast', _('播客')
FanFic = 'fanfic', _('网文')
Performance = 'performance', _('演出')
Exhibition = 'exhibition', _('展览')
class SubItemType(models.TextChoices):
Season = 'season', _('剧集分季')
Episode = 'episode', _('剧集分集')
Version = 'version', _('版本')
# class CreditType(models.TextChoices):
# Author = 'author', _('作者')
# Translater = 'translater', _('译者')
# Producer = 'producer', _('出品人')
# Director = 'director', _('电影')
# Actor = 'actor', _('演员')
# Playwright = 'playwright', _('播客')
# VoiceActor = 'voiceactor', _('配音')
# Host = 'host', _('主持人')
# Developer = 'developer', _('开发者')
# Publisher = 'publisher', _('出版方')
class PrimaryLookupIdDescriptor(object): # TODO make it mixin of Field
def __init__(self, id_type):
self.id_type = id_type
def __get__(self, instance, cls=None):
if instance is None:
return self
if self.id_type != instance.primary_lookup_id_type:
return None
return instance.primary_lookup_id_value
def __set__(self, instance, id_value):
if id_value:
instance.primary_lookup_id_type = self.id_type
instance.primary_lookup_id_value = id_value
else:
instance.primary_lookup_id_type = None
instance.primary_lookup_id_value = None
class LookupIdDescriptor(object): # TODO make it mixin of Field
def __init__(self, id_type):
self.id_type = id_type
def __get__(self, instance, cls=None):
if instance is None:
return self
return instance.get_lookup_id(self.id_type)
def __set__(self, instance, value):
instance.set_lookup_id(self.id_type, value)
# class ItemId(models.Model):
# item = models.ForeignKey('Item', models.CASCADE)
# id_type = models.CharField(_("源网站"), blank=False, choices=IdType.choices, max_length=50)
# id_value = models.CharField(_("源网站ID"), blank=False, max_length=1000)
# class ItemCredit(models.Model):
# item = models.ForeignKey('Item', models.CASCADE)
# credit_type = models.CharField(_("类型"), choices=CreditType.choices, blank=False, max_length=50)
# name = models.CharField(_("名字"), blank=False, max_length=1000)
# def check_source_id(sid):
# if not sid:
# return True
# s = sid.split(':')
# if len(s) < 2:
# return False
# return sid[0] in IdType.values()
class Item(PolymorphicModel):
uid = models.UUIDField(default=uuid.uuid4, editable=False)
# item_type = models.CharField(_("类型"), choices=ItemType.choices, blank=False, max_length=50)
title = models.CharField(_("title in primary language"), max_length=1000, default="")
# title_ml = models.JSONField(_("title in different languages {['lang':'zh-cn', 'text':'', primary:True], ...}"), null=True, blank=True, default=list)
brief = models.TextField(_("简介"), blank=True, default="")
# brief_ml = models.JSONField(_("brief in different languages {['lang':'zh-cn', 'text':'', primary:True], ...}"), null=True, blank=True, default=list)
genres = models.JSONField(_("分类"), null=True, blank=True, default=list)
primary_lookup_id_type = models.CharField(_("isbn/cubn/imdb"), blank=False, null=True, max_length=50)
primary_lookup_id_value = models.CharField(_("1234/tt789"), blank=False, null=True, max_length=1000)
metadata = models.JSONField(_("其他信息"), blank=True, null=True, default=dict)
cover = models.ImageField(upload_to=item_cover_path, default=DEFAULT_ITEM_COVER, blank=True)
created_time = models.DateTimeField(auto_now_add=True)
edited_time = models.DateTimeField(auto_now=True)
# parent_item = models.ForeignKey('Item', null=True, on_delete=models.SET_NULL, related_name='child_items')
# identical_item = models.ForeignKey('Item', null=True, on_delete=models.SET_NULL, related_name='identical_items')
# def get_lookup_id(self, id_type: str) -> str:
# prefix = id_type.strip().lower() + ':'
# return next((x[len(prefix):] for x in self.lookup_ids if x.startswith(prefix)), None)
class Meta:
unique_together = [['polymorphic_ctype_id', 'primary_lookup_id_type', 'primary_lookup_id_value']]
def __str__(self):
return f"{self.id}{' ' + self.primary_lookup_id_type + ':' + self.primary_lookup_id_value if self.primary_lookup_id_value else ''} ({self.title})"
@classmethod
def get_best_lookup_id(cls, lookup_ids):
""" get best available lookup id, ideally commonly used """
best_id_types = [
IdType.ISBN, IdType.CUBN, IdType.ASIN,
IdType.GTIN, IdType.ISRC, IdType.MusicBrainz,
IdType.Feed,
IdType.IMDB, IdType.TMDB_TVSeason
]
for t in best_id_types:
if lookup_ids.get(t):
return t, lookup_ids[t]
return list(lookup_ids.items())[0]
def update_lookup_ids(self, lookup_ids):
# TODO
# ll = set(lookup_ids)
# ll = list(filter(lambda a, b: b, ll))
# print(ll)
pass
METADATA_COPY_LIST = ['title', 'brief'] # list of metadata keys to copy from resource to item
@classmethod
def copy_metadata(cls, metadata):
return dict((k, v) for k, v in metadata.items() if k in cls.METADATA_COPY_LIST and v is not None)
def merge_data_from_external_resources(self):
"""Subclass may override this"""
lookup_ids = []
for p in self.external_resources.all():
lookup_ids.append((p.id_type, p.id_value))
lookup_ids += p.other_lookup_ids.items()
for k in self.METADATA_COPY_LIST:
if not getattr(self, k) and p.metadata.get(k):
setattr(self, k, p.metadata.get(k))
if not self.cover and p.cover:
self.cover = p.cover
self.update_lookup_ids(lookup_ids)
def update_linked_items_from_external_resource(self, resource):
"""Subclass should override this"""
pass
class ItemLookupId(models.Model):
item = models.ForeignKey(Item, null=True, on_delete=models.SET_NULL, related_name='lookup_ids')
id_type = models.CharField(_("源网站"), blank=True, choices=IdType.choices, max_length=50)
id_value = models.CharField(_("源网站ID"), blank=True, max_length=1000)
raw_url = models.CharField(_("源网站ID"), blank=True, max_length=1000, unique=True)
class Meta:
unique_together = [['id_type', 'id_value']]
class ExternalResource(models.Model):
item = models.ForeignKey(Item, null=True, on_delete=models.SET_NULL, related_name='external_resources')
id_type = models.CharField(_("IdType of the source site"), blank=False, choices=IdType.choices, max_length=50)
id_value = models.CharField(_("Primary Id on the source site"), blank=False, max_length=1000)
url = models.CharField(_("url to the resource"), blank=False, max_length=1000, unique=True)
cover = models.ImageField(upload_to=item_cover_path, default=DEFAULT_ITEM_COVER, blank=True)
other_lookup_ids = models.JSONField(default=dict)
metadata = models.JSONField(default=dict)
scraped_time = models.DateTimeField(null=True)
created_time = models.DateTimeField(auto_now_add=True)
edited_time = models.DateTimeField(auto_now=True)
required_resources = jsondata.ArrayField(null=False, blank=False, default=list)
related_resources = jsondata.ArrayField(null=False, blank=False, default=list)
class Meta:
unique_together = [['id_type', 'id_value']]
def __str__(self):
return f"{self.id}{':' + self.id_type + ':' + self.id_value if self.id_value else ''} ({self.url})"
def update_content(self, resource_content):
self.other_lookup_ids = resource_content.lookup_ids
self.metadata = resource_content.metadata
if resource_content.cover_image and resource_content.cover_image_extention:
self.cover = SimpleUploadedFile('temp.' + resource_content.cover_image_extention, resource_content.cover_image)
self.scraped_time = timezone.now()
self.save()
@property
def ready(self):
return bool(self.metadata and self.scraped_time)
def get_all_lookup_ids(self):
d = self.other_lookup_ids.copy()
d[self.id_type] = self.id_value
d = {k: v for k, v in d.items() if bool(v)}
return d
def get_preferred_model(self):
model = self.metadata.get('preferred_model')
if model:
m = ContentType.objects.filter(app_label='catalog', model=model.lower()).first()
if m:
return m.model_class()
else:
raise ValueError(f'preferred model {model} does not exist')
return None
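How PrimaryLookupIdDescriptor behaves in practice, mirroring BookTestCase.test_lookupids earlier in this commit (a sketch, not new functionality):

from catalog.book.models import Edition
from catalog.common import IdType

e = Edition(title="Hyperion")
e.isbn = '9780553283686'
assert (e.primary_lookup_id_type, e.primary_lookup_id_value) == (IdType.ISBN, '9780553283686')
e.asin = 'B004G60EHS'                  # there is only one primary slot, so this replaces the ISBN
assert e.isbn is None                  # the ISBN accessor returns None once the type differs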

View file

@ -0,0 +1,4 @@
class ParseError(Exception):
def __init__(self, scraper, field):
msg = f'{type(scraper).__name__}: Error parsing field "{field}" for url {scraper.url}'
super().__init__(msg)

155
catalog/common/sites.py Normal file

@ -0,0 +1,155 @@
"""
Site and SiteList
Site should inherit from AbstractSite
a Site should map to a unique set of url patterns.
a Site may scrape a url and store the result in a ResourceContent
ResourceContent persists as an ExternalResource which may link to an Item
"""
from typing import *
import re
from .models import ExternalResource
from dataclasses import dataclass, field
import logging
_logger = logging.getLogger(__name__)
@dataclass
class ResourceContent:
lookup_ids: dict = field(default_factory=dict)
metadata: dict = field(default_factory=dict)
cover_image: bytes = None
cover_image_extention: str = None
class AbstractSite:
"""
Abstract class to represent a site
"""
ID_TYPE = None
WIKI_PROPERTY_ID = 'P0undefined0'
DEFAULT_MODEL = None
URL_PATTERNS = [r"\w+://undefined/(\d+)"]
@classmethod
def validate_url(self, url: str):
u = next(iter([re.match(p, url) for p in self.URL_PATTERNS if re.match(p, url)]), None)
return u is not None
@classmethod
def id_to_url(self, id_value):
return 'https://undefined/' + id_value
@classmethod
def url_to_id(self, url: str):
u = next(iter([re.match(p, url) for p in self.URL_PATTERNS if re.match(p, url)]), None)
return u[1] if u else None
def __str__(self):
return f'<{self.__class__.__name__}: {self.url}>'
def __init__(self, url=None):
self.id_value = self.url_to_id(url) if url else None
self.url = self.id_to_url(self.id_value) if url else None
self.resource = None
def get_resource(self):
if not self.resource:
self.resource = ExternalResource.objects.filter(url=self.url).first()
if self.resource is None:
self.resource = ExternalResource(id_type=self.ID_TYPE, id_value=self.id_value, url=self.url)
return self.resource
def bypass_scrape(self, data_from_link) -> ResourceContent | None:
"""subclass may implement this to use data from linked resource and bypass actual scrape"""
return None
def scrape(self) -> ResourceContent:
"""subclass should implement this, return ResourceContent object"""
data = ResourceContent()
return data
def get_item(self):
p = self.get_resource()
if not p:
raise ValueError(f'resource not available for {self.url}')
model = p.get_preferred_model()
if not model:
model = self.DEFAULT_MODEL
t, v = model.get_best_lookup_id(p.get_all_lookup_ids())
if t is not None:
p.item = model.objects.filter(primary_lookup_id_type=t, primary_lookup_id_value=v).first()
if p.item is None:
obj = model.copy_metadata(p.metadata)
obj['primary_lookup_id_type'] = t
obj['primary_lookup_id_value'] = v
p.item = model.objects.create(**obj)
return p.item
@property
def ready(self):
return bool(self.resource and self.resource.ready)
def get_resource_ready(self, auto_save=True, auto_create=True, auto_link=True, data_from_link=None):
"""return a resource scraped, or scrape if not yet"""
if auto_link:
auto_create = True
if auto_create:
auto_save = True
p = self.get_resource()
resource_content = {}
if not self.resource:
return None
if not p.ready:
resource_content = self.bypass_scrape(data_from_link)
if not resource_content:
resource_content = self.scrape()
p.update_content(resource_content)
if not p.ready:
_logger.error(f'unable to get resource {self.url} ready')
return None
if auto_create and p.item is None:
self.get_item()
if auto_save:
p.save()
if p.item:
p.item.merge_data_from_external_resources()
p.item.save()
if auto_link:
for linked_resources in p.required_resources:
linked_site = SiteList.get_site_by_url(linked_resources['url'])
if linked_site:
linked_site.get_resource_ready(auto_link=False)
else:
_logger.error(f'unable to get site for {linked_resources["url"]}')
p.item.update_linked_items_from_external_resource(p)
p.item.save()
return p
class SiteList:
registry = {}
@classmethod
def register(cls, target) -> Callable:
id_type = target.ID_TYPE
if id_type in cls.registry:
raise ValueError(f'Site for {id_type} already exists')
cls.registry[id_type] = target
return target
@classmethod
def get_site_by_id_type(cls, typ: str):
return cls.registry[typ]() if typ in cls.registry else None
@classmethod
def get_site_by_url(cls, url: str):
cls = next(filter(lambda p: p.validate_url(url), cls.registry.values()), None)
return cls(url) if cls else None
@classmethod
def get_id_by_url(cls, url: str):
site = cls.get_site_by_url(url)
return site.url_to_id(url) if site else None
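The typical call chain through the registry above, as exercised by every tests.py in this commit (sketch only; the URL is one of the fixtures used in catalog/podcast/tests.py):

from catalog.common import SiteList
import catalog.sites                    # importing the package registers all concrete sites

site = SiteList.get_site_by_url('https://podcasts.apple.com/us/podcast/id657765158')
if site:                                # None when no registered site matches the URL
    resource = site.get_resource_ready()         # scrape, or reuse an existing ExternalResource
    item = resource.item if resource else None   # the Item the resource was matched or merged into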

14
catalog/common/utils.py Normal file

@ -0,0 +1,14 @@
import logging
from django.utils import timezone
import uuid
_logger = logging.getLogger(__name__)
DEFAULT_ITEM_COVER = 'item/default.svg'
def item_cover_path(resource, filename):
fn = timezone.now().strftime('%Y/%m/%d/') + str(uuid.uuid4()) + '.' + filename.split('.')[-1]
return 'items/' + resource.id_type + '/' + fn
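Shape of the upload path produced by item_cover_path (illustrative only; Django passes the owning model instance as the first argument, and a stand-in object with an id_type attribute is enough here):

import re
from types import SimpleNamespace
from catalog.common.utils import item_cover_path

path = item_cover_path(SimpleNamespace(id_type='goodreads'), 'temp.jpg')
assert re.fullmatch(r'items/goodreads/\d{4}/\d{2}/\d{2}/[0-9a-f-]{36}\.jpg', path)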

8
catalog/game/models.py Normal file

@ -0,0 +1,8 @@
from catalog.common import *
class Game(Item):
igdb = PrimaryLookupIdDescriptor(IdType.IGDB)
steam = PrimaryLookupIdDescriptor(IdType.Steam)
douban_game = PrimaryLookupIdDescriptor(IdType.DoubanGame)
platforms = jsondata.ArrayField(default=list)

117
catalog/game/tests.py Normal file

@ -0,0 +1,117 @@
from django.test import TestCase
from catalog.common import *
from catalog.models import *
class IGDBTestCase(TestCase):
def test_parse(self):
t_id_type = IdType.IGDB
t_id_value = 'portal-2'
t_url = 'https://www.igdb.com/games/portal-2'
site = SiteList.get_site_by_id_type(t_id_type)
self.assertIsNotNone(site)
self.assertEqual(site.validate_url(t_url), True)
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.url, t_url)
self.assertEqual(site.id_value, t_id_value)
@use_local_response
def test_scrape(self):
t_url = 'https://www.igdb.com/games/portal-2'
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata['title'], 'Portal 2')
self.assertIsInstance(site.resource.item, Game)
self.assertEqual(site.resource.item.steam, '620')
@use_local_response
def test_scrape_non_steam(self):
t_url = 'https://www.igdb.com/games/the-legend-of-zelda-breath-of-the-wild'
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata['title'], 'The Legend of Zelda: Breath of the Wild')
self.assertIsInstance(site.resource.item, Game)
self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IGDB)
self.assertEqual(site.resource.item.primary_lookup_id_value, 'the-legend-of-zelda-breath-of-the-wild')
class SteamTestCase(TestCase):
def test_parse(self):
t_id_type = IdType.Steam
t_id_value = '620'
t_url = 'https://store.steampowered.com/app/620/Portal_2/'
t_url2 = 'https://store.steampowered.com/app/620'
site = SiteList.get_site_by_id_type(t_id_type)
self.assertIsNotNone(site)
self.assertEqual(site.validate_url(t_url), True)
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.url, t_url2)
self.assertEqual(site.id_value, t_id_value)
@use_local_response
def test_scrape(self):
t_url = 'https://store.steampowered.com/app/620/Portal_2/'
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata['title'], 'Portal 2')
self.assertEqual(site.resource.metadata['brief'], '“终身测试计划”现已升级,您可以为您自己或您的好友设计合作谜题!')
self.assertIsInstance(site.resource.item, Game)
self.assertEqual(site.resource.item.steam, '620')
class DoubanGameTestCase(TestCase):
def test_parse(self):
t_id_type = IdType.DoubanGame
t_id_value = '10734307'
t_url = 'https://www.douban.com/game/10734307/'
site = SiteList.get_site_by_id_type(t_id_type)
self.assertIsNotNone(site)
self.assertEqual(site.validate_url(t_url), True)
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.url, t_url)
self.assertEqual(site.id_value, t_id_value)
@use_local_response
def test_scrape(self):
t_url = 'https://www.douban.com/game/10734307/'
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata['title'], '传送门2 Portal 2')
self.assertIsInstance(site.resource.item, Game)
self.assertEqual(site.resource.item.douban_game, '10734307')
class BangumiGameTestCase(TestCase):
def test_parse(self):
t_id_type = IdType.Bangumi
t_id_value = '15912'
t_url = 'https://bgm.tv/subject/15912'
site = SiteList.get_site_by_id_type(t_id_type)
self.assertIsNotNone(site)
self.assertEqual(site.validate_url(t_url), True)
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.url, t_url)
self.assertEqual(site.id_value, t_id_value)
@use_local_response
def test_scrape(self):
# TODO
pass
class MultiGameSitesTestCase(TestCase):
@use_local_response
def test_games(self):
url1 = 'https://www.igdb.com/games/portal-2'
url2 = 'https://store.steampowered.com/app/620/Portal_2/'
p1 = SiteList.get_site_by_url(url1).get_resource_ready()
p2 = SiteList.get_site_by_url(url2).get_resource_ready()
self.assertEqual(p1.item.id, p2.item.id)

View file

@ -0,0 +1,22 @@
from django.core.management.base import BaseCommand
import pprint
from catalog.common import SiteList
from catalog.sites import *
class Command(BaseCommand):
help = 'Scrape a catalog item from an external resource (without saving it)'
def add_arguments(self, parser):
parser.add_argument('url', type=str, help='URL to scrape')
def handle(self, *args, **options):
url = str(options['url'])
site = SiteList.get_site_by_url(url)
if site is None:
self.stdout.write(self.style.ERROR(f'Unknown site for {url}'))
return
self.stdout.write(f'Fetching from {site}')
resource = site.get_resource_ready(auto_link=False, auto_save=False)
self.stdout.write(self.style.SUCCESS(f'Done.'))
pprint.pp(resource.metadata)

25
catalog/models.py Normal file

@ -0,0 +1,25 @@
from .book.models import Edition, Work, Series
from .movie.models import Movie
from .tv.models import TVShow, TVSeason, TVEpisode
from .music.models import Album
from .game.models import Game
from .podcast.models import Podcast
from .performance.models import Performance
# class Exhibition(Item):
# class Meta:
# proxy = True
# class Fanfic(Item):
# class Meta:
# proxy = True
# class Boardgame(Item):
# class Meta:
# proxy = True

8
catalog/movie/models.py Normal file

@ -0,0 +1,8 @@
from catalog.common import *
class Movie(Item):
imdb = PrimaryLookupIdDescriptor(IdType.IMDB)
tmdb_movie = PrimaryLookupIdDescriptor(IdType.TMDB_Movie)
douban_movie = PrimaryLookupIdDescriptor(IdType.DoubanMovie)
duration = jsondata.IntegerField(blank=True, default=None)

90
catalog/movie/tests.py Normal file

@ -0,0 +1,90 @@
from django.test import TestCase
from catalog.common import *
class DoubanMovieTestCase(TestCase):
def test_parse(self):
t_id = '3541415'
t_url = 'https://movie.douban.com/subject/3541415/'
p1 = SiteList.get_site_by_id_type(IdType.DoubanMovie)
self.assertIsNotNone(p1)
self.assertEqual(p1.validate_url(t_url), True)
p2 = SiteList.get_site_by_url(t_url)
self.assertEqual(p1.id_to_url(t_id), t_url)
self.assertEqual(p2.url_to_id(t_url), t_id)
@use_local_response
def test_scrape(self):
t_url = 'https://movie.douban.com/subject/3541415/'
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.id_value, '3541415')
site.get_resource_ready()
self.assertEqual(site.resource.metadata['title'], '盗梦空间')
self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.resource.item.__class__.__name__, 'Movie')
self.assertEqual(site.resource.item.imdb, 'tt1375666')
class TMDBMovieTestCase(TestCase):
def test_parse(self):
t_id = '293767'
t_url = 'https://www.themoviedb.org/movie/293767-billy-lynn-s-long-halftime-walk'
t_url2 = 'https://www.themoviedb.org/movie/293767'
p1 = SiteList.get_site_by_id_type(IdType.TMDB_Movie)
self.assertIsNotNone(p1)
self.assertEqual(p1.validate_url(t_url), True)
self.assertEqual(p1.validate_url(t_url2), True)
p2 = SiteList.get_site_by_url(t_url)
self.assertEqual(p1.id_to_url(t_id), t_url2)
self.assertEqual(p2.url_to_id(t_url), t_id)
@use_local_response
def test_scrape(self):
t_url = 'https://www.themoviedb.org/movie/293767'
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.id_value, '293767')
site.get_resource_ready()
self.assertEqual(site.resource.metadata['title'], '比利·林恩的中场战事')
self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.resource.item.__class__.__name__, 'Movie')
self.assertEqual(site.resource.item.imdb, 'tt2513074')
class IMDBMovieTestCase(TestCase):
def test_parse(self):
t_id = 'tt1375666'
t_url = 'https://www.imdb.com/title/tt1375666/'
t_url2 = 'https://www.imdb.com/title/tt1375666/'
p1 = SiteList.get_site_by_id_type(IdType.IMDB)
self.assertIsNotNone(p1)
self.assertEqual(p1.validate_url(t_url), True)
self.assertEqual(p1.validate_url(t_url2), True)
p2 = SiteList.get_site_by_url(t_url)
self.assertEqual(p1.id_to_url(t_id), t_url2)
self.assertEqual(p2.url_to_id(t_url), t_id)
@use_local_response
def test_scrape(self):
t_url = 'https://www.imdb.com/title/tt1375666/'
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.id_value, 'tt1375666')
site.get_resource_ready()
self.assertEqual(site.resource.metadata['title'], '盗梦空间')
self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.resource.item.imdb, 'tt1375666')
class MultiMovieSitesTestCase(TestCase):
@use_local_response
def test_movies(self):
url1 = 'https://www.themoviedb.org/movie/27205-inception'
url2 = 'https://movie.douban.com/subject/3541415/'
url3 = 'https://www.imdb.com/title/tt1375666/'
p1 = SiteList.get_site_by_url(url1).get_resource_ready()
p2 = SiteList.get_site_by_url(url2).get_resource_ready()
p3 = SiteList.get_site_by_url(url3).get_resource_ready()
self.assertEqual(p1.item.id, p2.item.id)
self.assertEqual(p2.item.id, p3.item.id)

10
catalog/music/models.py Normal file

@ -0,0 +1,10 @@
from catalog.common import *
class Album(Item):
barcode = PrimaryLookupIdDescriptor(IdType.GTIN)
douban_music = PrimaryLookupIdDescriptor(IdType.DoubanMusic)
spotify_album = PrimaryLookupIdDescriptor(IdType.Spotify_Album)
class Meta:
proxy = True

61
catalog/music/tests.py Normal file

@ -0,0 +1,61 @@
from django.test import TestCase
from catalog.common import *
from catalog.models import *
class SpotifyTestCase(TestCase):
def test_parse(self):
t_id_type = IdType.Spotify_Album
t_id_value = '65KwtzkJXw7oT819NFWmEP'
t_url = 'https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP'
site = SiteList.get_site_by_id_type(t_id_type)
self.assertIsNotNone(site)
self.assertEqual(site.validate_url(t_url), True)
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.url, t_url)
self.assertEqual(site.id_value, t_id_value)
@use_local_response
def test_scrape(self):
t_url = 'https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP'
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata['title'], 'The Race For Space')
self.assertIsInstance(site.resource.item, Album)
self.assertEqual(site.resource.item.barcode, '3610159662676')
class DoubanMusicTestCase(TestCase):
def test_parse(self):
t_id_type = IdType.DoubanMusic
t_id_value = '33551231'
t_url = 'https://music.douban.com/subject/33551231/'
site = SiteList.get_site_by_id_type(t_id_type)
self.assertIsNotNone(site)
self.assertEqual(site.validate_url(t_url), True)
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.url, t_url)
self.assertEqual(site.id_value, t_id_value)
@use_local_response
def test_scrape(self):
t_url = 'https://music.douban.com/subject/33551231/'
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata['title'], 'The Race For Space')
self.assertIsInstance(site.resource.item, Album)
self.assertEqual(site.resource.item.barcode, '3610159662676')
class MultiMusicSitesTestCase(TestCase):
@use_local_response
def test_albums(self):
url1 = 'https://music.douban.com/subject/33551231/'
url2 = 'https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP'
p1 = SiteList.get_site_by_url(url1).get_resource_ready()
p2 = SiteList.get_site_by_url(url2).get_resource_ready()
self.assertEqual(p1.item.id, p2.item.id)

View file

@ -0,0 +1,13 @@
from catalog.common import *
from django.utils.translation import gettext_lazy as _
class Performance(Item):
douban_drama = LookupIdDescriptor(IdType.DoubanDrama)
versions = jsondata.ArrayField(_('版本'), null=False, blank=False, default=list)
directors = jsondata.ArrayField(_('导演'), null=False, blank=False, default=list)
playwrights = jsondata.ArrayField(_('编剧'), null=False, blank=False, default=list)
actors = jsondata.ArrayField(_('主演'), null=False, blank=False, default=list)
class Meta:
proxy = True

View file

@ -0,0 +1,37 @@
from django.test import TestCase
from catalog.common import *
class DoubanDramaTestCase(TestCase):
def setUp(self):
pass
def test_parse(self):
t_id = '24849279'
t_url = 'https://www.douban.com/location/drama/24849279/'
p1 = SiteList.get_site_by_id_type(IdType.DoubanDrama)
self.assertIsNotNone(p1)
p1 = SiteList.get_site_by_url(t_url)
self.assertIsNotNone(p1)
self.assertEqual(p1.validate_url(t_url), True)
self.assertEqual(p1.id_to_url(t_id), t_url)
self.assertEqual(p1.url_to_id(t_url), t_id)
@use_local_response
def test_scrape(self):
t_url = 'https://www.douban.com/location/drama/24849279/'
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
resource = site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(resource.metadata['title'], '红花侠')
item = site.get_item()
self.assertEqual(item.title, '红花侠')
# self.assertEqual(i.other_titles, ['スカーレットピンパーネル', 'THE SCARLET PIMPERNEL'])
# self.assertEqual(len(i.brief), 545)
# self.assertEqual(i.genres, ['音乐剧'])
# self.assertEqual(i.versions, ['08星组公演版', '10年月組公演版', '17年星組公演版', 'ュージカル2017年版'])
# self.assertEqual(i.directors, ['小池修一郎', '小池 修一郎', '石丸さち子'])
# self.assertEqual(i.playwrights, ['小池修一郎', 'Baroness Orczy原作', '小池 修一郎'])
# self.assertEqual(i.actors, ['安蘭けい', '柚希礼音', '遠野あすか', '霧矢大夢', '龍真咲'])

13
catalog/podcast/models.py Normal file

@ -0,0 +1,13 @@
from catalog.common import *
class Podcast(Item):
feed_url = PrimaryLookupIdDescriptor(IdType.Feed)
apple_podcast = PrimaryLookupIdDescriptor(IdType.ApplePodcast)
# ximalaya = LookupIdDescriptor(IdType.Ximalaya)
# xiaoyuzhou = LookupIdDescriptor(IdType.Xiaoyuzhou)
hosts = jsondata.ArrayField(default=list)
# class PodcastEpisode(Item):
# pass

30
catalog/podcast/tests.py Normal file

@ -0,0 +1,30 @@
from django.test import TestCase
from catalog.podcast.models import *
from catalog.common import *
class ApplePodcastTestCase(TestCase):
def setUp(self):
pass
def test_parse(self):
t_id = '657765158'
t_url = 'https://podcasts.apple.com/us/podcast/%E5%A4%A7%E5%86%85%E5%AF%86%E8%B0%88/id657765158'
t_url2 = 'https://podcasts.apple.com/us/podcast/id657765158'
p1 = SiteList.get_site_by_id_type(IdType.ApplePodcast)
self.assertIsNotNone(p1)
self.assertEqual(p1.validate_url(t_url), True)
p2 = SiteList.get_site_by_url(t_url)
self.assertEqual(p1.id_to_url(t_id), t_url2)
self.assertEqual(p2.url_to_id(t_url), t_id)
@use_local_response
def test_scrape(self):
t_url = 'https://podcasts.apple.com/gb/podcast/the-new-yorker-radio-hour/id1050430296'
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.id_value, '1050430296')
site.get_resource_ready()
self.assertEqual(site.resource.metadata['title'], 'The New Yorker Radio Hour')
# self.assertEqual(site.resource.metadata['feed_url'], 'http://feeds.wnyc.org/newyorkerradiohour')
self.assertEqual(site.resource.metadata['feed_url'], 'http://feeds.feedburner.com/newyorkerradiohour')

15
catalog/sites/__init__.py Normal file

@ -0,0 +1,15 @@
from ..common.sites import SiteList
from .apple_podcast import ApplePodcast
from .douban_book import DoubanBook
from .douban_movie import DoubanMovie
from .douban_music import DoubanMusic
from .douban_game import DoubanGame
from .douban_drama import DoubanDrama
from .goodreads import Goodreads
from .google_books import GoogleBooks
from .tmdb import TMDB_Movie
from .imdb import IMDB
from .spotify import Spotify
from .igdb import IGDB
from .steam import Steam
from .bangumi import Bangumi

View file

@ -0,0 +1,40 @@
from catalog.common import *
from catalog.models import *
import logging
_logger = logging.getLogger(__name__)
@SiteList.register
class ApplePodcast(AbstractSite):
ID_TYPE = IdType.ApplePodcast
URL_PATTERNS = [r"https://[^.]+.apple.com/\w+/podcast/*[^/?]*/id(\d+)"]
WIKI_PROPERTY_ID = 'P5842'
DEFAULT_MODEL = Podcast
@classmethod
def id_to_url(self, id_value):
return "https://podcasts.apple.com/us/podcast/id" + id_value
def scrape(self):
api_url = f'https://itunes.apple.com/lookup?id={self.id_value}'
dl = BasicDownloader(api_url)
resp = dl.download()
r = resp.json()['results'][0]
pd = ResourceContent(metadata={
'title': r['trackName'],
'feed_url': r['feedUrl'],
'hosts': [r['artistName']],
'genres': r['genres'],
'cover_image_url': r['artworkUrl600'],
})
pd.lookup_ids[IdType.Feed] = pd.metadata.get('feed_url')
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
return pd
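A rough usage sketch (assuming a configured Django environment with the catalog app installed), following the same flow as ApplePodcastTestCase above:
from catalog.common import *
import catalog.sites  # importing the package registers ApplePodcast with SiteList

site = SiteList.get_site_by_url('https://podcasts.apple.com/gb/podcast/the-new-yorker-radio-hour/id1050430296')
resource = site.get_resource_ready()  # downloads the iTunes lookup data and runs scrape()
print(resource.metadata['title'], resource.metadata['feed_url'])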

24
catalog/sites/bangumi.py Normal file
View file

@ -0,0 +1,24 @@
from catalog.common import *
from catalog.models import *
import logging
_logger = logging.getLogger(__name__)
@SiteList.register
class Bangumi(AbstractSite):
ID_TYPE = IdType.Bangumi
URL_PATTERNS = [
r"https://bgm\.tv/subject/(\d+)",
]
WIKI_PROPERTY_ID = ''
DEFAULT_MODEL = None
@classmethod
def id_to_url(self, id_value):
return f"https://bgm.tv/subject/{id_value}"
def scrape(self):
# TODO rewrite with bangumi api https://bangumi.github.io/api/
pass

28
catalog/sites/douban.py Normal file
View file

@ -0,0 +1,28 @@
import re
from catalog.common import *
RE_NUMBERS = re.compile(r"\d+\d*")
RE_WHITESPACES = re.compile(r"\s+")
class DoubanDownloader(ProxiedDownloader):
def validate_response(self, response):
if response is None:
return RESPONSE_NETWORK_ERROR
elif response.status_code == 204:
return RESPONSE_CENSORSHIP
elif response.status_code == 200:
content = response.content.decode('utf-8')
if content.find('关于豆瓣') == -1:
# if content.find('你的 IP 发出') == -1:
# error = error + 'Content not authentic' # response is garbage
# else:
# error = error + 'IP banned'
return RESPONSE_NETWORK_ERROR
elif content.find('<title>页面不存在</title>') != -1 or content.find('呃... 你想访问的条目豆瓣不收录。') != -1: # re.search('不存在[^<]+</title>', content, re.MULTILINE):
return RESPONSE_CENSORSHIP
else:
return RESPONSE_OK
else:
return RESPONSE_INVALID_CONTENT

180
catalog/sites/douban_book.py Normal file
View file

@ -0,0 +1,180 @@
from catalog.common import *
from .douban import *
from catalog.book.models import *
from catalog.book.utils import *
import logging
_logger = logging.getLogger(__name__)
class ScraperMixin:
def set_field(self, field, value=None):
self.data[field] = value
def parse_str(self, query):
elem = self.html.xpath(query)
return elem[0].strip() if elem else None
def parse_field(self, field, query, error_when_missing=False):
elem = self.html.xpath(query)
if elem:
self.data[field] = elem[0].strip()
elif error_when_missing:
raise ParseError(self, field)
else:
self.data[field] = None
return elem
@SiteList.register
class DoubanBook(AbstractSite, ScraperMixin):
ID_TYPE = IdType.DoubanBook
URL_PATTERNS = [r"\w+://book\.douban\.com/subject/(\d+)/{0,1}", r"\w+://m.douban.com/book/subject/(\d+)/{0,1}"]
WIKI_PROPERTY_ID = '?'
DEFAULT_MODEL = Edition
@classmethod
def id_to_url(self, id_value):
return "https://book.douban.com/subject/" + id_value + "/"
def scrape(self):
self.data = {}
self.html = DoubanDownloader(self.url).download().html()
self.parse_field('title', "/html/body//h1/span/text()")
self.parse_field('isbn', "//div[@id='info']//span[text()='ISBN:']/following::text()")
# TODO does douban store ASIN as ISBN, need more cleanup if so
if not self.data['title']:
if self.data['isbn']:
self.data['title'] = 'isbn: ' + self.data['isbn']
else:
raise ParseError(self, 'title')
self.parse_field('cover_image_url', "//*[@id='mainpic']/a/img/@src")
self.parse_field('brief', "//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()")
self.parse_field('series', "//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()")
self.parse_field('producer', "//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()")
self.parse_field('cubn', "//div[@id='info']//span[text()='统一书号:']/following::text()")
self.parse_field('subtitle', "//div[@id='info']//span[text()='副标题:']/following::text()")
self.parse_field('orig_title', "//div[@id='info']//span[text()='原作名:']/following::text()")
self.parse_field('language', "//div[@id='info']//span[text()='语言:']/following::text()")
self.parse_field('pub_house', "//div[@id='info']//span[text()='出版社:']/following::text()")
self.parse_field('pub_date', "//div[@id='info']//span[text()='出版年:']/following::text()")
year_month_day = RE_NUMBERS.findall(self.data['pub_date']) if self.data['pub_date'] else []
if len(year_month_day) in (2, 3):
pub_year = int(year_month_day[0])
pub_month = int(year_month_day[1])
elif len(year_month_day) == 1:
pub_year = int(year_month_day[0])
pub_month = None
else:
pub_year = None
pub_month = None
if pub_year and pub_month and pub_year < pub_month:
pub_year, pub_month = pub_month, pub_year
pub_year = pub_year if pub_year is not None and pub_year in range(0, 3000) else None
pub_month = pub_month if pub_month is not None and pub_month in range(1, 13) else None
self.parse_field('binding', "//div[@id='info']//span[text()='装帧:']/following::text()")
self.parse_field('price', "//div[@id='info']//span[text()='定价:']/following::text()")
self.parse_field('pages', "//div[@id='info']//span[text()='页数:']/following::text()")
if self.data['pages'] is not None:
self.data['pages'] = int(RE_NUMBERS.findall(self.data['pages'])[0]) if RE_NUMBERS.findall(self.data['pages']) else None
if self.data['pages'] and (self.data['pages'] > 999999 or self.data['pages'] < 1):
self.data['pages'] = None
contents = None
try:
contents_elem = self.html.xpath(
"//h2/span[text()='目录']/../following-sibling::div[1]")[0]
# if the id of the next sibling contains `dir`, that element holds the full table of contents
if "dir" in contents_elem.getnext().xpath("@id")[0]:
contents_elem = contents_elem.getnext()
contents = '\n'.join(p.strip() for p in contents_elem.xpath("text()")[:-2]) if len(contents_elem) else None
else:
contents = '\n'.join(p.strip() for p in contents_elem.xpath("text()")) if len(contents_elem) else None
except Exception:
pass
self.data['contents'] = contents
# there are two html formats for authors and translators
authors_elem = self.html.xpath("""//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/
preceding-sibling::a[preceding-sibling::span[text()='作者:']]/text()""")
if not authors_elem:
authors_elem = self.html.xpath(
"""//div[@id='info']//span[text()=' 作者']/following-sibling::a/text()""")
if authors_elem:
authors = []
for author in authors_elem:
authors.append(RE_WHITESPACES.sub(' ', author.strip())[:200])
else:
authors = None
self.data['authors'] = authors
translators_elem = self.html.xpath("""//div[@id='info']//span[text()='译者:']/following-sibling::br[1]/
preceding-sibling::a[preceding-sibling::span[text()='译者:']]/text()""")
if not translators_elem:
translators_elem = self.html.xpath(
"""//div[@id='info']//span[text()=' 译者']/following-sibling::a/text()""")
if translators_elem:
translators = []
for translator in translators_elem:
translators.append(RE_WHITESPACES.sub(' ', translator.strip()))
else:
translators = None
self.data['translators'] = translators
work_link = self.parse_str('//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href')
if work_link:
r = re.match(r'\w+://book.douban.com/works/(\d+)', work_link)
self.data['required_resources'] = [{
'model': 'Work',
'id_type': IdType.DoubanBook_Work,
'id_value': r[1] if r else None,
'title': self.data['title'],
'url': work_link,
}]
pd = ResourceContent(metadata=self.data)
pd.lookup_ids[IdType.ISBN] = self.data.get('isbn')
pd.lookup_ids[IdType.CUBN] = self.data.get('cubn')
if self.data["cover_image_url"]:
imgdl = BasicImageDownloader(self.data["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {self.data["cover_image_url"]}')
return pd
@SiteList.register
class DoubanBook_Work(AbstractSite):
ID_TYPE = IdType.DoubanBook_Work
URL_PATTERNS = [r"\w+://book\.douban\.com/works/(\d+)"]
WIKI_PROPERTY_ID = '?'
DEFAULT_MODEL = Work
@classmethod
def id_to_url(self, id_value):
return "https://book.douban.com/works/" + id_value + "/"
def bypass_scrape(self, data_from_link):
if not data_from_link:
return None
pd = ResourceContent(metadata={
'title': data_from_link['title'],
})
return pd
def scrape(self):
content = DoubanDownloader(self.url).download().html()
title_elem = content.xpath("//h1/text()")
title = title_elem[0].split('全部版本(')[0].strip() if title_elem else None
if not title:
raise ParseError(self, 'title')
pd = ResourceContent(metadata={
'title': title,
})
return pd
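A rough usage sketch, assuming a configured Django environment; the subject id below is a made-up placeholder, not a real book:
from catalog.common import *
import catalog.sites  # registers DoubanBook and DoubanBook_Work with SiteList

site = SiteList.get_site_by_url('https://book.douban.com/subject/1000000/')  # placeholder id
content = site.scrape()
print(content.metadata['title'], content.lookup_ids.get(IdType.ISBN))
print(content.metadata.get('required_resources'))  # the linked Work, if the page has one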

58
catalog/sites/douban_drama.py Normal file
View file

@ -0,0 +1,58 @@
from catalog.common import *
from catalog.models import *
from .douban import DoubanDownloader
import logging
_logger = logging.getLogger(__name__)
@SiteList.register
class DoubanDrama(AbstractSite):
ID_TYPE = IdType.DoubanDrama
URL_PATTERNS = [r"\w+://www.douban.com/location/drama/(\d+)/"]
WIKI_PROPERTY_ID = 'P6443'
DEFAULT_MODEL = Performance
@classmethod
def id_to_url(self, id_value):
return "https://www.douban.com/location/drama/" + id_value + "/"
def scrape(self):
h = DoubanDownloader(self.url).download().html()
data = {}
title_elem = h.xpath("/html/body//h1/span/text()")
if title_elem:
data["title"] = title_elem[0].strip()
else:
raise ParseError(self, "title")
data['other_titles'] = [s.strip() for s in title_elem[1:]]
other_title_elem = h.xpath("//dl//dt[text()='又名:']/following::dd[@itemprop='name']/text()")
if len(other_title_elem) > 0:
data['other_titles'].append(other_title_elem[0].strip())
plot_elem = h.xpath("//div[@id='link-report']/text()")
if len(plot_elem) == 0:
plot_elem = h.xpath("//div[@class='abstract']/text()")
data['brief'] = '\n'.join(plot_elem) if len(plot_elem) > 0 else ''
data['genres'] = [s.strip() for s in h.xpath("//dl//dt[text()='类型:']/following-sibling::dd[@itemprop='genre']/text()")]
data['versions'] = [s.strip() for s in h.xpath("//dl//dt[text()='版本:']/following-sibling::dd[@class='titles']/a//text()")]
data['directors'] = [s.strip() for s in h.xpath("//div[@class='meta']/dl//dt[text()='导演:']/following-sibling::dd/a[@itemprop='director']//text()")]
data['playwrights'] = [s.strip() for s in h.xpath("//div[@class='meta']/dl//dt[text()='编剧:']/following-sibling::dd/a[@itemprop='author']//text()")]
data['actors'] = [s.strip() for s in h.xpath("//div[@class='meta']/dl//dt[text()='主演:']/following-sibling::dd/a[@itemprop='actor']//text()")]
img_url_elem = h.xpath("//img[@itemprop='image']/@src")
data['cover_image_url'] = img_url_elem[0].strip() if img_url_elem else None
pd = ResourceContent(metadata=data)
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
return pd

76
catalog/sites/douban_game.py Normal file
View file

@ -0,0 +1,76 @@
from catalog.common import *
from catalog.models import *
from .douban import DoubanDownloader
import dateparser
import logging
_logger = logging.getLogger(__name__)
@SiteList.register
class DoubanGame(AbstractSite):
ID_TYPE = IdType.DoubanGame
URL_PATTERNS = [r"\w+://www\.douban\.com/game/(\d+)/{0,1}", r"\w+://m.douban.com/game/subject/(\d+)/{0,1}"]
WIKI_PROPERTY_ID = ''
DEFAULT_MODEL = Game
@classmethod
def id_to_url(self, id_value):
return "https://www.douban.com/game/" + id_value + "/"
def scrape(self):
content = DoubanDownloader(self.url).download().html()
elem = content.xpath("//div[@id='content']/h1/text()")
title = elem[0].strip() if len(elem) else None
if not title:
raise ParseError(self, "title")
other_title_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='别名:']/following-sibling::dd[1]/text()")
other_title = other_title_elem[0].strip().split(' / ') if other_title_elem else None
developer_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='开发商:']/following-sibling::dd[1]/text()")
developer = developer_elem[0].strip().split(' / ') if developer_elem else None
publisher_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='发行商:']/following-sibling::dd[1]/text()")
publisher = publisher_elem[0].strip().split(' / ') if publisher_elem else None
platform_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='平台:']/following-sibling::dd[1]/a/text()")
platform = platform_elem if platform_elem else None
genre_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='类型:']/following-sibling::dd[1]/a/text()")
genre = None
if genre_elem:
genre = [g for g in genre_elem if g != '游戏']
date_elem = content.xpath(
"//dl[@class='game-attr']//dt[text()='发行日期:']/following-sibling::dd[1]/text()")
release_date = dateparser.parse(date_elem[0].strip()).strftime('%Y-%m-%d') if date_elem else None
brief_elem = content.xpath("//div[@class='mod item-desc']/p/text()")
brief = '\n'.join(brief_elem) if brief_elem else None
img_url_elem = content.xpath(
"//div[@class='item-subject-info']/div[@class='pic']//img/@src")
img_url = img_url_elem[0].strip() if img_url_elem else None
pd = ResourceContent(metadata={
'title': title,
'other_title': other_title,
'developer': developer,
'publisher': publisher,
'release_date': release_date,
'genre': genre,
'platform': platform,
'brief': brief,
'cover_image_url': img_url
})
if pd.metadata["cover_image_url"]:
pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(pd.metadata['cover_image_url'], self.url)
return pd

275
catalog/sites/douban_movie.py Normal file
View file

@ -0,0 +1,275 @@
from catalog.common import *
from .douban import *
from catalog.movie.models import *
from catalog.tv.models import *
import logging
from django.db import models
from django.utils.translation import gettext_lazy as _
from .tmdb import TMDB_TV, search_tmdb_by_imdb_id
_logger = logging.getLogger(__name__)
class MovieGenreEnum(models.TextChoices):
DRAMA = 'Drama', _('剧情')
KIDS = 'Kids', _('儿童')
COMEDY = 'Comedy', _('喜剧')
BIOGRAPHY = 'Biography', _('传记')
ACTION = 'Action', _('动作')
HISTORY = 'History', _('历史')
ROMANCE = 'Romance', _('爱情')
WAR = 'War', _('战争')
SCI_FI = 'Sci-Fi', _('科幻')
CRIME = 'Crime', _('犯罪')
ANIMATION = 'Animation', _('动画')
WESTERN = 'Western', _('西部')
MYSTERY = 'Mystery', _('悬疑')
FANTASY = 'Fantasy', _('奇幻')
THRILLER = 'Thriller', _('惊悚')
ADVENTURE = 'Adventure', _('冒险')
HORROR = 'Horror', _('恐怖')
DISASTER = 'Disaster', _('灾难')
DOCUMENTARY = 'Documentary', _('纪录片')
MARTIAL_ARTS = 'Martial-Arts', _('武侠')
SHORT = 'Short', _('短片')
ANCIENT_COSTUM = 'Ancient-Costum', _('古装')
EROTICA = 'Erotica', _('情色')
SPORT = 'Sport', _('运动')
GAY_LESBIAN = 'Gay/Lesbian', _('同性')
OPERA = 'Opera', _('戏曲')
MUSIC = 'Music', _('音乐')
FILM_NOIR = 'Film-Noir', _('黑色电影')
MUSICAL = 'Musical', _('歌舞')
REALITY_TV = 'Reality-TV', _('真人秀')
FAMILY = 'Family', _('家庭')
TALK_SHOW = 'Talk-Show', _('脱口秀')
NEWS = 'News', _('新闻')
SOAP = 'Soap', _('肥皂剧')
TV_MOVIE = 'TV Movie', _('电视电影')
THEATRE = 'Theatre', _('舞台艺术')
OTHER = 'Other', _('其他')
# MovieGenreTranslator = ChoicesDictGenerator(MovieGenreEnum)
@SiteList.register
class DoubanMovie(AbstractSite):
ID_TYPE = IdType.DoubanMovie
URL_PATTERNS = [r"\w+://movie\.douban\.com/subject/(\d+)/{0,1}", r"\w+://m.douban.com/movie/subject/(\d+)/{0,1}"]
WIKI_PROPERTY_ID = '?'
# no DEFAULT_MODEL as it may be either a TV Season or a Movie
@classmethod
def id_to_url(self, id_value):
return "https://movie.douban.com/subject/" + id_value + "/"
def scrape(self):
content = DoubanDownloader(self.url).download().html()
try:
raw_title = content.xpath(
"//span[@property='v:itemreviewed']/text()")[0].strip()
except IndexError:
raise ParseError(self, 'title')
orig_title = content.xpath(
"//img[@rel='v:image']/@alt")[0].strip()
title = raw_title.split(orig_title)[0].strip()
# if there is no Chinese title
if title == '':
title = orig_title
if title == orig_title:
orig_title = None
# there are two html formats for authors and translators
other_title_elem = content.xpath(
"//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]")
other_title = other_title_elem[0].strip().split(
' / ') if other_title_elem else None
imdb_elem = content.xpath(
"//div[@id='info']//span[text()='IMDb链接:']/following-sibling::a[1]/text()")
if not imdb_elem:
imdb_elem = content.xpath(
"//div[@id='info']//span[text()='IMDb:']/following-sibling::text()[1]")
imdb_code = imdb_elem[0].strip() if imdb_elem else None
director_elem = content.xpath(
"//div[@id='info']//span[text()='导演']/following-sibling::span[1]/a/text()")
director = director_elem if director_elem else None
playwright_elem = content.xpath(
"//div[@id='info']//span[text()='编剧']/following-sibling::span[1]/a/text()")
playwright = list(map(lambda a: a[:200], playwright_elem)) if playwright_elem else None
actor_elem = content.xpath(
"//div[@id='info']//span[text()='主演']/following-sibling::span[1]/a/text()")
actor = list(map(lambda a: a[:200], actor_elem)) if actor_elem else None
# construct genre translator
genre_translator = {}
attrs = [attr for attr in dir(MovieGenreEnum) if '__' not in attr]
for attr in attrs:
genre_translator[getattr(MovieGenreEnum, attr).label] = getattr(
MovieGenreEnum, attr).value
genre_elem = content.xpath("//span[@property='v:genre']/text()")
if genre_elem:
genre = []
for g in genre_elem:
g = g.split(' ')[0]
if g == '紀錄片': # likely some original data on douban was corrupted
g = '纪录片'
elif g == '鬼怪':
g = '惊悚'
if g in genre_translator:
genre.append(genre_translator[g])
elif g in genre_translator.values():
genre.append(g)
else:
_logger.error(f'unable to map genre {g}')
else:
genre = None
showtime_elem = content.xpath(
"//span[@property='v:initialReleaseDate']/text()")
if showtime_elem:
showtime = []
for st in showtime_elem:
parts = st.split('(')
if len(parts) == 1:
time = st.split('(')[0]
region = ''
else:
time = st.split('(')[0]
region = st.split('(')[1][0:-1]
showtime.append({time: region})
else:
showtime = None
site_elem = content.xpath(
"//div[@id='info']//span[text()='官方网站:']/following-sibling::a[1]/@href")
site = site_elem[0].strip()[:200] if site_elem else None
if site and not re.match(r'http.+', site):
site = None
area_elem = content.xpath(
"//div[@id='info']//span[text()='制片国家/地区:']/following-sibling::text()[1]")
if area_elem:
area = [a.strip()[:100] for a in area_elem[0].split('/')]
else:
area = None
language_elem = content.xpath(
"//div[@id='info']//span[text()='语言:']/following-sibling::text()[1]")
if language_elem:
language = [a.strip() for a in language_elem[0].split(' / ')]
else:
language = None
year_elem = content.xpath("//span[@class='year']/text()")
year = int(re.search(r'\d+', year_elem[0])[0]) if year_elem and re.search(r'\d+', year_elem[0]) else None
duration_elem = content.xpath("//span[@property='v:runtime']/text()")
other_duration_elem = content.xpath(
"//span[@property='v:runtime']/following-sibling::text()[1]")
if duration_elem:
duration = duration_elem[0].strip()
if other_duration_elem:
duration += other_duration_elem[0].rstrip()
duration = duration.split('/')[0].strip()
else:
duration = None
season_elem = content.xpath(
"//*[@id='season']/option[@selected='selected']/text()")
if not season_elem:
season_elem = content.xpath(
"//div[@id='info']//span[text()='季数:']/following-sibling::text()[1]")
season = int(season_elem[0].strip()) if season_elem else None
else:
season = int(season_elem[0].strip())
episodes_elem = content.xpath(
"//div[@id='info']//span[text()='集数:']/following-sibling::text()[1]")
episodes = int(episodes_elem[0].strip()) if episodes_elem and episodes_elem[0].strip().isdigit() else None
single_episode_length_elem = content.xpath(
"//div[@id='info']//span[text()='单集片长:']/following-sibling::text()[1]")
single_episode_length = single_episode_length_elem[0].strip(
)[:100] if single_episode_length_elem else None
# if the `episodes` field is not None, this must be a series
is_series = True if episodes else False
brief_elem = content.xpath("//span[@class='all hidden']")
if not brief_elem:
brief_elem = content.xpath("//span[@property='v:summary']")
brief = '\n'.join([e.strip() for e in brief_elem[0].xpath(
'./text()')]) if brief_elem else None
img_url_elem = content.xpath("//img[@rel='v:image']/@src")
img_url = img_url_elem[0].strip() if img_url_elem else None
pd = ResourceContent(metadata={
'title': title,
'orig_title': orig_title,
'other_title': other_title,
'imdb_code': imdb_code,
'director': director,
'playwright': playwright,
'actor': actor,
'genre': genre,
'showtime': showtime,
'site': site,
'area': area,
'language': language,
'year': year,
'duration': duration,
'season_number': season,
'episodes': episodes,
'single_episode_length': single_episode_length,
'brief': brief,
'is_series': is_series,
'cover_image_url': img_url,
})
pd.metadata['preferred_model'] = ('TVSeason' if season else 'TVShow') if is_series else 'Movie'
if imdb_code:
res_data = search_tmdb_by_imdb_id(imdb_code)
tmdb_show_id = None
if 'movie_results' in res_data and len(res_data['movie_results']) > 0:
pd.metadata['preferred_model'] = 'Movie'
elif 'tv_results' in res_data and len(res_data['tv_results']) > 0:
pd.metadata['preferred_model'] = 'TVShow'
elif 'tv_season_results' in res_data and len(res_data['tv_season_results']) > 0:
pd.metadata['preferred_model'] = 'TVSeason'
tmdb_show_id = res_data['tv_season_results'][0]['show_id']
elif 'tv_episode_results' in res_data and len(res_data['tv_episode_results']) > 0:
pd.metadata['preferred_model'] = 'TVSeason'
tmdb_show_id = res_data['tv_episode_results'][0]['show_id']
if res_data['tv_episode_results'][0]['episode_number'] != 1:
_logger.error(f'Douban Movie {self.url} mapping to unexpected imdb episode {imdb_code}')
# TODO correct the IMDB id
pd.lookup_ids[IdType.IMDB] = imdb_code
if tmdb_show_id:
pd.metadata['required_resources'] = [{
'model': 'TVShow',
'id_type': IdType.TMDB_TV,
'id_value': tmdb_show_id,
'title': title,
'url': TMDB_TV.id_to_url(tmdb_show_id),
}]
# TODO parse sister seasons
# pd.metadata['related_resources'] = []
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
return pd
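A sketch of how the preferred_model hint is produced, using the same URL as DoubanMovieTVTestCase further below; it assumes mock responses (use_local_response) or network access:
from catalog.common import *
import catalog.sites

site = SiteList.get_site_by_url('https://movie.douban.com/subject/3627919/')
content = site.scrape()
print(content.metadata['preferred_model'])         # e.g. 'TVSeason' for a season of a multi-season show
print(content.metadata.get('required_resources'))  # parent TMDB TVShow link, when resolved via IMDB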

115
catalog/sites/douban_music.py Normal file
View file

@ -0,0 +1,115 @@
from catalog.common import *
from catalog.models import *
from .douban import DoubanDownloader
import dateparser
import logging
_logger = logging.getLogger(__name__)
@SiteList.register
class DoubanMusic(AbstractSite):
ID_TYPE = IdType.DoubanMusic
URL_PATTERNS = [r"\w+://music\.douban\.com/subject/(\d+)/{0,1}", r"\w+://m.douban.com/music/subject/(\d+)/{0,1}"]
WIKI_PROPERTY_ID = ''
DEFAULT_MODEL = Album
@classmethod
def id_to_url(self, id_value):
return "https://music.douban.com/subject/" + id_value + "/"
def scrape(self):
content = DoubanDownloader(self.url).download().html()
elem = content.xpath("//h1/span/text()")
title = elem[0].strip() if len(elem) else None
if not title:
raise ParseError(self, "title")
artists_elem = content.xpath("//div[@id='info']/span/span[@class='pl']/a/text()")
artist = None if not artists_elem else list(map(lambda a: a[:200], artists_elem))
genre_elem = content.xpath(
"//div[@id='info']//span[text()='流派:']/following::text()[1]")
genre = genre_elem[0].strip() if genre_elem else None
date_elem = content.xpath(
"//div[@id='info']//span[text()='发行时间:']/following::text()[1]")
release_date = dateparser.parse(date_elem[0].strip()).strftime('%Y-%m-%d') if date_elem else None
company_elem = content.xpath(
"//div[@id='info']//span[text()='出版者:']/following::text()[1]")
company = company_elem[0].strip() if company_elem else None
track_list_elem = content.xpath(
"//div[@class='track-list']/div[@class='indent']/div/text()"
)
if track_list_elem:
track_list = '\n'.join([track.strip() for track in track_list_elem])
else:
track_list = None
brief_elem = content.xpath("//span[@class='all hidden']")
if not brief_elem:
brief_elem = content.xpath("//span[@property='v:summary']")
brief = '\n'.join([e.strip() for e in brief_elem[0].xpath(
'./text()')]) if brief_elem else None
gtin = None
isrc = None
other_info = {}
other_elem = content.xpath(
"//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]")
if other_elem:
other_info['又名'] = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='专辑类型:']/following-sibling::text()[1]")
if other_elem:
other_info['专辑类型'] = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='介质:']/following-sibling::text()[1]")
if other_elem:
other_info['介质'] = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='ISRC:']/following-sibling::text()[1]")
if other_elem:
other_info['ISRC'] = other_elem[0].strip()
isrc = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='条形码:']/following-sibling::text()[1]")
if other_elem:
other_info['条形码'] = other_elem[0].strip()
gtin = other_elem[0].strip()
other_elem = content.xpath(
"//div[@id='info']//span[text()='碟片数:']/following-sibling::text()[1]")
if other_elem:
other_info['碟片数'] = other_elem[0].strip()
img_url_elem = content.xpath("//div[@id='mainpic']//img/@src")
img_url = img_url_elem[0].strip() if img_url_elem else None
pd = ResourceContent(metadata={
'title': title,
'artist': artist,
'genre': genre,
'release_date': release_date,
'duration': None,
'company': company,
'track_list': track_list,
'brief': brief,
'other_info': other_info,
'cover_image_url': img_url
})
if gtin:
pd.lookup_ids[IdType.GTIN] = gtin
if isrc:
pd.lookup_ids[IdType.ISRC] = isrc
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
return pd

116
catalog/sites/goodreads.py Normal file
View file

@ -0,0 +1,116 @@
from catalog.book.models import Edition, Work
from catalog.common import *
from lxml import html
import json
import logging
_logger = logging.getLogger(__name__)
class GoodreadsDownloader(RetryDownloader):
def validate_response(self, response):
if response is None:
return RESPONSE_NETWORK_ERROR
elif response.status_code == 200:
if response.text.find('__NEXT_DATA__') != -1:
return RESPONSE_OK
else:
# Goodreads may return legacy version for a/b testing
# retry if so
return RESPONSE_NETWORK_ERROR
else:
return RESPONSE_INVALID_CONTENT
@SiteList.register
class Goodreads(AbstractSite):
ID_TYPE = IdType.Goodreads
WIKI_PROPERTY_ID = 'P2968'
DEFAULT_MODEL = Edition
URL_PATTERNS = [r".+goodreads.com/.*book/show/(\d+)", r".+goodreads.com/.*book/(\d+)"]
@classmethod
def id_to_url(self, id_value):
return "https://www.goodreads.com/book/show/" + id_value
def scrape(self, response=None):
data = {}
if response is not None:
h = html.fromstring(response.text.strip())
else:
dl = GoodreadsDownloader(self.url)
h = dl.download().html()
# Next.JS version of GoodReads
# JSON.parse(document.getElementById('__NEXT_DATA__').innerHTML)['props']['pageProps']['apolloState']
elem = h.xpath('//script[@id="__NEXT_DATA__"]/text()')
src = elem[0].strip() if elem else None
if not src:
raise ParseError(self, '__NEXT_DATA__ element')
d = json.loads(src)['props']['pageProps']['apolloState']
o = {'Book': [], 'Work': [], 'Series': [], 'Contributor': []}
for v in d.values():
t = v.get('__typename')
if t and t in o:
o[t].append(v)
b = next(filter(lambda x: x.get('title'), o['Book']), None)
if not b:
# Goodreads may return an empty page template when its internal service times out
raise ParseError(self, 'Book in __NEXT_DATA__ json')
data['title'] = b['title']
data['brief'] = b['description']
data['isbn'] = b['details'].get('isbn13')
asin = b['details'].get('asin')
if asin and asin != data['isbn']:
data['asin'] = asin
data['pages'] = b['details'].get('numPages')
data['cover_image_url'] = b['imageUrl']
w = next(filter(lambda x: x.get('details'), o['Work']), None)
if w:
data['required_resources'] = [{
'model': 'Work',
'id_type': IdType.Goodreads_Work,
'id_value': str(w['legacyId']),
'title': w['details']['originalTitle'],
'url': w['editions']['webUrl'],
}]
pd = ResourceContent(metadata=data)
pd.lookup_ids[IdType.ISBN] = data.get('isbn')
pd.lookup_ids[IdType.ASIN] = data.get('asin')
if data["cover_image_url"]:
imgdl = BasicImageDownloader(data["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {data["cover_image_url"]}')
return pd
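A minimal sketch of the __NEXT_DATA__ approach used above, assuming page_text already holds the fetched Goodreads HTML:
import json
from lxml import html

doc = html.fromstring(page_text)
state = json.loads(doc.xpath('//script[@id="__NEXT_DATA__"]/text()')[0])
apollo = state['props']['pageProps']['apolloState']
books = [v for v in apollo.values() if v.get('__typename') == 'Book']  # same grouping as scrape()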
@SiteList.register
class Goodreads_Work(AbstractSite):
ID_TYPE = IdType.Goodreads_Work
WIKI_PROPERTY_ID = ''
DEFAULT_MODEL = Work
URL_PATTERNS = [r".+goodreads.com/work/editions/(\d+)"]
@classmethod
def id_to_url(self, id_value):
return "https://www.goodreads.com/work/editions/" + id_value
def scrape(self, response=None):
content = BasicDownloader(self.url).download().html()
title_elem = content.xpath("//h1/a/text()")
title = title_elem[0].strip() if title_elem else None
if not title:
raise ParseError(self, 'title')
author_elem = content.xpath("//h2/a/text()")
author = author_elem[0].strip() if author_elem else None
first_published_elem = content.xpath("//h2/span/text()")
first_published = first_published_elem[0].strip() if first_published_elem else None
pd = ResourceContent(metadata={
'title': title,
'author': author,
'first_published': first_published
})
return pd

79
catalog/sites/google_books.py Normal file
View file

@ -0,0 +1,79 @@
from catalog.common import *
from catalog.models import *
import re
import logging
_logger = logging.getLogger(__name__)
@SiteList.register
class GoogleBooks(AbstractSite):
ID_TYPE = IdType.GoogleBooks
URL_PATTERNS = [
r"https://books\.google\.co[^/]+/books\?id=([^&#]+)",
r"https://www\.google\.co[^/]+/books/edition/[^/]+/([^&#?]+)",
r"https://books\.google\.co[^/]+/books/about/[^?]+?id=([^&#?]+)",
]
WIKI_PROPERTY_ID = ''
DEFAULT_MODEL = Edition
@classmethod
def id_to_url(self, id_value):
return "https://books.google.com/books?id=" + id_value
def scrape(self):
api_url = f'https://www.googleapis.com/books/v1/volumes/{self.id_value}'
b = BasicDownloader(api_url).download().json()
other = {}
title = b['volumeInfo']['title']
subtitle = b['volumeInfo']['subtitle'] if 'subtitle' in b['volumeInfo'] else None
pub_year = None
pub_month = None
if 'publishedDate' in b['volumeInfo']:
pub_date = b['volumeInfo']['publishedDate'].split('-')
pub_year = pub_date[0]
pub_month = pub_date[1] if len(pub_date) > 1 else None
pub_house = b['volumeInfo']['publisher'] if 'publisher' in b['volumeInfo'] else None
language = b['volumeInfo']['language'] if 'language' in b['volumeInfo'] else None
pages = b['volumeInfo']['pageCount'] if 'pageCount' in b['volumeInfo'] else None
if 'mainCategory' in b['volumeInfo']:
other['分类'] = b['volumeInfo']['mainCategory']
authors = b['volumeInfo']['authors'] if 'authors' in b['volumeInfo'] else None
if 'description' in b['volumeInfo']:
brief = b['volumeInfo']['description']
elif 'textSnippet' in b['volumeInfo']:
brief = b["volumeInfo"]["textSnippet"]["searchInfo"]
else:
brief = ''
brief = re.sub(r'<.*?>', '', brief.replace('<br', '\n<br'))
img_url = b['volumeInfo']['imageLinks']['thumbnail'] if 'imageLinks' in b['volumeInfo'] else None
isbn10 = None
isbn13 = None
for iid in b['volumeInfo']['industryIdentifiers'] if 'industryIdentifiers' in b['volumeInfo'] else []:
if iid['type'] == 'ISBN_10':
isbn10 = iid['identifier']
if iid['type'] == 'ISBN_13':
isbn13 = iid['identifier']
isbn = isbn13 # if isbn13 is not None else isbn10
raw_img, ext = BasicImageDownloader.download_image(img_url, self.url)
data = {
'title': title,
'subtitle': subtitle,
'orig_title': None,
'author': authors,
'translator': None,
'language': language,
'pub_house': pub_house,
'pub_year': pub_year,
'pub_month': pub_month,
'binding': None,
'pages': pages,
'isbn': isbn,
'brief': brief,
'contents': None,
'other_info': other,
'cover_image_url': img_url,
}
return ResourceContent(metadata=data, cover_image=raw_img, cover_image_extention=ext, lookup_ids={IdType.ISBN: isbn13})
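A rough usage sketch; the volume id below is a placeholder for illustration only:
from catalog.common import *
import catalog.sites

site = SiteList.get_site_by_url('https://books.google.com/books?id=PLACEHOLDER_ID')  # placeholder id
content = site.scrape()
print(content.metadata['title'], content.metadata['isbn'])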

113
catalog/sites/igdb.py Normal file
View file

@ -0,0 +1,113 @@
"""
IGDB
use the URL slug (e.g. "portal-2") as id, which is different from the numeric id in the IGDB API
"""
from catalog.common import *
from catalog.models import *
from django.conf import settings
from igdb.wrapper import IGDBWrapper
import requests
import datetime
import json
import logging
_logger = logging.getLogger(__name__)
def _igdb_access_token():
try:
token = requests.post(f'https://id.twitch.tv/oauth2/token?client_id={settings.IGDB_CLIENT_ID}&client_secret={settings.IGDB_CLIENT_SECRET}&grant_type=client_credentials').json()['access_token']
except Exception:
_logger.error('unable to obtain IGDB token')
token = '<invalid>'
return token
_wrapper = IGDBWrapper(settings.IGDB_CLIENT_ID, _igdb_access_token())
def search_igdb_by_3p_url(steam_url):
r = IGDB.api_query('websites', f'fields *, game.*; where url = "{steam_url}";')
if not r:
return None
r = sorted(r, key=lambda w: w['game']['id'])
return IGDB(url=r[0]['game']['url'])
@SiteList.register
class IGDB(AbstractSite):
ID_TYPE = IdType.IGDB
URL_PATTERNS = [r"\w+://www\.igdb\.com/games/([a-zA-Z0-9\-_]+)"]
WIKI_PROPERTY_ID = '?'
DEFAULT_MODEL = Game
@classmethod
def id_to_url(self, id_value):
return "https://www.igdb.com/games/" + id_value
@classmethod
def api_query(cls, p, q):
key = 'igdb:' + p + '/' + q
if get_mock_mode():
r = BasicDownloader(key).download().json()
else:
r = json.loads(_wrapper.api_request(p, q))
if settings.DOWNLOADER_SAVEDIR:
with open(settings.DOWNLOADER_SAVEDIR + '/' + get_mock_file(key), 'w', encoding='utf-8') as fp:
fp.write(json.dumps(r))
return r
def scrape(self):
fields = '*, cover.url, genres.name, platforms.name, involved_companies.*, involved_companies.company.name'
r = self.api_query('games', f'fields {fields}; where url = "{self.url}";')[0]
brief = r['summary'] if 'summary' in r else ''
brief += "\n\n" + r['storyline'] if 'storyline' in r else ''
developer = None
publisher = None
release_date = None
genre = None
platform = None
if 'involved_companies' in r:
developer = next(iter([c['company']['name'] for c in r['involved_companies'] if c['developer']]), None)
publisher = next(iter([c['company']['name'] for c in r['involved_companies'] if c['publisher']]), None)
if 'platforms' in r:
ps = sorted(r['platforms'], key=lambda p: p['id'])
platform = [(p['name'] if p['id'] != 6 else 'Windows') for p in ps]
if 'first_release_date' in r:
release_date = datetime.datetime.fromtimestamp(r['first_release_date'], datetime.timezone.utc).strftime('%Y-%m-%d')
if 'genres' in r:
genre = [g['name'] for g in r['genres']]
websites = self.api_query('websites', f'fields *; where game.url = "{self.url}";')
steam_url = None
official_site = None
for website in websites:
if website['category'] == 1:
official_site = website['url']
elif website['category'] == 13:
steam_url = website['url']
pd = ResourceContent(metadata={
'title': r['name'],
'other_title': None,
'developer': developer,
'publisher': publisher,
'release_date': release_date,
'genre': genre,
'platform': platform,
'brief': brief,
'official_site': official_site,
'igdb_id': r['id'],
'cover_image_url': 'https:' + r['cover']['url'].replace('t_thumb', 't_cover_big'),
})
if steam_url:
pd.lookup_ids[IdType.Steam] = SiteList.get_site_by_id_type(IdType.Steam).url_to_id(steam_url)
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
return pd
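A sketch of the two entry points; the Steam URL below is illustrative, and both calls hit the IGDB API (so valid IGDB credentials must be set in settings):
from catalog.common import *
import catalog.sites
from catalog.sites.igdb import search_igdb_by_3p_url

site = SiteList.get_site_by_url('https://www.igdb.com/games/portal-2')
print(site.id_value)  # 'portal-2', the slug, not the numeric IGDB id
other = search_igdb_by_3p_url('https://store.steampowered.com/app/620/')  # illustrative Steam URL
print(other.url if other else None)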

48
catalog/sites/imdb.py Normal file
View file

@ -0,0 +1,48 @@
from catalog.common import *
from .tmdb import search_tmdb_by_imdb_id
from catalog.movie.models import *
from catalog.tv.models import *
import logging
_logger = logging.getLogger(__name__)
@SiteList.register
class IMDB(AbstractSite):
ID_TYPE = IdType.IMDB
URL_PATTERNS = [r'\w+://www.imdb.com/title/(tt\d+)']
WIKI_PROPERTY_ID = '?'
@classmethod
def id_to_url(self, id_value):
return "https://www.imdb.com/title/" + id_value + "/"
def scrape(self):
self.scraped = False
res_data = search_tmdb_by_imdb_id(self.id_value)
if 'movie_results' in res_data and len(res_data['movie_results']) > 0:
url = f"https://www.themoviedb.org/movie/{res_data['movie_results'][0]['id']}"
elif 'tv_results' in res_data and len(res_data['tv_results']) > 0:
url = f"https://www.themoviedb.org/tv/{res_data['tv_results'][0]['id']}"
elif 'tv_season_results' in res_data and len(res_data['tv_season_results']) > 0:
# this should not happen given IMDB only has ids for either show or episode
tv_id = res_data['tv_season_results'][0]['show_id']
season_number = res_data['tv_season_results'][0]['season_number']
url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}/episode/{episode_number}"
elif 'tv_episode_results' in res_data and len(res_data['tv_episode_results']) > 0:
tv_id = res_data['tv_episode_results'][0]['show_id']
season_number = res_data['tv_episode_results'][0]['season_number']
episode_number = res_data['tv_episode_results'][0]['episode_number']
if season_number == 0:
url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}/episode/{episode_number}"
elif episode_number == 1:
url = f"https://www.themoviedb.org/tv/{tv_id}/season/{season_number}"
else:
raise ParseError(self, "IMDB id matching TMDB but not first episode, this is not supported")
else:
raise ParseError(self, "IMDB id not found in TMDB")
tmdb = SiteList.get_site_by_url(url)
pd = tmdb.scrape()
pd.metadata['preferred_model'] = tmdb.DEFAULT_MODEL.__name__
return pd

145
catalog/sites/spotify.py Normal file
View file

@ -0,0 +1,145 @@
"""
Spotify
"""
from django.conf import settings
from catalog.common import *
from catalog.models import *
from .douban import *
import time
import datetime
import requests
import dateparser
import logging
_logger = logging.getLogger(__name__)
spotify_token = None
spotify_token_expire_time = time.time()
@SiteList.register
class Spotify(AbstractSite):
ID_TYPE = IdType.Spotify_Album
URL_PATTERNS = [r'\w+://open\.spotify\.com/album/([a-zA-Z0-9]+)']
WIKI_PROPERTY_ID = '?'
DEFAULT_MODEL = Album
@classmethod
def id_to_url(self, id_value):
return f"https://open.spotify.com/album/{id_value}"
def scrape(self):
api_url = "https://api.spotify.com/v1/albums/" + self.id_value
headers = {
'Authorization': f"Bearer {get_spotify_token()}"
}
res_data = BasicDownloader(api_url, headers=headers).download().json()
artist = []
for artist_dict in res_data['artists']:
artist.append(artist_dict['name'])
title = res_data['name']
genre = ', '.join(res_data['genres'])
company = []
for com in res_data['copyrights']:
company.append(com['text'])
duration = 0
track_list = []
track_urls = []
for track in res_data['tracks']['items']:
track_urls.append(track['external_urls']['spotify'])
duration += track['duration_ms']
if res_data['tracks']['items'][-1]['disc_number'] > 1:
# more than one disc
track_list.append(str(
track['disc_number']) + '-' + str(track['track_number']) + '. ' + track['name'])
else:
track_list.append(str(track['track_number']) + '. ' + track['name'])
track_list = '\n'.join(track_list)
release_date = dateparser.parse(res_data['release_date']).strftime('%Y-%m-%d')
gtin = None
if res_data['external_ids'].get('upc'):
gtin = res_data['external_ids'].get('upc')
if res_data['external_ids'].get('ean'):
gtin = res_data['external_ids'].get('ean')
isrc = None
if res_data['external_ids'].get('isrc'):
isrc = res_data['external_ids'].get('isrc')
pd = ResourceContent(metadata={
'title': title,
'artist': artist,
'genre': genre,
'track_list': track_list,
'release_date': release_date,
'duration': duration,
'company': company,
'brief': None,
'cover_image_url': res_data['images'][0]['url']
})
if gtin:
pd.lookup_ids[IdType.GTIN] = gtin
if isrc:
pd.lookup_ids[IdType.ISRC] = isrc
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
return pd
def get_spotify_token():
global spotify_token, spotify_token_expire_time
if get_mock_mode():
return 'mocked'
if spotify_token is None or is_spotify_token_expired():
invoke_spotify_token()
return spotify_token
def is_spotify_token_expired():
global spotify_token_expire_time
return True if spotify_token_expire_time <= time.time() else False
def invoke_spotify_token():
global spotify_token, spotify_token_expire_time
r = requests.post(
"https://accounts.spotify.com/api/token",
data={
"grant_type": "client_credentials"
},
headers={
"Authorization": f"Basic {settings.SPOTIFY_CREDENTIAL}"
}
)
data = r.json()
if r.status_code == 401:
# token expired, try one more time
# this may be caused by external operations,
# for example debugging with an HTTP client
r = requests.post(
"https://accounts.spotify.com/api/token",
data={
"grant_type": "client_credentials"
},
headers={
"Authorization": f"Basic {settings.SPOTIFY_CREDENTIAL}"
}
)
data = r.json()
elif r.status_code != 200:
raise Exception(f"Request to spotify API fails. Reason: {r.reason}")
# minus 2 for execution time error
spotify_token_expire_time = int(data['expires_in']) + time.time() - 2
spotify_token = data['access_token']
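A small sketch of the token caching above: the module-level token is requested once and reused until the expiry timestamp passes, so repeated scrapes share one credential exchange (SPOTIFY_CREDENTIAL must be configured):
from catalog.sites.spotify import get_spotify_token

token = get_spotify_token()        # first call posts to accounts.spotify.com for a token
token_again = get_spotify_token()  # later calls reuse the cached token until it expires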

64
catalog/sites/steam.py Normal file
View file

@ -0,0 +1,64 @@
from catalog.common import *
from catalog.models import *
from .igdb import search_igdb_by_3p_url
import dateparser
import logging
_logger = logging.getLogger(__name__)
@SiteList.register
class Steam(AbstractSite):
ID_TYPE = IdType.Steam
URL_PATTERNS = [r"\w+://store\.steampowered\.com/app/(\d+)"]
WIKI_PROPERTY_ID = '?'
DEFAULT_MODEL = Game
@classmethod
def id_to_url(self, id_value):
return "https://store.steampowered.com/app/" + str(id_value)
def scrape(self):
i = search_igdb_by_3p_url(self.url)
pd = i.scrape() if i else ResourceContent()
headers = BasicDownloader.headers.copy()
headers['Host'] = 'store.steampowered.com'
headers['Cookie'] = "wants_mature_content=1; birthtime=754700401;"
content = BasicDownloader(self.url, headers=headers).download().html()
title = content.xpath("//div[@class='apphub_AppName']/text()")[0]
developer = content.xpath("//div[@id='developers_list']/a/text()")
publisher = content.xpath("//div[@class='glance_ctn']//div[@class='dev_row'][2]//a/text()")
release_date = dateparser.parse(
content.xpath(
"//div[@class='release_date']/div[@class='date']/text()")[0]
).strftime('%Y-%m-%d')
genre = content.xpath(
"//div[@class='details_block']/b[2]/following-sibling::a/text()")
platform = ['PC']
brief = content.xpath(
"//div[@class='game_description_snippet']/text()")[0].strip()
# try Steam images if no image from IGDB
if pd.cover_image is None:
pd.metadata['cover_image_url'] = content.xpath("//img[@class='game_header_image_full']/@src")[0].replace("header.jpg", "library_600x900.jpg")
pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(pd.metadata['cover_image_url'], self.url)
if pd.cover_image is None:
pd.metadata['cover_image_url'] = content.xpath("//img[@class='game_header_image_full']/@src")[0]
pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(pd.metadata['cover_image_url'], self.url)
# merge data from IGDB, use localized Steam data if available
d = {
'developer': developer,
'publisher': publisher,
'release_date': release_date,
'genre': genre,
'platform': platform,
}
d.update(pd.metadata)
pd.metadata = d
if title:
pd.metadata['title'] = title
if brief:
pd.metadata['brief'] = brief
return pd
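The merge above lets IGDB metadata win over the Steam-scraped fallbacks, while the localized Steam title and brief are applied last; a self-contained illustration of that dict.update ordering:
steam_fallback = {'developer': ['Valve'], 'genre': ['Puzzle'], 'platform': ['PC']}
igdb_metadata = {'title': 'Portal 2', 'genre': ['Puzzle', 'Platform']}
merged = dict(steam_fallback)
merged.update(igdb_metadata)   # overlapping keys (here 'genre') take the IGDB value
merged['title'] = 'Portal 2'   # localized title/brief from Steam are applied on top when present
print(merged)                  # {'developer': ['Valve'], 'genre': ['Puzzle', 'Platform'], 'platform': ['PC'], 'title': 'Portal 2'}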

328
catalog/sites/tmdb.py Normal file
View file

@ -0,0 +1,328 @@
"""
The Movie Database
"""
import re
from django.conf import settings
from catalog.common import *
from .douban import *
from catalog.movie.models import *
from catalog.tv.models import *
import logging
_logger = logging.getLogger(__name__)
def search_tmdb_by_imdb_id(imdb_id):
tmdb_api_url = f"https://api.themoviedb.org/3/find/{imdb_id}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&external_source=imdb_id"
res_data = BasicDownloader(tmdb_api_url).download().json()
return res_data
def _copy_dict(s, key_map):
d = {}
for src, dst in key_map.items():
d[dst if dst else src] = s.get(src)
return d
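A quick illustration of the key_map convention: a truthy value renames the key, a falsy value keeps the original name (the values here echo the Doctor Who fixtures used in the tests below):
src = {'name': 'Doctor Who', 'overview': 'Season 4 overview', 'season_number': 4}
print(_copy_dict(src, {'name': 'title', 'overview': 'brief', 'season_number': 0}))
# -> {'title': 'Doctor Who', 'brief': 'Season 4 overview', 'season_number': 4}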
genre_map = {
'Sci-Fi & Fantasy': 'Sci-Fi',
'War & Politics': 'War',
'儿童': 'Kids',
'冒险': 'Adventure',
'剧情': 'Drama',
'动作': 'Action',
'动作冒险': 'Action',
'动画': 'Animation',
'历史': 'History',
'喜剧': 'Comedy',
'奇幻': 'Fantasy',
'家庭': 'Family',
'恐怖': 'Horror',
'悬疑': 'Mystery',
'惊悚': 'Thriller',
'战争': 'War',
'新闻': 'News',
'爱情': 'Romance',
'犯罪': 'Crime',
'电视电影': 'TV Movie',
'真人秀': 'Reality-TV',
'科幻': 'Sci-Fi',
'纪录': 'Documentary',
'肥皂剧': 'Soap',
'脱口秀': 'Talk-Show',
'西部': 'Western',
'音乐': 'Music',
}
@SiteList.register
class TMDB_Movie(AbstractSite):
ID_TYPE = IdType.TMDB_Movie
URL_PATTERNS = [r'\w+://www.themoviedb.org/movie/(\d+)']
WIKI_PROPERTY_ID = '?'
DEFAULT_MODEL = Movie
@classmethod
def id_to_url(self, id_value):
return f"https://www.themoviedb.org/movie/{id_value}"
def scrape(self):
is_series = False
if is_series:
api_url = f"https://api.themoviedb.org/3/tv/{self.id_value}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
else:
api_url = f"https://api.themoviedb.org/3/movie/{self.id_value}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
res_data = BasicDownloader(api_url).download().json()
if is_series:
title = res_data['name']
orig_title = res_data['original_name']
year = int(res_data['first_air_date'].split(
'-')[0]) if res_data['first_air_date'] else None
imdb_code = res_data['external_ids']['imdb_id']
showtime = [{res_data['first_air_date']: "首播日期"}
] if res_data['first_air_date'] else None
duration = None
else:
title = res_data['title']
orig_title = res_data['original_title']
year = int(res_data['release_date'].split('-')
[0]) if res_data['release_date'] else None
showtime = [{res_data['release_date']: "发布日期"}
] if res_data['release_date'] else None
imdb_code = res_data['imdb_id']
# in minutes
duration = res_data['runtime'] if res_data['runtime'] else None
genre = list(map(lambda x: genre_map[x['name']] if x['name']
in genre_map else 'Other', res_data['genres']))
language = list(map(lambda x: x['name'], res_data['spoken_languages']))
brief = res_data['overview']
if is_series:
director = list(map(lambda x: x['name'], res_data['created_by']))
else:
director = list(map(lambda x: x['name'], filter(
lambda c: c['job'] == 'Director', res_data['credits']['crew'])))
playwright = list(map(lambda x: x['name'], filter(
lambda c: c['job'] == 'Screenplay', res_data['credits']['crew'])))
actor = list(map(lambda x: x['name'], res_data['credits']['cast']))
area = []
other_info = {}
# other_info['TMDB评分'] = res_data['vote_average']
# other_info['分级'] = res_data['contentRating']
# other_info['Metacritic评分'] = res_data['metacriticRating']
# other_info['奖项'] = res_data['awards']
# other_info['TMDB_ID'] = id
if is_series:
other_info['Seasons'] = res_data['number_of_seasons']
other_info['Episodes'] = res_data['number_of_episodes']
# TODO: use GET /configuration to get base url
img_url = ('https://image.tmdb.org/t/p/original/' + res_data['poster_path']) if res_data['poster_path'] is not None else None
pd = ResourceContent(metadata={
'title': title,
'orig_title': orig_title,
'other_title': None,
'imdb_code': imdb_code,
'director': director,
'playwright': playwright,
'actor': actor,
'genre': genre,
'showtime': showtime,
'site': None,
'area': area,
'language': language,
'year': year,
'duration': duration,
'season': None,
'episodes': None,
'single_episode_length': None,
'brief': brief,
'cover_image_url': img_url,
})
if imdb_code:
pd.lookup_ids[IdType.IMDB] = imdb_code
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
return pd
@SiteList.register
class TMDB_TV(AbstractSite):
ID_TYPE = IdType.TMDB_TV
URL_PATTERNS = [r'\w+://www.themoviedb.org/tv/(\d+)[^/]*$', r'\w+://www.themoviedb.org/tv/(\d+)[^/]*/seasons']
WIKI_PROPERTY_ID = '?'
DEFAULT_MODEL = TVShow
@classmethod
def id_to_url(self, id_value):
return f"https://www.themoviedb.org/tv/{id_value}"
def scrape(self):
is_series = True
if is_series:
api_url = f"https://api.themoviedb.org/3/tv/{self.id_value}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
else:
api_url = f"https://api.themoviedb.org/3/movie/{self.id_value}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
res_data = BasicDownloader(api_url).download().json()
if is_series:
title = res_data['name']
orig_title = res_data['original_name']
year = int(res_data['first_air_date'].split(
'-')[0]) if res_data['first_air_date'] else None
imdb_code = res_data['external_ids']['imdb_id']
showtime = [{res_data['first_air_date']: "首播日期"}
] if res_data['first_air_date'] else None
duration = None
else:
title = res_data['title']
orig_title = res_data['original_title']
year = int(res_data['release_date'].split('-')
[0]) if res_data['release_date'] else None
showtime = [{res_data['release_date']: "发布日期"}
] if res_data['release_date'] else None
imdb_code = res_data['imdb_id']
# in minutes
duration = res_data['runtime'] if res_data['runtime'] else None
genre = list(map(lambda x: genre_map[x['name']] if x['name']
in genre_map else 'Other', res_data['genres']))
language = list(map(lambda x: x['name'], res_data['spoken_languages']))
brief = res_data['overview']
if is_series:
director = list(map(lambda x: x['name'], res_data['created_by']))
else:
director = list(map(lambda x: x['name'], filter(
lambda c: c['job'] == 'Director', res_data['credits']['crew'])))
playwright = list(map(lambda x: x['name'], filter(
lambda c: c['job'] == 'Screenplay', res_data['credits']['crew'])))
actor = list(map(lambda x: x['name'], res_data['credits']['cast']))
area = []
other_info = {}
# other_info['TMDB评分'] = res_data['vote_average']
# other_info['分级'] = res_data['contentRating']
# other_info['Metacritic评分'] = res_data['metacriticRating']
# other_info['奖项'] = res_data['awards']
# other_info['TMDB_ID'] = id
if is_series:
other_info['Seasons'] = res_data['number_of_seasons']
other_info['Episodes'] = res_data['number_of_episodes']
# TODO: use GET /configuration to get base url
img_url = ('https://image.tmdb.org/t/p/original/' + res_data['poster_path']) if res_data['poster_path'] is not None else None
season_links = list(map(lambda s: {
'model': 'TVSeason',
'id_type': IdType.TMDB_TVSeason,
'id_value': f'{self.id_value}-{s["season_number"]}',
'title': s['name'],
'url': f'{self.url}/season/{s["season_number"]}'}, res_data['seasons']))
pd = ResourceContent(metadata={
'title': title,
'orig_title': orig_title,
'other_title': None,
'imdb_code': imdb_code,
'director': director,
'playwright': playwright,
'actor': actor,
'genre': genre,
'showtime': showtime,
'site': None,
'area': area,
'language': language,
'year': year,
'duration': duration,
'season': None,
'episodes': None,
'single_episode_length': None,
'brief': brief,
'cover_image_url': img_url,
'related_resources': season_links,
})
if imdb_code:
pd.lookup_ids[IdType.IMDB] = imdb_code
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
return pd
@SiteList.register
class TMDB_TVSeason(AbstractSite):
ID_TYPE = IdType.TMDB_TVSeason
URL_PATTERNS = [r'\w+://www.themoviedb.org/tv/(\d+)[^/]*/season/(\d+)[^/]*$']
WIKI_PROPERTY_ID = '?'
DEFAULT_MODEL = TVSeason
ID_PATTERN = r'^(\d+)-(\d+)$'
@classmethod
def url_to_id(cls, url: str):
u = next(iter([re.match(p, url) for p in cls.URL_PATTERNS if re.match(p, url)]), None)
return u[1] + '-' + u[2] if u else None
@classmethod
def id_to_url(cls, id_value):
v = id_value.split('-')
return f"https://www.themoviedb.org/tv/{v[0]}/season/{v[1]}"
def scrape(self):
v = self.id_value.split('-')
api_url = f"https://api.themoviedb.org/3/tv/{v[0]}/season/{v[1]}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
d = BasicDownloader(api_url).download().json()
if not d.get('id'):
raise ParseError(self, 'id')
pd = ResourceContent(metadata=_copy_dict(d, {'name': 'title', 'overview': 'brief', 'air_date': 'air_date', 'season_number': 0, 'external_ids': 0}))
pd.metadata['required_resources'] = [{
'model': 'TVShow',
'id_type': IdType.TMDB_TV,
'id_value': v[0],
'title': f'TMDB TV Show {v[0]}',
'url': f"https://www.themoviedb.org/tv/{v[0]}",
}]
pd.lookup_ids[IdType.IMDB] = d['external_ids'].get('imdb_id')
pd.metadata['cover_image_url'] = ('https://image.tmdb.org/t/p/original/' + d['poster_path']) if d['poster_path'] else None
pd.metadata['title'] = pd.metadata['title'] if pd.metadata['title'] else f'Season {d["season_number"]}'
pd.metadata['episode_number_list'] = list(map(lambda ep: ep['episode_number'], d['episodes']))
pd.metadata['episode_count'] = len(pd.metadata['episode_number_list'])
if pd.metadata["cover_image_url"]:
imgdl = BasicImageDownloader(pd.metadata["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {pd.metadata["cover_image_url"]}')
# get external id from 1st episode
if pd.lookup_ids[IdType.IMDB]:
_logger.warning("Unexpected IMDB id for TMDB tv season")
elif len(pd.metadata['episode_number_list']) == 0:
_logger.warning("Unable to lookup IMDB id for TMDB tv season with zero episodes")
else:
ep = pd.metadata['episode_number_list'][0]
api_url2 = f"https://api.themoviedb.org/3/tv/{v[0]}/season/{v[1]}/episode/{ep}?api_key={settings.TMDB_API3_KEY}&language=zh-CN&append_to_response=external_ids,credits"
d2 = BasicDownloader(api_url2).download().json()
if not d2.get('id'):
raise ParseError(self, 'episode id for season')
pd.lookup_ids[IdType.IMDB] = d2['external_ids'].get('imdb_id')
return pd

10
catalog/tests.py Normal file
View file

@ -0,0 +1,10 @@
from django.test import TestCase
from catalog.book.tests import *
from catalog.movie.tests import *
from catalog.tv.tests import *
from catalog.music.tests import *
from catalog.game.tests import *
from catalog.podcast.tests import *
from catalog.performance.tests import *
# imported tests with same name might be ignored silently

62
catalog/tv/models.py Normal file
View file

@ -0,0 +1,62 @@
"""
Models for TV
TVShow -> TVSeason -> TVEpisode
TVEpisode is not fully implemented at the moment
Three way linking between Douban / IMDB / TMDB is quite messy
IMDB:
most widely used.
no ID for Season, only for Show and Episode
TMDB:
most friendly API.
some TV specials are shown both as an Episode of Season 0 and as a Movie, with the same IMDB id
Douban:
most wanted by our users.
for a single season show, the IMDB id of the show is used
for a multi season show, the IMDB id of Ep 1 of each season is used to represent that season
tv specials are shown as movies
For now, we follow the Douban convention, but keep an eye on it in case it breaks its own rules...
"""
from catalog.common import *
from django.db import models
class TVShow(Item):
imdb = PrimaryLookupIdDescriptor(IdType.IMDB)
tmdb_tv = PrimaryLookupIdDescriptor(IdType.TMDB_TV)
season_count = jsondata.IntegerField(blank=True, default=None)
class TVSeason(Item):
douban_movie = PrimaryLookupIdDescriptor(IdType.DoubanMovie)
imdb = PrimaryLookupIdDescriptor(IdType.IMDB)
tmdb_tvseason = PrimaryLookupIdDescriptor(IdType.TMDB_TVSeason)
show = models.ForeignKey(TVShow, null=True, on_delete=models.SET_NULL, related_name='seasons')
season_number = models.PositiveIntegerField()
episode_count = jsondata.IntegerField(blank=True, default=None)
METADATA_COPY_LIST = ['title', 'brief', 'season_number', 'episode_count']
def update_linked_items_from_external_resource(self, resource):
"""add Work from resource.metadata['work'] if not yet"""
links = resource.required_resources + resource.related_resources
for w in links:
if w['model'] == 'TVShow':
p = ExternalResource.objects.filter(id_type=w['id_type'], id_value=w['id_value']).first()
if p and p.item and self.show != p.item:
self.show = p.item
class TVEpisode(Item):
show = models.ForeignKey(TVShow, null=True, on_delete=models.SET_NULL, related_name='episodes')
season = models.ForeignKey(TVSeason, null=True, on_delete=models.SET_NULL, related_name='episodes')
episode_number = models.PositiveIntegerField()
imdb = PrimaryLookupIdDescriptor(IdType.IMDB)
METADATA_COPY_LIST = ['title', 'brief', 'episode_number']

128
catalog/tv/tests.py Normal file
View file

@ -0,0 +1,128 @@
from django.test import TestCase
from catalog.common import *
from catalog.tv.models import *
class TMDBTVTestCase(TestCase):
def test_parse(self):
t_id = '57243'
t_url = 'https://www.themoviedb.org/tv/57243-doctor-who'
t_url1 = 'https://www.themoviedb.org/tv/57243-doctor-who/seasons'
t_url2 = 'https://www.themoviedb.org/tv/57243'
p1 = SiteList.get_site_by_id_type(IdType.TMDB_TV)
self.assertIsNotNone(p1)
self.assertEqual(p1.validate_url(t_url), True)
self.assertEqual(p1.validate_url(t_url1), True)
self.assertEqual(p1.validate_url(t_url2), True)
p2 = SiteList.get_site_by_url(t_url)
self.assertEqual(p1.id_to_url(t_id), t_url2)
self.assertEqual(p2.url_to_id(t_url), t_id)
wrong_url = 'https://www.themoviedb.org/tv/57243-doctor-who/season/13'
s1 = SiteList.get_site_by_url(wrong_url)
self.assertNotIsInstance(s1, TVShow)
@use_local_response
def test_scrape(self):
t_url = 'https://www.themoviedb.org/tv/57243-doctor-who'
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.id_value, '57243')
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata['title'], '神秘博士')
self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.resource.item.__class__.__name__, 'TVShow')
self.assertEqual(site.resource.item.imdb, 'tt0436992')
class TMDBTVSeasonTestCase(TestCase):
def test_parse(self):
t_id = '57243-11'
t_url = 'https://www.themoviedb.org/tv/57243-doctor-who/season/11'
t_url_unique = 'https://www.themoviedb.org/tv/57243/season/11'
p1 = SiteList.get_site_by_id_type(IdType.TMDB_TVSeason)
self.assertIsNotNone(p1)
self.assertEqual(p1.validate_url(t_url), True)
self.assertEqual(p1.validate_url(t_url_unique), True)
p2 = SiteList.get_site_by_url(t_url)
self.assertEqual(p1.id_to_url(t_id), t_url_unique)
self.assertEqual(p2.url_to_id(t_url), t_id)
@use_local_response
def test_scrape(self):
t_url = 'https://www.themoviedb.org/tv/57243-doctor-who/season/4'
site = SiteList.get_site_by_url(t_url)
self.assertEqual(site.ready, False)
self.assertEqual(site.id_value, '57243-4')
site.get_resource_ready()
self.assertEqual(site.ready, True)
self.assertEqual(site.resource.metadata['title'], '第 4 季')
self.assertEqual(site.resource.item.primary_lookup_id_type, IdType.IMDB)
self.assertEqual(site.resource.item.__class__.__name__, 'TVSeason')
self.assertEqual(site.resource.item.imdb, 'tt1159991')
self.assertIsNotNone(site.resource.item.show)
self.assertEqual(site.resource.item.show.imdb, 'tt0436992')
class DoubanMovieTVTestCase(TestCase):
@use_local_response
def test_scrape(self):
url3 = 'https://movie.douban.com/subject/3627919/'
p3 = SiteList.get_site_by_url(url3).get_resource_ready()
self.assertEqual(p3.item.__class__.__name__, 'TVSeason')
self.assertIsNotNone(p3.item.show)
self.assertEqual(p3.item.show.imdb, 'tt0436992')
@use_local_response
def test_scrape_singleseason(self):
url3 = 'https://movie.douban.com/subject/26895436/'
p3 = SiteList.get_site_by_url(url3).get_resource_ready()
self.assertEqual(p3.item.__class__.__name__, 'TVShow')
class MultiTVSitesTestCase(TestCase):
@use_local_response
def test_tvshows(self):
url1 = 'https://www.themoviedb.org/tv/57243-doctor-who'
url2 = 'https://www.imdb.com/title/tt0436992/'
# url3 = 'https://movie.douban.com/subject/3541415/'
p1 = SiteList.get_site_by_url(url1).get_resource_ready()
p2 = SiteList.get_site_by_url(url2).get_resource_ready()
# p3 = SiteList.get_site_by_url(url3).get_resource_ready()
self.assertEqual(p1.item.id, p2.item.id)
# self.assertEqual(p2.item.id, p3.item.id)
@use_local_response
def test_tvseasons(self):
url1 = 'https://www.themoviedb.org/tv/57243-doctor-who/season/4'
url2 = 'https://www.imdb.com/title/tt1159991/'
url3 = 'https://movie.douban.com/subject/3627919/'
p1 = SiteList.get_site_by_url(url1).get_resource_ready()
p2 = SiteList.get_site_by_url(url2).get_resource_ready()
p3 = SiteList.get_site_by_url(url3).get_resource_ready()
self.assertEqual(p1.item.imdb, p2.item.imdb)
self.assertEqual(p2.item.imdb, p3.item.imdb)
self.assertEqual(p1.item.id, p2.item.id)
self.assertEqual(p2.item.id, p3.item.id)
@use_local_response
def test_miniseries(self):
url1 = 'https://www.themoviedb.org/tv/86941-the-north-water'
url3 = 'https://movie.douban.com/subject/26895436/'
p1 = SiteList.get_site_by_url(url1).get_resource_ready()
p3 = SiteList.get_site_by_url(url3).get_resource_ready()
self.assertEqual(p3.item.__class__.__name__, 'TVShow')
self.assertEqual(p1.item.id, p3.item.id)
@use_local_response
def test_tvspecial(self):
url1 = 'https://www.themoviedb.org/movie/282758-doctor-who-the-runaway-bride'
url2 = 'https://www.imdb.com/title/tt0827573/'
url3 = 'https://movie.douban.com/subject/4296866/'
p1 = SiteList.get_site_by_url(url1).get_resource_ready()
p2 = SiteList.get_site_by_url(url2).get_resource_ready()
p3 = SiteList.get_site_by_url(url3).get_resource_ready()
self.assertEqual(p1.item.imdb, p2.item.imdb)
self.assertEqual(p2.item.imdb, p3.item.imdb)
self.assertEqual(p1.item.id, p2.item.id)
self.assertEqual(p2.item.id, p3.item.id)

6
catalog/urls.py Normal file
View file

@ -0,0 +1,6 @@
from django.urls import path
from .api import api
urlpatterns = [
path("", api.urls),
]

3
catalog/views.py Normal file
View file

@ -0,0 +1,3 @@
from django.shortcuts import render
# Create your views here.

View file

@ -497,7 +497,7 @@ class DoubanMovieScraper(DoubanScrapperMixin, AbstractScraper):
episodes_elem = content.xpath(
"//div[@id='info']//span[text()='集数:']/following-sibling::text()[1]")
episodes = int(episodes_elem[0].strip()) if episodes_elem and episodes_elem[0].isdigit() else None
episodes = int(episodes_elem[0].strip()) if episodes_elem and episodes_elem[0].strip().isdigit() else None
single_episode_length_elem = content.xpath(
"//div[@id='info']//span[text()='单集片长:']/following-sibling::text()[1]")

View file

@ -8,9 +8,22 @@ from common.scraper import *
from igdb.wrapper import IGDBWrapper
import json
import datetime
import logging
wrapper = IGDBWrapper(settings.IGDB_CLIENT_ID, settings.IGDB_ACCESS_TOKEN)
_logger = logging.getLogger(__name__)
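# Obtain an IGDB app access token via Twitch's OAuth2 client-credentials flow;
# requires IGDB_CLIENT_ID and IGDB_CLIENT_SECRET in settings. On failure, log
# the error and return a placeholder token so the wrapper can still be built.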
def _igdb_access_token():
try:
token = requests.post(f'https://id.twitch.tv/oauth2/token?client_id={settings.IGDB_CLIENT_ID}&client_secret={settings.IGDB_CLIENT_SECRET}&grant_type=client_credentials').json()['access_token']
except Exception:
_logger.error('unable to obtain IGDB token')
token = '<invalid>'
return token
wrapper = IGDBWrapper(settings.IGDB_CLIENT_ID, _igdb_access_token())
class IgdbGameScraper(AbstractScraper):

View file

@ -461,6 +461,11 @@ select::placeholder {
color: #606c76;
}
.navbar .current {
color: #00a1cc;
font-weight: bold;
}
.navbar .navbar__search-box {
margin: 0 12% 0 15px;
display: inline-flex;

File diff suppressed because one or more lines are too long

View file

@ -35,6 +35,10 @@
&:visited
color: $color-secondary
.current
color: $color-primary
font-weight: bold
& &__search-box
margin: 0 12% 0 15px
display: inline-flex

View file

@ -23,13 +23,13 @@
</div>
<button class="navbar__dropdown-btn">• • •</button>
<ul class="navbar__link-list">
{% if request.user.is_authenticated %}
<a class="navbar__link" href="{% url 'users:home' request.user.mastodon_username %}">{% trans '主页' %}</a>
<a class="navbar__link" href="{% url 'timeline:timeline' %}">{% trans '动态' %}</a>
<a class="navbar__link" id="logoutLink" href="{% url 'users:data' %}">{% trans '数据' %}</a>
<a class="navbar__link" id="logoutLink" href="{% url 'users:preferences' %}">{% trans '设置' %}</a>
<a class="navbar__link {% if current == 'home' %}current{% endif %}" href="{% url 'users:home' request.user.mastodon_username %}">{% trans '主页' %}</a>
<a class="navbar__link {% if current == 'timeline' %}current{% endif %}" href="{% url 'timeline:timeline' %}">{% trans '动态' %}</a>
<a class="navbar__link {% if current == 'data' %}current{% endif %}" href="{% url 'users:data' %}">{% trans '数据' %}</a>
<a class="navbar__link {% if current == 'preferences' %}current{% endif %}" href="{% url 'users:preferences' %}">{% trans '设置' %}</a>
<a class="navbar__link" id="logoutLink" href="{% url 'users:logout' %}">{% trans '登出' %}</a>
{% if request.user.is_staff %}
<a class="navbar__link" href="{% admin_url %}">{% trans '后台' %}</a>

View file

@ -8,12 +8,17 @@ django-rq
django-simple-history
django-hijack
django-user-messages
django-slack
#django-ninja
#django-polymorphic
meilisearch
easy-thumbnails
lxml
openpyxl
psycopg2
psycopg2-binary
requests
filetype
setproctitle
tqdm
opencc
dnspython

View file

@ -0,0 +1,303 @@
{
"album_type" : "album",
"artists" : [ {
"external_urls" : {
"spotify" : "https://open.spotify.com/artist/6VsiDFMZJlJ053P1uO4A6h"
},
"href" : "https://api.spotify.com/v1/artists/6VsiDFMZJlJ053P1uO4A6h",
"id" : "6VsiDFMZJlJ053P1uO4A6h",
"name" : "Public Service Broadcasting",
"type" : "artist",
"uri" : "spotify:artist:6VsiDFMZJlJ053P1uO4A6h"
} ],
"available_markets" : [ "AD", "AE", "AG", "AL", "AM", "AO", "AR", "AT", "AZ", "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI", "BJ", "BN", "BO", "BR", "BS", "BT", "BW", "BY", "BZ", "CA", "CD", "CG", "CH", "CI", "CL", "CM", "CO", "CR", "CV", "CW", "CY", "CZ", "DE", "DJ", "DK", "DM", "DO", "DZ", "EC", "EE", "EG", "ES", "FI", "FR", "GA", "GB", "GD", "GE", "GH", "GM", "GN", "GQ", "GR", "GT", "GW", "GY", "HN", "HR", "HT", "HU", "ID", "IE", "IL", "IN", "IQ", "IS", "IT", "JM", "JO", "KE", "KG", "KH", "KM", "KN", "KR", "KW", "KZ", "LA", "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", "LV", "LY", "MA", "MC", "MD", "MG", "MK", "ML", "MN", "MO", "MR", "MT", "MU", "MV", "MW", "MX", "MZ", "NA", "NE", "NG", "NI", "NL", "NO", "NP", "NZ", "OM", "PA", "PE", "PG", "PH", "PK", "PL", "PS", "PT", "PW", "PY", "QA", "RO", "RW", "SA", "SC", "SE", "SI", "SK", "SL", "SM", "SN", "SR", "ST", "SV", "SZ", "TD", "TG", "TH", "TJ", "TL", "TN", "TR", "TT", "TZ", "UA", "UG", "US", "UY", "UZ", "VC", "VE", "VN", "ZA", "ZM", "ZW" ],
"copyrights" : [ {
"text" : "Test Card Recordings",
"type" : "C"
}, {
"text" : "Test Card Recordings",
"type" : "P"
} ],
"external_ids" : {
"upc" : "3610159662676"
},
"external_urls" : {
"spotify" : "https://open.spotify.com/album/65KwtzkJXw7oT819NFWmEP"
},
"genres" : [ ],
"href" : "https://api.spotify.com/v1/albums/65KwtzkJXw7oT819NFWmEP",
"id" : "65KwtzkJXw7oT819NFWmEP",
"images" : [ {
"height" : 640,
"url" : "https://i.scdn.co/image/ab67616d0000b273123ebfc7ca99a9bb6342cd36",
"width" : 640
}, {
"height" : 300,
"url" : "https://i.scdn.co/image/ab67616d00001e02123ebfc7ca99a9bb6342cd36",
"width" : 300
}, {
"height" : 64,
"url" : "https://i.scdn.co/image/ab67616d00004851123ebfc7ca99a9bb6342cd36",
"width" : 64
} ],
"label" : "Test Card Recordings",
"name" : "The Race For Space",
"popularity" : 44,
"release_date" : "2014",
"release_date_precision" : "year",
"total_tracks" : 9,
"tracks" : {
"href" : "https://api.spotify.com/v1/albums/65KwtzkJXw7oT819NFWmEP/tracks?offset=0&limit=50",
"items" : [ {
"artists" : [ {
"external_urls" : {
"spotify" : "https://open.spotify.com/artist/6VsiDFMZJlJ053P1uO4A6h"
},
"href" : "https://api.spotify.com/v1/artists/6VsiDFMZJlJ053P1uO4A6h",
"id" : "6VsiDFMZJlJ053P1uO4A6h",
"name" : "Public Service Broadcasting",
"type" : "artist",
"uri" : "spotify:artist:6VsiDFMZJlJ053P1uO4A6h"
} ],
"available_markets" : [ "AD", "AE", "AG", "AL", "AM", "AO", "AR", "AT", "AZ", "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI", "BJ", "BN", "BO", "BR", "BS", "BT", "BW", "BY", "BZ", "CA", "CD", "CG", "CH", "CI", "CL", "CM", "CO", "CR", "CV", "CW", "CY", "CZ", "DE", "DJ", "DK", "DM", "DO", "DZ", "EC", "EE", "EG", "ES", "ET", "FI", "FR", "GA", "GB", "GD", "GE", "GH", "GM", "GN", "GQ", "GR", "GT", "GW", "GY", "HN", "HR", "HT", "HU", "ID", "IE", "IL", "IN", "IQ", "IS", "IT", "JM", "JO", "KE", "KG", "KH", "KM", "KN", "KR", "KW", "KZ", "LA", "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", "LV", "LY", "MA", "MC", "MD", "MG", "MK", "ML", "MN", "MO", "MR", "MT", "MU", "MV", "MW", "MX", "MZ", "NA", "NE", "NG", "NI", "NL", "NO", "NP", "NZ", "OM", "PA", "PE", "PG", "PH", "PK", "PL", "PS", "PT", "PW", "PY", "QA", "RO", "RW", "SA", "SC", "SE", "SI", "SK", "SL", "SM", "SN", "SR", "ST", "SV", "SZ", "TD", "TG", "TH", "TJ", "TL", "TN", "TR", "TT", "TZ", "UA", "UG", "US", "UY", "UZ", "VC", "VE", "VN", "ZA", "ZM", "ZW" ],
"disc_number" : 1,
"duration_ms" : 159859,
"explicit" : false,
"external_urls" : {
"spotify" : "https://open.spotify.com/track/3982V8R7oW3xyV8zASbCGG"
},
"href" : "https://api.spotify.com/v1/tracks/3982V8R7oW3xyV8zASbCGG",
"id" : "3982V8R7oW3xyV8zASbCGG",
"is_local" : false,
"name" : "The Race For Space",
"preview_url" : "https://p.scdn.co/mp3-preview/cc69663d5b6a7982e5f162e625f1b319b26956ec?cid=4b150d8d6d374d1e8dbb85f4f11a2ad9",
"track_number" : 1,
"type" : "track",
"uri" : "spotify:track:3982V8R7oW3xyV8zASbCGG"
}, {
"artists" : [ {
"external_urls" : {
"spotify" : "https://open.spotify.com/artist/6VsiDFMZJlJ053P1uO4A6h"
},
"href" : "https://api.spotify.com/v1/artists/6VsiDFMZJlJ053P1uO4A6h",
"id" : "6VsiDFMZJlJ053P1uO4A6h",
"name" : "Public Service Broadcasting",
"type" : "artist",
"uri" : "spotify:artist:6VsiDFMZJlJ053P1uO4A6h"
} ],
"available_markets" : [ "AD", "AE", "AG", "AL", "AM", "AO", "AR", "AT", "AZ", "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI", "BJ", "BN", "BO", "BR", "BS", "BT", "BW", "BY", "BZ", "CA", "CD", "CG", "CH", "CI", "CL", "CM", "CO", "CR", "CV", "CW", "CY", "CZ", "DE", "DJ", "DK", "DM", "DO", "DZ", "EC", "EE", "EG", "ES", "ET", "FI", "FR", "GA", "GB", "GD", "GE", "GH", "GM", "GN", "GQ", "GR", "GT", "GW", "GY", "HN", "HR", "HT", "HU", "ID", "IE", "IL", "IN", "IQ", "IS", "IT", "JM", "JO", "KE", "KG", "KH", "KM", "KN", "KR", "KW", "KZ", "LA", "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", "LV", "LY", "MA", "MC", "MD", "MG", "MK", "ML", "MN", "MO", "MR", "MT", "MU", "MV", "MW", "MX", "MZ", "NA", "NE", "NG", "NI", "NL", "NO", "NP", "NZ", "OM", "PA", "PE", "PG", "PH", "PK", "PL", "PS", "PT", "PW", "PY", "QA", "RO", "RW", "SA", "SC", "SE", "SI", "SK", "SL", "SM", "SN", "SR", "ST", "SV", "SZ", "TD", "TG", "TH", "TJ", "TL", "TN", "TR", "TT", "TZ", "UA", "UG", "US", "UY", "UZ", "VC", "VE", "VN", "ZA", "ZM", "ZW" ],
"disc_number" : 1,
"duration_ms" : 429374,
"explicit" : false,
"external_urls" : {
"spotify" : "https://open.spotify.com/track/4EhQrGzqi8k24qWIJuG5CH"
},
"href" : "https://api.spotify.com/v1/tracks/4EhQrGzqi8k24qWIJuG5CH",
"id" : "4EhQrGzqi8k24qWIJuG5CH",
"is_local" : false,
"name" : "Sputnik",
"preview_url" : "https://p.scdn.co/mp3-preview/32ccf0b8f7ef1251c35e97acb405e4e7cc2660d2?cid=4b150d8d6d374d1e8dbb85f4f11a2ad9",
"track_number" : 2,
"type" : "track",
"uri" : "spotify:track:4EhQrGzqi8k24qWIJuG5CH"
}, {
"artists" : [ {
"external_urls" : {
"spotify" : "https://open.spotify.com/artist/6VsiDFMZJlJ053P1uO4A6h"
},
"href" : "https://api.spotify.com/v1/artists/6VsiDFMZJlJ053P1uO4A6h",
"id" : "6VsiDFMZJlJ053P1uO4A6h",
"name" : "Public Service Broadcasting",
"type" : "artist",
"uri" : "spotify:artist:6VsiDFMZJlJ053P1uO4A6h"
} ],
"available_markets" : [ "AD", "AE", "AG", "AL", "AM", "AO", "AR", "AT", "AZ", "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI", "BJ", "BN", "BO", "BR", "BS", "BT", "BW", "BY", "BZ", "CA", "CD", "CG", "CH", "CI", "CL", "CM", "CO", "CR", "CV", "CW", "CY", "CZ", "DE", "DJ", "DK", "DM", "DO", "DZ", "EC", "EE", "EG", "ES", "ET", "FI", "FR", "GA", "GB", "GD", "GE", "GH", "GM", "GN", "GQ", "GR", "GT", "GW", "GY", "HN", "HR", "HT", "HU", "ID", "IE", "IL", "IN", "IQ", "IS", "IT", "JM", "JO", "KE", "KG", "KH", "KM", "KN", "KR", "KW", "KZ", "LA", "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", "LV", "LY", "MA", "MC", "MD", "MG", "MK", "ML", "MN", "MO", "MR", "MT", "MU", "MV", "MW", "MX", "MZ", "NA", "NE", "NG", "NI", "NL", "NO", "NP", "NZ", "OM", "PA", "PE", "PG", "PH", "PK", "PL", "PS", "PT", "PW", "PY", "QA", "RO", "RW", "SA", "SC", "SE", "SI", "SK", "SL", "SM", "SN", "SR", "ST", "SV", "SZ", "TD", "TG", "TH", "TJ", "TL", "TN", "TR", "TT", "TZ", "UA", "UG", "US", "UY", "UZ", "VC", "VE", "VN", "ZA", "ZM", "ZW" ],
"disc_number" : 1,
"duration_ms" : 228623,
"explicit" : false,
"external_urls" : {
"spotify" : "https://open.spotify.com/track/4IaRxPHdzLJ78tm7lxg9M8"
},
"href" : "https://api.spotify.com/v1/tracks/4IaRxPHdzLJ78tm7lxg9M8",
"id" : "4IaRxPHdzLJ78tm7lxg9M8",
"is_local" : false,
"name" : "Gagarin",
"preview_url" : "https://p.scdn.co/mp3-preview/1d91010dc50a73caa3831c4617f3d658ae279339?cid=4b150d8d6d374d1e8dbb85f4f11a2ad9",
"track_number" : 3,
"type" : "track",
"uri" : "spotify:track:4IaRxPHdzLJ78tm7lxg9M8"
}, {
"artists" : [ {
"external_urls" : {
"spotify" : "https://open.spotify.com/artist/6VsiDFMZJlJ053P1uO4A6h"
},
"href" : "https://api.spotify.com/v1/artists/6VsiDFMZJlJ053P1uO4A6h",
"id" : "6VsiDFMZJlJ053P1uO4A6h",
"name" : "Public Service Broadcasting",
"type" : "artist",
"uri" : "spotify:artist:6VsiDFMZJlJ053P1uO4A6h"
} ],
"available_markets" : [ "AD", "AE", "AG", "AL", "AM", "AO", "AR", "AT", "AZ", "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI", "BJ", "BN", "BO", "BR", "BS", "BT", "BW", "BY", "BZ", "CA", "CD", "CG", "CH", "CI", "CL", "CM", "CO", "CR", "CV", "CW", "CY", "CZ", "DE", "DJ", "DK", "DM", "DO", "DZ", "EC", "EE", "EG", "ES", "ET", "FI", "FR", "GA", "GB", "GD", "GE", "GH", "GM", "GN", "GQ", "GR", "GT", "GW", "GY", "HN", "HR", "HT", "HU", "ID", "IE", "IL", "IN", "IQ", "IS", "IT", "JM", "JO", "KE", "KG", "KH", "KM", "KN", "KR", "KW", "KZ", "LA", "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", "LV", "LY", "MA", "MC", "MD", "MG", "MK", "ML", "MN", "MO", "MR", "MT", "MU", "MV", "MW", "MX", "MZ", "NA", "NE", "NG", "NI", "NL", "NO", "NP", "NZ", "OM", "PA", "PE", "PG", "PH", "PK", "PL", "PS", "PT", "PW", "PY", "QA", "RO", "RW", "SA", "SC", "SE", "SI", "SK", "SL", "SM", "SN", "SR", "ST", "SV", "SZ", "TD", "TG", "TH", "TJ", "TL", "TN", "TR", "TT", "TZ", "UA", "UG", "US", "UY", "UZ", "VC", "VE", "VN", "ZA", "ZM", "ZW" ],
"disc_number" : 1,
"duration_ms" : 181621,
"explicit" : false,
"external_urls" : {
"spotify" : "https://open.spotify.com/track/6SONXH9dJQgDY9vCjdkZfK"
},
"href" : "https://api.spotify.com/v1/tracks/6SONXH9dJQgDY9vCjdkZfK",
"id" : "6SONXH9dJQgDY9vCjdkZfK",
"is_local" : false,
"name" : "Fire in the Cockpit",
"preview_url" : "https://p.scdn.co/mp3-preview/a2180cec25187fa80ddc80dcbe36edda1cc169cc?cid=4b150d8d6d374d1e8dbb85f4f11a2ad9",
"track_number" : 4,
"type" : "track",
"uri" : "spotify:track:6SONXH9dJQgDY9vCjdkZfK"
}, {
"artists" : [ {
"external_urls" : {
"spotify" : "https://open.spotify.com/artist/6VsiDFMZJlJ053P1uO4A6h"
},
"href" : "https://api.spotify.com/v1/artists/6VsiDFMZJlJ053P1uO4A6h",
"id" : "6VsiDFMZJlJ053P1uO4A6h",
"name" : "Public Service Broadcasting",
"type" : "artist",
"uri" : "spotify:artist:6VsiDFMZJlJ053P1uO4A6h"
} ],
"available_markets" : [ "AD", "AE", "AG", "AL", "AM", "AO", "AR", "AT", "AZ", "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI", "BJ", "BN", "BO", "BR", "BS", "BT", "BW", "BY", "BZ", "CA", "CD", "CG", "CH", "CI", "CL", "CM", "CO", "CR", "CV", "CW", "CY", "CZ", "DE", "DJ", "DK", "DM", "DO", "DZ", "EC", "EE", "EG", "ES", "ET", "FI", "FR", "GA", "GB", "GD", "GE", "GH", "GM", "GN", "GQ", "GR", "GT", "GW", "GY", "HN", "HR", "HT", "HU", "ID", "IE", "IL", "IN", "IQ", "IS", "IT", "JM", "JO", "KE", "KG", "KH", "KM", "KN", "KR", "KW", "KZ", "LA", "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", "LV", "LY", "MA", "MC", "MD", "MG", "MK", "ML", "MN", "MO", "MR", "MT", "MU", "MV", "MW", "MX", "MZ", "NA", "NE", "NG", "NI", "NL", "NO", "NP", "NZ", "OM", "PA", "PE", "PG", "PH", "PK", "PL", "PS", "PT", "PW", "PY", "QA", "RO", "RW", "SA", "SC", "SE", "SI", "SK", "SL", "SM", "SN", "SR", "ST", "SV", "SZ", "TD", "TG", "TH", "TJ", "TL", "TN", "TR", "TT", "TZ", "UA", "UG", "US", "UY", "UZ", "VC", "VE", "VN", "ZA", "ZM", "ZW" ],
"disc_number" : 1,
"duration_ms" : 255606,
"explicit" : false,
"external_urls" : {
"spotify" : "https://open.spotify.com/track/52KMWPHDL84oo2Ncj3O6RX"
},
"href" : "https://api.spotify.com/v1/tracks/52KMWPHDL84oo2Ncj3O6RX",
"id" : "52KMWPHDL84oo2Ncj3O6RX",
"is_local" : false,
"name" : "E.V.A.",
"preview_url" : "https://p.scdn.co/mp3-preview/732171a4a5e27540b6709602b4af9662fda98595?cid=4b150d8d6d374d1e8dbb85f4f11a2ad9",
"track_number" : 5,
"type" : "track",
"uri" : "spotify:track:52KMWPHDL84oo2Ncj3O6RX"
}, {
"artists" : [ {
"external_urls" : {
"spotify" : "https://open.spotify.com/artist/6VsiDFMZJlJ053P1uO4A6h"
},
"href" : "https://api.spotify.com/v1/artists/6VsiDFMZJlJ053P1uO4A6h",
"id" : "6VsiDFMZJlJ053P1uO4A6h",
"name" : "Public Service Broadcasting",
"type" : "artist",
"uri" : "spotify:artist:6VsiDFMZJlJ053P1uO4A6h"
} ],
"available_markets" : [ "AD", "AE", "AG", "AL", "AM", "AO", "AR", "AT", "AZ", "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI", "BJ", "BN", "BO", "BR", "BS", "BT", "BW", "BY", "BZ", "CA", "CD", "CG", "CH", "CI", "CL", "CM", "CO", "CR", "CV", "CW", "CY", "CZ", "DE", "DJ", "DK", "DM", "DO", "DZ", "EC", "EE", "EG", "ES", "ET", "FI", "FR", "GA", "GB", "GD", "GE", "GH", "GM", "GN", "GQ", "GR", "GT", "GW", "GY", "HN", "HR", "HT", "HU", "ID", "IE", "IL", "IN", "IQ", "IS", "IT", "JM", "JO", "KE", "KG", "KH", "KM", "KN", "KR", "KW", "KZ", "LA", "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", "LV", "LY", "MA", "MC", "MD", "MG", "MK", "ML", "MN", "MO", "MR", "MT", "MU", "MV", "MW", "MX", "MZ", "NA", "NE", "NG", "NI", "NL", "NO", "NP", "NZ", "OM", "PA", "PE", "PG", "PH", "PK", "PL", "PS", "PT", "PW", "PY", "QA", "RO", "RW", "SA", "SC", "SE", "SI", "SK", "SL", "SM", "SN", "SR", "ST", "SV", "SZ", "TD", "TG", "TH", "TJ", "TL", "TN", "TR", "TT", "TZ", "UA", "UG", "US", "UY", "UZ", "VC", "VE", "VN", "ZA", "ZM", "ZW" ],
"disc_number" : 1,
"duration_ms" : 379931,
"explicit" : false,
"external_urls" : {
"spotify" : "https://open.spotify.com/track/3jjMyq44OIjNgmpXLhpw7W"
},
"href" : "https://api.spotify.com/v1/tracks/3jjMyq44OIjNgmpXLhpw7W",
"id" : "3jjMyq44OIjNgmpXLhpw7W",
"is_local" : false,
"name" : "The Other Side",
"preview_url" : "https://p.scdn.co/mp3-preview/5eda4958044595b36842f2362799d91f080a7357?cid=4b150d8d6d374d1e8dbb85f4f11a2ad9",
"track_number" : 6,
"type" : "track",
"uri" : "spotify:track:3jjMyq44OIjNgmpXLhpw7W"
}, {
"artists" : [ {
"external_urls" : {
"spotify" : "https://open.spotify.com/artist/6VsiDFMZJlJ053P1uO4A6h"
},
"href" : "https://api.spotify.com/v1/artists/6VsiDFMZJlJ053P1uO4A6h",
"id" : "6VsiDFMZJlJ053P1uO4A6h",
"name" : "Public Service Broadcasting",
"type" : "artist",
"uri" : "spotify:artist:6VsiDFMZJlJ053P1uO4A6h"
}, {
"external_urls" : {
"spotify" : "https://open.spotify.com/artist/7wbZFLV3wwTqyrKNCJ8Y8D"
},
"href" : "https://api.spotify.com/v1/artists/7wbZFLV3wwTqyrKNCJ8Y8D",
"id" : "7wbZFLV3wwTqyrKNCJ8Y8D",
"name" : "Smoke Fairies",
"type" : "artist",
"uri" : "spotify:artist:7wbZFLV3wwTqyrKNCJ8Y8D"
} ],
"available_markets" : [ "AD", "AE", "AG", "AL", "AM", "AO", "AR", "AT", "AZ", "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI", "BJ", "BN", "BO", "BR", "BS", "BT", "BW", "BY", "BZ", "CA", "CD", "CG", "CH", "CI", "CL", "CM", "CO", "CR", "CV", "CW", "CY", "CZ", "DE", "DJ", "DK", "DM", "DO", "DZ", "EC", "EE", "EG", "ES", "ET", "FI", "FR", "GA", "GB", "GD", "GE", "GH", "GM", "GN", "GQ", "GR", "GT", "GW", "GY", "HN", "HR", "HT", "HU", "ID", "IE", "IL", "IN", "IQ", "IS", "IT", "JM", "JO", "KE", "KG", "KH", "KM", "KN", "KR", "KW", "KZ", "LA", "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", "LV", "LY", "MA", "MC", "MD", "MG", "MK", "ML", "MN", "MO", "MR", "MT", "MU", "MV", "MW", "MX", "MZ", "NA", "NE", "NG", "NI", "NL", "NO", "NP", "NZ", "OM", "PA", "PE", "PG", "PH", "PK", "PL", "PS", "PT", "PW", "PY", "QA", "RO", "RW", "SA", "SC", "SE", "SI", "SK", "SL", "SM", "SN", "SR", "ST", "SV", "SZ", "TD", "TG", "TH", "TJ", "TL", "TN", "TR", "TT", "TZ", "UA", "UG", "US", "UY", "UZ", "VC", "VE", "VN", "ZA", "ZM", "ZW" ],
"disc_number" : 1,
"duration_ms" : 269376,
"explicit" : false,
"external_urls" : {
"spotify" : "https://open.spotify.com/track/5Um9ghqMlKALp9AcRMIk7B"
},
"href" : "https://api.spotify.com/v1/tracks/5Um9ghqMlKALp9AcRMIk7B",
"id" : "5Um9ghqMlKALp9AcRMIk7B",
"is_local" : false,
"name" : "Valentina",
"preview_url" : "https://p.scdn.co/mp3-preview/9e812bde9e2944d22f1eae78eab2adb89ce1f1cd?cid=4b150d8d6d374d1e8dbb85f4f11a2ad9",
"track_number" : 7,
"type" : "track",
"uri" : "spotify:track:5Um9ghqMlKALp9AcRMIk7B"
}, {
"artists" : [ {
"external_urls" : {
"spotify" : "https://open.spotify.com/artist/6VsiDFMZJlJ053P1uO4A6h"
},
"href" : "https://api.spotify.com/v1/artists/6VsiDFMZJlJ053P1uO4A6h",
"id" : "6VsiDFMZJlJ053P1uO4A6h",
"name" : "Public Service Broadcasting",
"type" : "artist",
"uri" : "spotify:artist:6VsiDFMZJlJ053P1uO4A6h"
} ],
"available_markets" : [ "AD", "AE", "AG", "AL", "AM", "AO", "AR", "AT", "AZ", "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI", "BJ", "BN", "BO", "BR", "BS", "BT", "BW", "BY", "BZ", "CA", "CD", "CG", "CH", "CI", "CL", "CM", "CO", "CR", "CV", "CW", "CY", "CZ", "DE", "DJ", "DK", "DM", "DO", "DZ", "EC", "EE", "EG", "ES", "ET", "FI", "FR", "GA", "GB", "GD", "GE", "GH", "GM", "GN", "GQ", "GR", "GT", "GW", "GY", "HN", "HR", "HT", "HU", "ID", "IE", "IL", "IN", "IQ", "IS", "IT", "JM", "JO", "KE", "KG", "KH", "KM", "KN", "KR", "KW", "KZ", "LA", "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", "LV", "LY", "MA", "MC", "MD", "MG", "MK", "ML", "MN", "MO", "MR", "MT", "MU", "MV", "MW", "MX", "MZ", "NA", "NE", "NG", "NI", "NL", "NO", "NP", "NZ", "OM", "PA", "PE", "PG", "PH", "PK", "PL", "PS", "PT", "PW", "PY", "QA", "RO", "RW", "SA", "SC", "SE", "SI", "SK", "SL", "SM", "SN", "SR", "ST", "SV", "SZ", "TD", "TG", "TH", "TJ", "TL", "TN", "TR", "TT", "TZ", "UA", "UG", "US", "UY", "UZ", "VC", "VE", "VN", "ZA", "ZM", "ZW" ],
"disc_number" : 1,
"duration_ms" : 252720,
"explicit" : false,
"external_urls" : {
"spotify" : "https://open.spotify.com/track/5xYZXIgVAND5sWjN8G0hID"
},
"href" : "https://api.spotify.com/v1/tracks/5xYZXIgVAND5sWjN8G0hID",
"id" : "5xYZXIgVAND5sWjN8G0hID",
"is_local" : false,
"name" : "Go!",
"preview_url" : "https://p.scdn.co/mp3-preview/a7f4e9d98224dea630ee6604938848c3fd0c2842?cid=4b150d8d6d374d1e8dbb85f4f11a2ad9",
"track_number" : 8,
"type" : "track",
"uri" : "spotify:track:5xYZXIgVAND5sWjN8G0hID"
}, {
"artists" : [ {
"external_urls" : {
"spotify" : "https://open.spotify.com/artist/6VsiDFMZJlJ053P1uO4A6h"
},
"href" : "https://api.spotify.com/v1/artists/6VsiDFMZJlJ053P1uO4A6h",
"id" : "6VsiDFMZJlJ053P1uO4A6h",
"name" : "Public Service Broadcasting",
"type" : "artist",
"uri" : "spotify:artist:6VsiDFMZJlJ053P1uO4A6h"
} ],
"available_markets" : [ "AD", "AE", "AG", "AL", "AM", "AO", "AR", "AT", "AZ", "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI", "BJ", "BN", "BO", "BR", "BS", "BT", "BW", "BY", "BZ", "CA", "CD", "CG", "CH", "CI", "CL", "CM", "CO", "CR", "CV", "CW", "CY", "CZ", "DE", "DJ", "DK", "DM", "DO", "DZ", "EC", "EE", "EG", "ES", "ET", "FI", "FR", "GA", "GB", "GD", "GE", "GH", "GM", "GN", "GQ", "GR", "GT", "GW", "GY", "HN", "HR", "HT", "HU", "ID", "IE", "IL", "IN", "IQ", "IS", "IT", "JM", "JO", "KE", "KG", "KH", "KM", "KN", "KR", "KW", "KZ", "LA", "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", "LV", "LY", "MA", "MC", "MD", "MG", "MK", "ML", "MN", "MO", "MR", "MT", "MU", "MV", "MW", "MX", "MZ", "NA", "NE", "NG", "NI", "NL", "NO", "NP", "NZ", "OM", "PA", "PE", "PG", "PH", "PK", "PL", "PS", "PT", "PW", "PY", "QA", "RO", "RW", "SA", "SC", "SE", "SI", "SK", "SL", "SM", "SN", "SR", "ST", "SV", "SZ", "TD", "TG", "TH", "TJ", "TL", "TN", "TR", "TT", "TZ", "UA", "UG", "US", "UY", "UZ", "VC", "VE", "VN", "ZA", "ZM", "ZW" ],
"disc_number" : 1,
"duration_ms" : 442359,
"explicit" : false,
"external_urls" : {
"spotify" : "https://open.spotify.com/track/5ERrJuNLnmHj525ooOKyqJ"
},
"href" : "https://api.spotify.com/v1/tracks/5ERrJuNLnmHj525ooOKyqJ",
"id" : "5ERrJuNLnmHj525ooOKyqJ",
"is_local" : false,
"name" : "Tomorrow",
"preview_url" : "https://p.scdn.co/mp3-preview/779a285aca862b886613815a0c1d1817446b550e?cid=4b150d8d6d374d1e8dbb85f4f11a2ad9",
"track_number" : 9,
"type" : "track",
"uri" : "spotify:track:5ERrJuNLnmHj525ooOKyqJ"
} ],
"limit" : 50,
"next" : null,
"offset" : 0,
"previous" : null,
"total" : 9
},
"type" : "album",
"uri" : "spotify:album:65KwtzkJXw7oT819NFWmEP"
}

View file

@ -0,0 +1 @@
{"movie_results":[],"person_results":[],"tv_results":[{"adult":false,"backdrop_path":"/sRfl6vyzGWutgG0cmXmbChC4iN6.jpg","id":57243,"name":"神秘博士","original_language":"en","original_name":"Doctor Who","overview":"名为“博士”的宇宙最后一个时间领主有着重生的能力、体力及优越的智力利用时光机器TARDIS英国传统的蓝色警亭展开他勇敢的时光冒险之旅拯救外星生物、地球与时空。","poster_path":"/sz4zF5z9zyFh8Z6g5IQPNq91cI7.jpg","media_type":"tv","genre_ids":[10759,18,10765],"popularity":158.575,"first_air_date":"2005-03-26","vote_average":7.402,"vote_count":2475,"origin_country":["GB"]}],"tv_episode_results":[],"tv_season_results":[]}

View file

@ -0,0 +1 @@
{"movie_results":[{"adult":false,"backdrop_path":"/13qDzilftzRZMUEHcpi57VLqNPw.jpg","id":282758,"title":"神秘博士:逃跑新娘","original_language":"en","original_title":"Doctor Who: The Runaway Bride","overview":"失去了罗斯的博士正在心灰意冷而正在举行婚礼的多娜却被突然传送到塔迪斯里。博士带坏脾气的多娜返回地球却被一群外星机器人追杀塔迪斯上演了一场公路飚车。后来博士发现多娜身上带有异常含量的Huon粒子而该粒子来源于上一代宇宙霸主。而博士的母星加利弗雷在宇宙中崛起时已经消灭了所有的Huon粒子。最终博士揭开了一个藏于地球40亿年的秘密。","poster_path":"/gkTCC4VLv8jATM3kouAUK3EaoGd.jpg","media_type":"movie","genre_ids":[878],"popularity":7.214,"release_date":"2006-12-25","video":false,"vote_average":7.739,"vote_count":201}],"person_results":[],"tv_results":[],"tv_episode_results":[{"id":1008547,"name":"2006年圣诞特辑逃跑新娘","overview":"失去了罗斯的博士正在心灰意冷而正在举行婚礼的多娜却被突然传送到塔迪斯里。博士带坏脾气的多娜返回地球却被一群外星机器人追杀塔迪斯上演了一场公路飚车。后来博士发现多娜身上带有异常含量的Huon粒子而该粒子来源于上一代宇宙霸主。而博士的母星加利弗雷在宇宙中崛起时已经消灭了所有的Huon粒子。最终博士揭开了一个藏于地球40亿年的秘密。","media_type":"tv_episode","vote_average":6.8,"vote_count":14,"air_date":"2006-12-25","episode_number":4,"production_code":"NCFT094N","runtime":64,"season_number":0,"show_id":57243,"still_path":"/mkJufoqvEBMVvnVUjYlR9lGarZB.jpg"}],"tv_season_results":[]}

View file

@ -0,0 +1 @@
{"movie_results":[],"person_results":[],"tv_results":[],"tv_episode_results":[{"id":941505,"name":"活宝搭档","overview":"博士在伦敦发现艾迪派斯公司新产品药物有问题人类服用后会悄悄的产生土豆状生物并在夜里1点10分逃走回到保姆身边于是博士潜入公司决定探查究竟在探查时遇到了多娜原来Adiposian人丢失了他们的繁育星球于是跑到地球利用人类做代孕母繁殖宝宝。最后保姆在高空中被抛弃脂肪球回到了父母身边博士邀请多娜一同旅行。【Rose从平行宇宙回归】","media_type":"tv_episode","vote_average":7.2,"vote_count":43,"air_date":"2008-04-05","episode_number":1,"production_code":"","runtime":null,"season_number":4,"show_id":57243,"still_path":"/cq1zrCS267vGXa3rCYQkVKNJE9v.jpg"}],"tv_season_results":[]}

View file

@ -0,0 +1 @@
{"movie_results":[{"adult":false,"backdrop_path":"/s3TBrRGB1iav7gFOCNx3H31MoES.jpg","id":27205,"title":"盗梦空间","original_language":"en","original_title":"Inception","overview":"道姆·柯布与同事阿瑟和纳什在一次针对日本能源大亨齐藤的盗梦行动中失败,反被齐藤利用。齐藤威逼利诱因遭通缉而流亡海外的柯布帮他拆分他竞争对手的公司,采取极端措施在其唯一继承人罗伯特·费希尔的深层潜意识中种下放弃家族公司、自立门户的想法。为了重返美国,柯布偷偷求助于岳父迈尔斯,吸收了年轻的梦境设计师艾里阿德妮、梦境演员艾姆斯和药剂师约瑟夫加入行动。在一层层递进的梦境中,柯布不仅要对付费希尔潜意识的本能反抗,还必须直面已逝妻子梅的处处破坏,实际情况远比预想危险得多…","poster_path":"/lQEjWasu07JbQHdfFI5VnEUfId2.jpg","media_type":"movie","genre_ids":[28,878,12],"popularity":74.425,"release_date":"2010-07-15","video":false,"vote_average":8.359,"vote_count":32695}],"person_results":[],"tv_results":[],"tv_episode_results":[],"tv_season_results":[]}

View file

@ -0,0 +1 @@
{"movie_results":[],"person_results":[],"tv_results":[{"adult":false,"backdrop_path":"/8IC1q0lHFwi5m8VtChLzIfmpaZH.jpg","id":86941,"name":"北海鲸梦","original_language":"en","original_name":"The North Water","overview":"改编自伊恩·麦奎尔的同名获奖小说聚焦19世纪一次灾难性的捕鲸活动。故事围绕帕特里克·萨姆纳展开他是一名声名狼藉的前战地医生后成为捕鲸船上的医生在船上遇到了鱼叉手亨利·德拉克斯一个残忍、不道德的杀手。萨姆纳没有逃离过去的恐惧而是被迫在北极荒原上为生存而进行残酷的斗争...","poster_path":"/9CM0ca8pX1os3SJ24hsIc0nN8ph.jpg","media_type":"tv","genre_ids":[18,9648],"popularity":11.318,"first_air_date":"2021-07-14","vote_average":7.5,"vote_count":75,"origin_country":["US"]}],"tv_episode_results":[],"tv_season_results":[]}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@ -0,0 +1,10 @@
{
"resultCount":1,
"results": [
{"wrapperType":"track", "kind":"podcast", "artistId":127981066, "collectionId":1050430296, "trackId":1050430296, "artistName":"WNYC Studios and The New Yorker", "collectionName":"The New Yorker Radio Hour", "trackName":"The New Yorker Radio Hour", "collectionCensoredName":"The New Yorker Radio Hour", "trackCensoredName":"The New Yorker Radio Hour", "artistViewUrl":"https://podcasts.apple.com/us/artist/wnyc/127981066?uo=4", "collectionViewUrl":"https://podcasts.apple.com/us/podcast/the-new-yorker-radio-hour/id1050430296?uo=4", "feedUrl":"http://feeds.feedburner.com/newyorkerradiohour", "trackViewUrl":"https://podcasts.apple.com/us/podcast/the-new-yorker-radio-hour/id1050430296?uo=4", "artworkUrl30":"https://is2-ssl.mzstatic.com/image/thumb/Podcasts115/v4/e3/83/42/e38342fa-712d-ec74-2f31-946601e04e27/mza_2714925949638887112.png/30x30bb.jpg", "artworkUrl60":"https://is2-ssl.mzstatic.com/image/thumb/Podcasts115/v4/e3/83/42/e38342fa-712d-ec74-2f31-946601e04e27/mza_2714925949638887112.png/60x60bb.jpg", "artworkUrl100":"https://is2-ssl.mzstatic.com/image/thumb/Podcasts115/v4/e3/83/42/e38342fa-712d-ec74-2f31-946601e04e27/mza_2714925949638887112.png/100x100bb.jpg", "collectionPrice":0.00, "trackPrice":0.00, "collectionHdPrice":0, "releaseDate":"2022-11-29T11:00:00Z", "collectionExplicitness":"notExplicit", "trackExplicitness":"cleaned", "trackCount":150, "trackTimeMillis":1097, "country":"USA", "currency":"USD", "primaryGenreName":"News Commentary", "contentAdvisoryRating":"Clean", "artworkUrl600":"https://is2-ssl.mzstatic.com/image/thumb/Podcasts115/v4/e3/83/42/e38342fa-712d-ec74-2f31-946601e04e27/mza_2714925949638887112.png/600x600bb.jpg", "genreIds":["1530", "26", "1489", "1527"], "genres":["News Commentary", "Podcasts", "News", "Politics"]}]
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,75 @@
{
"kind": "books#volume",
"id": "hV--zQEACAAJ",
"etag": "lwbqGlV/h5s",
"selfLink": "https://www.googleapis.com/books/v1/volumes/hV--zQEACAAJ",
"volumeInfo": {
"title": "1984 Nineteen Eighty-Four",
"authors": [
"George Orwell"
],
"publisher": "Alma Classics",
"publishedDate": "2021-01-07",
"description": "In 1984, London is a grim city in the totalitarian state of Oceania where Big Brother is always watching you and the Thought Police can practically read your mind. Winston Smith is a man in grave danger for the simple reason that his memory still functions. Drawn into a forbidden love affair, Winston finds the courage to join a secret revolutionary organization called The Brotherhood, dedicated to the destruction of the Party. Together with his beloved Julia, he hazards his life in a deadly match against the powers that be.Lionel Trilling said of Orwell's masterpiece \" 1984 is a profound, terrifying, and wholly fascinating book. It is a fantasy of the political future, and like any such fantasy, serves its author as a magnifying device for an examination of the present.\" Though the year 1984 now exists in the past, Orwell's novel remains an urgent call for the individual willing to speak truth to power.\"",
"industryIdentifiers": [
{
"type": "ISBN_10",
"identifier": "1847498574"
},
{
"type": "ISBN_13",
"identifier": "9781847498571"
}
],
"readingModes": {
"text": false,
"image": false
},
"pageCount": 400,
"printedPageCount": 400,
"dimensions": {
"height": "19.90 cm",
"width": "13.10 cm",
"thickness": "2.20 cm"
},
"printType": "BOOK",
"averageRating": 4,
"ratingsCount": 564,
"maturityRating": "NOT_MATURE",
"allowAnonLogging": false,
"contentVersion": "preview-1.0.0",
"panelizationSummary": {
"containsEpubBubbles": false,
"containsImageBubbles": false
},
"imageLinks": {
"smallThumbnail": "http://books.google.com/books/content?id=hV--zQEACAAJ&printsec=frontcover&img=1&zoom=5&imgtk=AFLRE72QQ6bzD4LfhArQGJHoUdX5wex-wfg5FVAKOo2MbmCbFSF_HbDUwhZ-gAvmSKiEBTyoRkC3Kvbo9k1jB0uiOyOXcvgAc2643MV091Ny8TySRaV2HSVXtch-MYK2qfzNvUKwGEhx&source=gbs_api",
"thumbnail": "http://books.google.com/books/content?id=hV--zQEACAAJ&printsec=frontcover&img=1&zoom=1&imgtk=AFLRE70UTuB9rf2_mqyGrJGsI2XbzpjV2vGQP9Oyjc441rCvvRiGMhMGYXsgTMbAUZ3rHtxarPvPIqaT-RGH9JzzFEbrXs3cp7f2jaHVh3M-fyPcEkg0eao_AuYUePhckBN-PtHZNyy-&source=gbs_api"
},
"language": "en",
"previewLink": "http://books.google.com/books?id=hV--zQEACAAJ&hl=&source=gbs_api",
"infoLink": "https://play.google.com/store/books/details?id=hV--zQEACAAJ&source=gbs_api",
"canonicalVolumeLink": "https://play.google.com/store/books/details?id=hV--zQEACAAJ"
},
"saleInfo": {
"country": "US",
"saleability": "NOT_FOR_SALE",
"isEbook": false
},
"accessInfo": {
"country": "US",
"viewability": "NO_PAGES",
"embeddable": false,
"publicDomain": false,
"textToSpeechPermission": "ALLOWED",
"epub": {
"isAvailable": false
},
"pdf": {
"isAvailable": false
},
"webReaderLink": "http://play.google.com/books/reader?id=hV--zQEACAAJ&hl=&source=gbs_api",
"accessViewStatus": "NONE",
"quoteSharingAllowed": false
}
}

View file

@ -0,0 +1 @@
[{"id": 72, "age_ratings": [11721, 32022, 47683, 47684, 47685, 47686, 47687, 91785], "aggregated_rating": 92.4444444444444, "aggregated_rating_count": 13, "alternative_names": [50135], "artworks": [36972], "bundles": [55025, 191406], "category": 0, "collection": 87, "cover": {"id": 82660, "url": "//images.igdb.com/igdb/image/upload/t_thumb/co1rs4.jpg"}, "created_at": 1297956069, "dlcs": [99969, 114140], "external_games": [15150, 73156, 81867, 92870, 92979, 137388, 189642, 214010, 245334, 403070, 1303428, 1929756, 1931953, 2082680, 2161690, 2590310, 2600814], "first_release_date": 1303171200, "follows": 971, "franchises": [1724], "game_engines": [3], "game_modes": [1, 2, 3, 4], "genres": [{"id": 5, "name": "Shooter"}, {"id": 8, "name": "Platform"}, {"id": 9, "name": "Puzzle"}, {"id": 31, "name": "Adventure"}], "involved_companies": [{"id": 106733, "company": {"id": 56, "name": "Valve Corporation"}, "created_at": 1598486400, "developer": true, "game": 72, "porting": false, "publisher": true, "supporting": false, "updated_at": 1598486400, "checksum": "fa403088-a40a-1d83-16be-a68849472a6d"}, {"id": 106734, "company": {"id": 1, "name": "Electronic Arts"}, "created_at": 1598486400, "developer": false, "game": 72, "porting": false, "publisher": true, "supporting": false, "updated_at": 1598486400, "checksum": "53e59e19-f746-1195-c4e7-2b388e621317"}], "keywords": [350, 453, 575, 592, 1026, 1158, 1181, 1293, 1440, 1559, 1761, 2071, 2800, 3984, 4004, 4134, 4145, 4162, 4266, 4345, 4363, 4428, 4575, 4578, 4617, 4644, 4725, 4888, 4944, 4956, 4974, 5185, 5261, 5633, 5772, 5935, 5938, 5956, 6137, 6326, 6735, 6854, 7079, 7172, 7313, 7535, 7570, 7579, 8141, 8262, 8896, 9814, 10435, 11023, 11208, 12516, 14224, 18139, 18567, 27032], "multiplayer_modes": [11591, 11592, 11593, 11594, 11595], "name": "Portal 2", "platforms": [{"id": 3, "name": "Linux"}, {"id": 6, "name": "PC (Microsoft Windows)"}, {"id": 9, "name": "PlayStation 3"}, {"id": 12, "name": "Xbox 360"}, {"id": 14, "name": "Mac"}], "player_perspectives": [1], "rating": 91.6894220983232, "rating_count": 2765, "release_dates": [104964, 104965, 208203, 208204, 208205, 208206, 208207, 208208], "screenshots": [725, 726, 727, 728, 729], "similar_games": [71, 1877, 7350, 11646, 16992, 22387, 28070, 55038, 55190, 56033], "slug": "portal-2", "storyline": "You lost your memory, you are alone in a world full of danger, and your mission is survive using your mind. The only way to get out from this hell is.....Hi i'm GLAdOS, and welcome to the amazing world of portal 2, here i will expose you to a lot of tests, and try to k.. help Aperture Science envolve in a new era.\nYour job is advance in the levels i propose and get better and better, you will have an portal gun to help you, and remember nothing is impossible if you try, and try again and again and again....\nThe puzzles are waiting for you!", "summary": "Sequel to the acclaimed Portal (2007), Portal 2 pits the protagonist of the original game, Chell, and her new robot friend, Wheatley, against more puzzles conceived by GLaDOS, an A.I. with the sole purpose of testing the Portal Gun's mechanics and taking revenge on Chell for the events of Portal. 
As a result of several interactions and revelations, Chell once again pushes to escape Aperture Science Labs.", "tags": [1, 18, 27, 268435461, 268435464, 268435465, 268435487, 536871262, 536871365, 536871487, 536871504, 536871938, 536872070, 536872093, 536872205, 536872352, 536872471, 536872673, 536872983, 536873712, 536874896, 536874916, 536875046, 536875057, 536875074, 536875178, 536875257, 536875275, 536875340, 536875487, 536875490, 536875529, 536875556, 536875637, 536875800, 536875856, 536875868, 536875886, 536876097, 536876173, 536876545, 536876684, 536876847, 536876850, 536876868, 536877049, 536877238, 536877647, 536877766, 536877991, 536878084, 536878225, 536878447, 536878482, 536878491, 536879053, 536879174, 536879808, 536880726, 536881347, 536881935, 536882120, 536883428, 536885136, 536889051, 536889479, 536897944], "themes": [1, 18, 27], "total_rating": 92.0669332713838, "total_rating_count": 2778, "updated_at": 1670514780, "url": "https://www.igdb.com/games/portal-2", "videos": [432, 16451, 17844, 17845], "websites": [17869, 17870, 41194, 41195, 150881, 150882, 150883, 296808], "checksum": "bcca1b61-2b30-13b8-a0ec-faf45d2ffdad", "game_localizations": [726]}]

View file

@ -0,0 +1 @@
[{"id": 17870, "category": 13, "game": {"id": 72, "age_ratings": [11721, 32022, 47683, 47684, 47685, 47686, 47687, 91785], "aggregated_rating": 92.4444444444444, "aggregated_rating_count": 13, "alternative_names": [50135], "artworks": [36972], "bundles": [55025, 191406], "category": 0, "collection": 87, "cover": 82660, "created_at": 1297956069, "dlcs": [99969, 114140], "external_games": [15150, 73156, 81867, 92870, 92979, 137388, 189642, 214010, 245334, 403070, 1303428, 1929756, 1931953, 2082680, 2161690, 2590310, 2600814], "first_release_date": 1303171200, "follows": 971, "franchises": [1724], "game_engines": [3], "game_modes": [1, 2, 3, 4], "genres": [5, 8, 9, 31], "involved_companies": [106733, 106734], "keywords": [350, 453, 575, 592, 1026, 1158, 1181, 1293, 1440, 1559, 1761, 2071, 2800, 3984, 4004, 4134, 4145, 4162, 4266, 4345, 4363, 4428, 4575, 4578, 4617, 4644, 4725, 4888, 4944, 4956, 4974, 5185, 5261, 5633, 5772, 5935, 5938, 5956, 6137, 6326, 6735, 6854, 7079, 7172, 7313, 7535, 7570, 7579, 8141, 8262, 8896, 9814, 10435, 11023, 11208, 12516, 14224, 18139, 18567, 27032], "multiplayer_modes": [11591, 11592, 11593, 11594, 11595], "name": "Portal 2", "platforms": [3, 6, 9, 12, 14], "player_perspectives": [1], "rating": 91.6894220983232, "rating_count": 2765, "release_dates": [104964, 104965, 208203, 208204, 208205, 208206, 208207, 208208], "screenshots": [725, 726, 727, 728, 729], "similar_games": [71, 1877, 7350, 11646, 16992, 22387, 28070, 55038, 55190, 56033], "slug": "portal-2", "storyline": "You lost your memory, you are alone in a world full of danger, and your mission is survive using your mind. The only way to get out from this hell is.....Hi i'm GLAdOS, and welcome to the amazing world of portal 2, here i will expose you to a lot of tests, and try to k.. help Aperture Science envolve in a new era.\nYour job is advance in the levels i propose and get better and better, you will have an portal gun to help you, and remember nothing is impossible if you try, and try again and again and again....\nThe puzzles are waiting for you!", "summary": "Sequel to the acclaimed Portal (2007), Portal 2 pits the protagonist of the original game, Chell, and her new robot friend, Wheatley, against more puzzles conceived by GLaDOS, an A.I. with the sole purpose of testing the Portal Gun's mechanics and taking revenge on Chell for the events of Portal. 
As a result of several interactions and revelations, Chell once again pushes to escape Aperture Science Labs.", "tags": [1, 18, 27, 268435461, 268435464, 268435465, 268435487, 536871262, 536871365, 536871487, 536871504, 536871938, 536872070, 536872093, 536872205, 536872352, 536872471, 536872673, 536872983, 536873712, 536874896, 536874916, 536875046, 536875057, 536875074, 536875178, 536875257, 536875275, 536875340, 536875487, 536875490, 536875529, 536875556, 536875637, 536875800, 536875856, 536875868, 536875886, 536876097, 536876173, 536876545, 536876684, 536876847, 536876850, 536876868, 536877049, 536877238, 536877647, 536877766, 536877991, 536878084, 536878225, 536878447, 536878482, 536878491, 536879053, 536879174, 536879808, 536880726, 536881347, 536881935, 536882120, 536883428, 536885136, 536889051, 536889479, 536897944], "themes": [1, 18, 27], "total_rating": 92.0669332713838, "total_rating_count": 2778, "updated_at": 1670514780, "url": "https://www.igdb.com/games/portal-2", "videos": [432, 16451, 17844, 17845], "websites": [17869, 17870, 41194, 41195, 150881, 150882, 150883, 296808], "checksum": "bcca1b61-2b30-13b8-a0ec-faf45d2ffdad", "game_localizations": [726]}, "trusted": true, "url": "https://store.steampowered.com/app/620", "checksum": "5281f967-6dfe-7658-96c6-af00ce010bbc"}]

View file

@ -0,0 +1 @@
[{"id": 17869, "category": 1, "game": 72, "trusted": false, "url": "http://www.thinkwithportals.com/", "checksum": "c40d590f-93bd-b86e-243c-73746c08be3b"}, {"id": 17870, "category": 13, "game": 72, "trusted": true, "url": "https://store.steampowered.com/app/620", "checksum": "5281f967-6dfe-7658-96c6-af00ce010bbc"}, {"id": 41194, "category": 3, "game": 72, "trusted": true, "url": "https://en.wikipedia.org/wiki/Portal_2", "checksum": "7354f471-16d6-5ed9-b4e4-049cceaab562"}, {"id": 41195, "category": 4, "game": 72, "trusted": true, "url": "https://www.facebook.com/Portal", "checksum": "035f6b48-3be1-77d5-1567-cf6fd8116ee7"}, {"id": 150881, "category": 9, "game": 72, "trusted": true, "url": "https://www.youtube.com/user/Valve", "checksum": "c1d4afb9-e96d-02f1-73bd-3384622e6aee"}, {"id": 150882, "category": 5, "game": 72, "trusted": true, "url": "https://twitter.com/valvesoftware", "checksum": "62bb9586-3293-bb01-f675-d65323ae371c"}, {"id": 150883, "category": 2, "game": 72, "trusted": false, "url": "https://theportalwiki.com/wiki/Portal_2", "checksum": "af689276-28c8-b145-7b19-f1d7df878c2a"}, {"id": 296808, "category": 6, "game": 72, "trusted": true, "url": "https://www.twitch.tv/directory/game/Portal%202", "checksum": "65629340-6190-833d-41b1-8eaf31918df3"}]

View file

@ -0,0 +1 @@
[{"id": 12644, "category": 1, "game": 7346, "trusted": false, "url": "http://www.zelda.com/breath-of-the-wild/", "checksum": "3d2ca280-a2d0-5664-c8a5-69eeeaf13558"}, {"id": 12645, "category": 2, "game": 7346, "trusted": false, "url": "http://zelda.wikia.com/wiki/The_Legend_of_Zelda:_Breath_of_the_Wild", "checksum": "d5cb4657-dc8e-9de1-9643-b1ef64812d9f"}, {"id": 12646, "category": 3, "game": 7346, "trusted": true, "url": "https://en.wikipedia.org/wiki/The_Legend_of_Zelda:_Breath_of_the_Wild", "checksum": "c4570c3c-3a04-8d24-399a-0c04a17e7c56"}, {"id": 65034, "category": 14, "game": 7346, "trusted": true, "url": "https://www.reddit.com/r/Breath_of_the_Wild", "checksum": "f60505b3-18a4-3d60-9db2-febe4c6cb492"}, {"id": 169666, "category": 6, "game": 7346, "trusted": true, "url": "https://www.twitch.tv/nintendo", "checksum": "e2b20791-a9c4-76ad-4d76-3e7abc9148bb"}, {"id": 169667, "category": 9, "game": 7346, "trusted": true, "url": "https://www.youtube.com/nintendo", "checksum": "1e1c08ba-8f89-b567-0029-1d8aac22d147"}, {"id": 169668, "category": 4, "game": 7346, "trusted": true, "url": "https://www.facebook.com/Nintendo", "checksum": "046d8c8e-8f1d-8813-1266-c2911f490ba7"}, {"id": 169669, "category": 5, "game": 7346, "trusted": true, "url": "https://twitter.com/NintendoAmerica", "checksum": "e06dd12f-b6c5-ef72-f287-a9cebba12fa1"}, {"id": 169670, "category": 8, "game": 7346, "trusted": true, "url": "https://www.instagram.com/nintendo", "checksum": "dbff9e02-e9c2-f395-7e48-7e70cf58225c"}]

View file

@ -43,7 +43,7 @@
<body>
<div id="page-wrapper">
<div id="content-wrapper">
{% include "partial/_navbar.html" %}
{% include "partial/_navbar.html" with current="timeline" %}
<section id="content" class="container">
<div class="grid grid--reverse-order">

View file

@ -20,7 +20,7 @@
<body>
<div id="page-wrapper">
<div id="content-wrapper">
{% include "partial/_navbar.html" %}
{% include "partial/_navbar.html" with current="data" %}
<section id="content">
<div class="grid grid--reverse-order">

Some files were not shown because too many files have changed in this diff