new data model: convert legacy catalog data

This commit is contained in:
Your Name 2022-12-17 02:04:12 -05:00
parent f58b075e48
commit 14861cf609
18 changed files with 307 additions and 44 deletions

View file

@ -43,5 +43,5 @@ jobs:
- name: Run Tests - name: Run Tests
run: | run: |
PGPASSWORD=admin123 psql template1 -U postgres -h localhost -c 'create extension hstore;' PGPASSWORD=admin123 psql template1 -U postgres -h localhost -c 'create extension hstore;'
new_data_model=1 python manage.py makemigrations auth mastodon users books movies games music sync management collection common sync management timeline catalog journal social new_data_model=1 python manage.py makemigrations auth mastodon users books movies games music sync management collection common sync management timeline catalog journal social legacy
new_data_model=1 python manage.py test new_data_model=1 python manage.py test

View file

@ -368,3 +368,5 @@ if ENABLE_NEW_MODEL:
INSTALLED_APPS.append('catalog.apps.CatalogConfig') INSTALLED_APPS.append('catalog.apps.CatalogConfig')
INSTALLED_APPS.append('journal.apps.JournalConfig') INSTALLED_APPS.append('journal.apps.JournalConfig')
INSTALLED_APPS.append('social.apps.SocialConfig') INSTALLED_APPS.append('social.apps.SocialConfig')
INSTALLED_APPS.append('legacy.apps.LegacyConfig')

View file

@ -50,7 +50,9 @@ def is_asin(asin):
def detect_isbn_asin(s): def detect_isbn_asin(s):
n = s.strip().upper() if s else '' if not s:
return None, None
n = re.sub(r'[^0-9A-Z]', '', s.upper())
if is_isbn_13(n): if is_isbn_13(n):
return IdType.ISBN, n return IdType.ISBN, n
if is_isbn_10(n): if is_isbn_10(n):

View file

@ -318,6 +318,8 @@ class ExternalResource(models.Model):
self.metadata = resource_content.metadata self.metadata = resource_content.metadata
if resource_content.cover_image and resource_content.cover_image_extention: if resource_content.cover_image and resource_content.cover_image_extention:
self.cover = SimpleUploadedFile('temp.' + resource_content.cover_image_extention, resource_content.cover_image) self.cover = SimpleUploadedFile('temp.' + resource_content.cover_image_extention, resource_content.cover_image)
else:
self.cover = resource_content.metadata.get('cover_image_path')
self.scraped_time = timezone.now() self.scraped_time = timezone.now()
self.save() self.save()

View file

@ -23,6 +23,12 @@ class ResourceContent:
cover_image: bytes = None cover_image: bytes = None
cover_image_extention: str = None cover_image_extention: str = None
def dict(self):
    """Return the scraped payload as a plain dictionary.

    Only ``metadata`` and ``lookup_ids`` are included; the cover image
    fields are left out.
    """
    payload = {}
    payload['metadata'] = self.metadata
    payload['lookup_ids'] = self.lookup_ids
    return payload
def to_json(self) -> str:
    """Serialize ``metadata`` and ``lookup_ids`` into a JSON string."""
    payload = {'metadata': self.metadata, 'lookup_ids': self.lookup_ids}
    return json.dumps(payload)
class AbstractSite: class AbstractSite:
""" """
@ -67,10 +73,6 @@ class AbstractSite:
self.resource = ExternalResource(id_type=self.ID_TYPE, id_value=self.id_value, url=self.url) self.resource = ExternalResource(id_type=self.ID_TYPE, id_value=self.id_value, url=self.url)
return self.resource return self.resource
def bypass_scrape(self, data_from_link) -> ResourceContent:
"""subclass may implement this to use data from linked resource and bypass actual scrape"""
return None
def scrape(self) -> ResourceContent: def scrape(self) -> ResourceContent:
"""subclass should implement this, return ResourceContent object""" """subclass should implement this, return ResourceContent object"""
data = ResourceContent() data = ResourceContent()
@ -101,7 +103,7 @@ class AbstractSite:
def ready(self): def ready(self):
return bool(self.resource and self.resource.ready) return bool(self.resource and self.resource.ready)
def get_resource_ready(self, auto_save=True, auto_create=True, auto_link=True, data_from_link=None): def get_resource_ready(self, auto_save=True, auto_create=True, auto_link=True, preloaded_content=None, reload=False):
"""return a resource scraped, or scrape if not yet""" """return a resource scraped, or scrape if not yet"""
if auto_link: if auto_link:
auto_create = True auto_create = True
@ -111,9 +113,12 @@ class AbstractSite:
resource_content = {} resource_content = {}
if not self.resource: if not self.resource:
return None return None
if not p.ready: if not p.ready or reload:
resource_content = self.bypass_scrape(data_from_link) if isinstance(preloaded_content, ResourceContent):
if not resource_content: resource_content = preloaded_content
elif isinstance(preloaded_content, dict):
resource_content = ResourceContent(**preloaded_content)
else:
resource_content = self.scrape() resource_content = self.scrape()
p.update_content(resource_content) p.update_content(resource_content)
if not p.ready: if not p.ready:
@ -127,12 +132,12 @@ class AbstractSite:
p.item.merge_data_from_external_resources() p.item.merge_data_from_external_resources()
p.item.save() p.item.save()
if auto_link: if auto_link:
for linked_resources in p.required_resources: for linked_resource in p.required_resources:
linked_site = SiteManager.get_site_by_url(linked_resources['url']) linked_site = SiteManager.get_site_by_url(linked_resource['url'])
if linked_site: if linked_site:
linked_site.get_resource_ready(auto_link=False) linked_site.get_resource_ready(auto_link=False, preloaded_content=linked_resource.get('content'))
else: else:
_logger.error(f'unable to get site for {linked_resources["url"]}') _logger.error(f'unable to get site for {linked_resource["url"]}')
p.item.update_linked_items_from_external_resource(p) p.item.update_linked_items_from_external_resource(p)
p.item.save() p.item.save()
return p return p
@ -141,28 +146,28 @@ class AbstractSite:
class SiteManager: class SiteManager:
registry = {} registry = {}
@classmethod @staticmethod
def register(cls, target) -> Callable: def register(target) -> Callable:
id_type = target.ID_TYPE id_type = target.ID_TYPE
if id_type in cls.registry: if id_type in SiteManager.registry:
raise ValueError(f'Site for {id_type} already exists') raise ValueError(f'Site for {id_type} already exists')
cls.registry[id_type] = target SiteManager.registry[id_type] = target
return target return target
@classmethod @staticmethod
def get_site_by_id_type(cls, typ: str): def get_site_by_id_type(typ: str):
return cls.registry[typ]() if typ in cls.registry else None return SiteManager.registry[typ]() if typ in SiteManager.registry else None
@classmethod @staticmethod
def get_site_by_url(cls, url: str): def get_site_by_url(url: str):
cls = next(filter(lambda p: p.validate_url(url), cls.registry.values()), None) cls = next(filter(lambda p: p.validate_url(url), SiteManager.registry.values()), None)
if cls is None: if cls is None:
cls = next(filter(lambda p: p.validate_url_fallback(url), cls.registry.values()), None) cls = next(filter(lambda p: p.validate_url_fallback(url), SiteManager.registry.values()), None)
return cls(url) if cls else None return cls(url) if cls else None
@classmethod @staticmethod
def get_id_by_url(cls, url: str): def get_id_by_url(url: str):
site = cls.get_site_by_url(url) site = SiteManager.get_site_by_url(url)
return site.url_to_id(url) if site else None return site.url_to_id(url) if site else None
@staticmethod @staticmethod

View file

@ -13,13 +13,14 @@ class Game(Item):
METADATA_COPY_LIST = [ METADATA_COPY_LIST = [
'title', 'title',
'brief',
'other_title', 'other_title',
'developer', 'developer',
'publisher', 'publisher',
'release_date', 'release_date',
'genre', 'genre',
'platform', 'platform',
'brief', 'official_site',
] ]
other_title = jsondata.ArrayField( other_title = jsondata.ArrayField(
@ -63,3 +64,7 @@ class Game(Item):
blank=True, blank=True,
default=list, default=list,
) )
official_site = jsondata.CharField(
default='',
)

View file

@ -9,7 +9,6 @@ class Movie(Item):
imdb = PrimaryLookupIdDescriptor(IdType.IMDB) imdb = PrimaryLookupIdDescriptor(IdType.IMDB)
tmdb_movie = PrimaryLookupIdDescriptor(IdType.TMDB_Movie) tmdb_movie = PrimaryLookupIdDescriptor(IdType.TMDB_Movie)
douban_movie = PrimaryLookupIdDescriptor(IdType.DoubanMovie) douban_movie = PrimaryLookupIdDescriptor(IdType.DoubanMovie)
duration = jsondata.IntegerField(blank=True, default=None)
demonstrative = _('这部电影') demonstrative = _('这部电影')
METADATA_COPY_LIST = [ METADATA_COPY_LIST = [

View file

@ -173,6 +173,7 @@ class DoubanBook(AbstractSite):
'id_value': r[1] if r else None, 'id_value': r[1] if r else None,
'title': data['title'], 'title': data['title'],
'url': works_element[0], 'url': works_element[0],
'content': {'metadata': {'title': data['title']}}
}] }]
pd = ResourceContent(metadata=data) pd = ResourceContent(metadata=data)
@ -195,14 +196,6 @@ class DoubanBook_Work(AbstractSite):
def id_to_url(self, id_value): def id_to_url(self, id_value):
return "https://book.douban.com/works/" + id_value + "/" return "https://book.douban.com/works/" + id_value + "/"
def bypass_scrape(self, data_from_link):
if not data_from_link:
return None
pd = ResourceContent(metadata={
'title': data_from_link['title'],
})
return pd
def scrape(self): def scrape(self):
content = DoubanDownloader(self.url).download().html() content = DoubanDownloader(self.url).download().html()
title_elem = content.xpath("//h1/text()") title_elem = content.xpath("//h1/text()")

View file

@ -73,13 +73,10 @@
{% endif %} {% endif %}
</div> </div>
{% if item.other_info %} <div>{% if item.official_site %}
{% for k, v in item.other_info.items %} {% trans '官方网站:' %}{{ item.official_site|urlizetrunc:42 }}
<div> {% endif %}
{{ k }}{{ v | urlize }}
</div> </div>
{% endfor %}
{% endif %}
</div> </div>
<div class="entity-detail__fields"> <div class="entity-detail__fields">

View file

@ -15,6 +15,7 @@ def _get_all_url_paths():
urlpatterns = [ urlpatterns = [
re_path(r'item/(?P<item_uuid>[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12})/', retrieve_by_uuid, name='retrieve_by_uuid'),
re_path(r'(?P<item_path>' + _get_all_url_paths() + ')/(?P<item_uid>[A-Za-z0-9]{21,22})/', retrieve, name='retrieve'), re_path(r'(?P<item_path>' + _get_all_url_paths() + ')/(?P<item_uid>[A-Za-z0-9]{21,22})/', retrieve, name='retrieve'),
path("api/", api.urls), path("api/", api.urls),
] ]

View file

@ -27,6 +27,11 @@ from journal.models import Mark
_logger = logging.getLogger(__name__) _logger = logging.getLogger(__name__)
def retrieve_by_uuid(request, item_uuid):
    """Redirect a UUID-addressed request to the matching item's own URL.

    Responds with 404 when no ``Item`` has ``uid`` equal to ``item_uuid``.
    """
    target = get_object_or_404(Item, uid=item_uuid)
    destination = target.url
    return redirect(destination)
def retrieve(request, item_path, item_uid): def retrieve(request, item_path, item_uid):
if request.method == 'GET': if request.method == 'GET':
item = get_object_or_404(Item, uid=base62.decode(item_uid)) item = get_object_or_404(Item, uid=base62.decode(item_uid))

0
legacy/__init__.py Normal file
View file

3
legacy/admin.py Normal file
View file

@ -0,0 +1,3 @@
from django.contrib import admin
# Register your models here.

6
legacy/apps.py Normal file
View file

@ -0,0 +1,6 @@
from django.apps import AppConfig
class LegacyConfig(AppConfig):
    """Django application configuration for the ``legacy`` app."""
    # Field class used for implicitly created primary keys of this app's models.
    default_auto_field = 'django.db.models.BigAutoField'
    name = 'legacy'

View file

@ -0,0 +1,216 @@
from books.models import Book as Legacy_Book
from movies.models import Movie as Legacy_Movie
from music.models import Album as Legacy_Album
from games.models import Game as Legacy_Game
from catalog.common import *
from catalog.models import *
from catalog.sites import *
from catalog.book.utils import detect_isbn_asin
from journal import models as journal_models
from social import models as social_models
from django.core.management.base import BaseCommand
from django.core.paginator import Paginator
import pprint
from tqdm import tqdm
from django.db.models import Q, Count, Sum
from django.utils import dateparse, timezone
import re
from legacy.models import *
# Number of legacy rows per Paginator page; link rows are bulk-created once per page.
BATCH_SIZE = 1000
def _book_convert(entity):
    """Build a ``ResourceContent`` from a legacy book row.

    Plain columns are copied into the metadata; ``series`` and ``imprint``
    come from ``other_info`` when it is present.  Lookup ids are filled from
    the ISBN/ASIN detected in ``entity.isbn`` and from the ``other_info``
    entry stored as ``IdType.CUBN``.
    """
    # A missing/empty other_info behaves like an empty mapping: every
    # optional value read from it becomes None.
    extra = entity.other_info or {}
    metadata = {
        'title': entity.title,
        'brief': entity.brief,
        'cover_image_path': str(entity.cover),
        'subtitle': entity.subtitle,
        'orig_title': entity.orig_title,
        'author': entity.author,
        'translator': entity.translator,
        'language': entity.language,
        'pub_house': entity.pub_house,
        'pub_year': entity.pub_year,
        'pub_month': entity.pub_month,
        'binding': entity.binding,
        'price': entity.price,
        'pages': entity.pages,
        'contents': entity.contents,
        'series': extra.get('丛书'),
        'imprint': extra.get('出品方'),
    }
    content = ResourceContent(metadata=metadata)

    if entity.isbn:
        id_type, id_value = detect_isbn_asin(entity.isbn)
        # Values that are neither ISBN nor ASIN yield no id type and are dropped.
        if id_type:
            content.lookup_ids[id_type] = id_value

    cubn = extra.get('统一书号')
    if cubn:
        content.lookup_ids[IdType.CUBN] = cubn
    return content
def _album_convert(entity):
    """Build a ``ResourceContent`` from a legacy album row.

    Plain columns are copied into the metadata and several optional values
    are read from ``other_info``.  ``ISRC`` and the barcode/UPC entries of
    ``other_info`` become lookup ids.
    """
    # A missing/empty other_info behaves like an empty mapping.
    extra = entity.other_info or {}
    released = entity.release_date
    metadata = {
        'title': entity.title,
        'brief': entity.brief,
        'cover_image_path': str(entity.cover),
        'other_title': extra.get('又名'),
        'album_type': extra.get('专辑类型'),
        'media': extra.get('介质'),
        'disc_count': extra.get('碟片数'),
        'artist': entity.artist,
        'genre': entity.genre,
        'release_date': released.strftime('%Y-%m-%d') if released else None,
        'duration': entity.duration,
        'company': entity.company,
        'track_list': entity.track_list,
        'bandcamp_album_id': extra.get('bandcamp_album_id'),
    }
    content = ResourceContent(metadata=metadata)

    isrc = extra.get('ISRC')
    if isrc:
        content.lookup_ids[IdType.ISRC] = isrc

    # Both keys feed the same GTIN slot; 'UPC' is checked last, so it wins
    # when both are present.
    for key in ('条形码', 'UPC'):
        code = extra.get(key)
        if code:
            content.lookup_ids[IdType.GTIN] = code
    return content
def _game_convert(entity):
    """Build a ``ResourceContent`` from a legacy game row.

    Plain columns are copied into the metadata; ``official_site`` is read
    from ``other_info`` when present.  If ``other_info`` carries a
    ``steam_url`` that contains a Steam store app id, that id is recorded as
    the ``IdType.Steam`` lookup id.
    """
    # A missing/empty other_info behaves like an empty mapping.
    extra = entity.other_info or {}
    released = entity.release_date
    content = ResourceContent(metadata={
        'title': entity.title,
        'brief': entity.brief,
        'cover_image_path': str(entity.cover),
        'other_title': entity.other_title,
        'developer': entity.developer,
        'publisher': entity.publisher,
        'release_date': released.strftime('%Y-%m-%d') if released else None,
        'genre': entity.genre,
        'platform': entity.platform,
        'official_site': extra.get('official_site'),
    })

    steam_url = extra.get('steam_url')
    if steam_url:
        match = re.search(r'store\.steampowered\.com/app/(\d+)', steam_url)
        # re.search returns None for a URL that is not a store app page;
        # subscripting that would raise TypeError and fail the whole row, so
        # such a URL simply contributes no Steam lookup id.
        if match:
            content.lookup_ids[IdType.Steam] = match[1]
    return content
def _movie_tv_convert(entity):
    """Build a ``ResourceContent`` from a legacy movie / TV row.

    Plain columns are copied into the metadata; ``season_count`` is read
    from the ``Seasons`` entry of ``other_info`` when present.  The IMDB
    code and the ``TMDB_ID`` entry of ``other_info`` become lookup ids.
    """
    # A missing/empty other_info behaves like an empty mapping.
    extra = entity.other_info or {}
    metadata = {
        'title': entity.title,
        'brief': entity.brief,
        'cover_image_path': str(entity.cover),
        'orig_title': entity.orig_title,
        'other_title': entity.other_title,
        'director': entity.director,
        'playwright': entity.playwright,
        'actor': entity.actor,
        'genre': entity.genre,
        'showtime': entity.showtime,
        'site': entity.site,
        'area': entity.area,
        'language': entity.language,
        'year': entity.year,
        'duration': entity.duration,
        'season_count': extra.get('Seasons'),
        'season_number': entity.season,
        'episodes': entity.episodes,
        'single_episode_length': entity.single_episode_length,
        'is_series': entity.is_series,
    }
    content = ResourceContent(metadata=metadata)

    if entity.imdb_code:
        content.lookup_ids[IdType.IMDB] = entity.imdb_code

    tmdb_id = extra.get('TMDB_ID')
    if tmdb_id:
        # NOTE(review): stored as a TV id regardless of is_series -- confirm
        # whether legacy TMDB_ID values can also belong to movies.
        content.lookup_ids[IdType.TMDB_TV] = tmdb_id
    return content
# Attach the converters to the legacy models so that each row can be turned
# into a ResourceContent with ``entity.convert()``.
Legacy_Book.convert = _book_convert
Legacy_Movie.convert = _movie_tv_convert
Legacy_Game.convert = _game_convert
Legacy_Album.convert = _album_convert
# Legacy model -> new catalog model.
model_map = {
    Legacy_Book: Edition,
    Legacy_Movie: Movie,
    Legacy_Game: Game,
    Legacy_Album: Album,
}
# Legacy model -> link table recording old id / new uid pairs.
model_link = {
    Legacy_Book: BookLink,
    Legacy_Movie: MovieLink,
    Legacy_Game: GameLink,
    Legacy_Album: AlbumLink,
}
class Command(BaseCommand):
    """Convert legacy catalog rows into items of the new data model.

    For every selected legacy type the rows are walked in ``id`` order,
    ``BATCH_SIZE`` rows per page.  Each row is converted with the ``convert``
    method attached to its legacy model, stored as a new item, and an
    ``old_id`` / ``new_uid`` row is written to the matching link table.
    """

    help = 'Migrate legacy catalog data (books, movies, albums, games)'

    def add_arguments(self, parser):
        """Register the type selectors, the id filters and the behaviour flags."""
        # Each type flag appends its legacy model to options['types']; when no
        # flag is given the option stays None and handle() converts all types.
        parser.add_argument('--book', dest='types', action='append_const', const=Legacy_Book)
        parser.add_argument('--movie', dest='types', action='append_const', const=Legacy_Movie)
        parser.add_argument('--album', dest='types', action='append_const', const=Legacy_Album)
        parser.add_argument('--game', dest='types', action='append_const', const=Legacy_Game)
        parser.add_argument('--id', help='id to convert; or, if using with --maxid, the min id')
        parser.add_argument('--maxid', help='max id to convert')
        parser.add_argument('--failstop', help='stop on fail', action='store_true')
        parser.add_argument('--clearlink', help='clear legacy link table', action='store_true')
        parser.add_argument('--reload', help='reload and ignore existing ExternalResource', action='store_true')

    def handle(self, *args, **options):
        """Run the conversion for the selected types and id range.

        A row that fails to convert is reported and skipped, unless
        ``--failstop`` was given, in which case the exception propagates.
        """
        types = options['types'] or [Legacy_Game, Legacy_Album, Legacy_Movie, Legacy_Book]
        reload = options['reload']
        for typ in types:
            print(typ)
            LinkModel = model_link[typ]
            if options['clearlink']:
                LinkModel.objects.all().delete()
            qs = typ.objects.all().order_by('id')
            if options['id']:
                if options['maxid']:
                    qs = qs.filter(id__gte=int(options['id']), id__lte=int(options['maxid']))
                else:
                    qs = qs.filter(id=int(options['id']))
            pg = Paginator(qs, BATCH_SIZE)
            for p in tqdm(pg.page_range):
                links = []
                for entity in pg.get_page(p).object_list:
                    try:
                        content = entity.convert()
                        site = SiteManager.get_site_by_url(entity.source_url)
                        if site:
                            item = self._item_from_site(typ, site, content, reload)
                        else:
                            item = self._item_without_site(typ, content)
                        links.append(LinkModel(old_id=entity.id, new_uid=item.uid))
                    except Exception as e:
                        # Batch boundary: one bad row must not abort the whole
                        # migration unless the operator asked for it.
                        print(f'Convert failed for {entity}: {e}')
                        if options['failstop']:
                            raise
                # One insert per page keeps the link table writes cheap.
                LinkModel.objects.bulk_create(links)
        self.stdout.write(self.style.SUCCESS('Done.'))

    def _item_from_site(self, typ, site, content, reload):
        """Store ``content`` through a known site and return the resulting item."""
        metadata = content.metadata
        if not site.DEFAULT_MODEL and not metadata.get('preferred_model'):
            if model_map[typ] != Movie or not metadata.get('is_series'):
                metadata['preferred_model'] = model_map[typ].__name__
            else:  # TV
                # The movie/TV converter stores the season under
                # 'season_number'; 'season' is still honoured in case other
                # content carries it.
                has_season = metadata.get('season_number') or metadata.get('season')
                metadata['preferred_model'] = 'TVSeason' if has_season else 'TVShow'
        resource = site.get_resource_ready(preloaded_content=content, reload=reload)
        item = resource.item if resource else None
        if item is None:
            # Fail with a readable message instead of an AttributeError on None.
            raise ValueError(f'no item could be prepared for {site.url}')
        return item

    def _item_without_site(self, typ, content):
        """Store ``content`` for an unknown source URL, without an external resource.

        An existing item with the same best lookup id is reused; otherwise a
        new one is created from the metadata.
        """
        # Use the model that matches the legacy type rather than always
        # creating book editions.
        # NOTE(review): legacy series rows end up as Movie here -- confirm
        # whether they should map to a TV model instead.
        model = model_map[typ]
        t, v = None, None
        item = None
        if content.lookup_ids:
            t, v = Item.get_best_lookup_id(content.lookup_ids)
            item = model.objects.filter(primary_lookup_id_type=t, primary_lookup_id_value=v).first()
        if not item:
            obj = model.copy_metadata(content.metadata)
            obj['primary_lookup_id_type'] = t
            obj['primary_lookup_id_value'] = v
            item = model.objects.create(**obj)
        item.cover = content.metadata['cover_image_path']
        item.save()
        return item

21
legacy/models.py Normal file
View file

@ -0,0 +1,21 @@
from django.db import models
class BookLink(models.Model):
    """Link between a legacy book row and the item created for it."""
    # ``id`` of the legacy row (indexed).
    old_id = models.IntegerField(db_index=True)
    # ``uid`` of the corresponding new item.
    new_uid = models.UUIDField()
class MovieLink(models.Model):
    """Link between a legacy movie row and the item created for it."""
    # ``id`` of the legacy row (indexed).
    old_id = models.IntegerField(db_index=True)
    # ``uid`` of the corresponding new item.
    new_uid = models.UUIDField()
class AlbumLink(models.Model):
    """Link between a legacy album row and the item created for it."""
    # ``id`` of the legacy row (indexed).
    old_id = models.IntegerField(db_index=True)
    # ``uid`` of the corresponding new item.
    new_uid = models.UUIDField()
class GameLink(models.Model):
    """Link between a legacy game row and the item created for it."""
    # ``id`` of the legacy row (indexed).
    old_id = models.IntegerField(db_index=True)
    # ``uid`` of the corresponding new item.
    new_uid = models.UUIDField()

3
legacy/tests.py Normal file
View file

@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

3
legacy/views.py Normal file
View file

@ -0,0 +1,3 @@
from django.shortcuts import render
# Create your views here.