Patch overview (free text ahead of the first "diff --git" header is skipped by patch tools):
  .github/workflows/django.yml - adds "legacy" to the makemigrations app list of the "Run Tests" step.
  boofilsic/settings.py        - appends legacy.apps.LegacyConfig to INSTALLED_APPS inside the ENABLE_NEW_MODEL branch.
  catalog/book/utils.py        - detect_isbn_asin returns (None, None) for empty input and drops every character outside 0-9A-Z before testing.
  catalog/common/models.py     - when no cover bytes/extension are present, cover is taken from metadata['cover_image_path'] (None when the key is absent).
  catalog/common/sites.py      - ResourceContent gains dict()/to_json(); bypass_scrape is removed; get_resource_ready accepts preloaded_content and reload; SiteManager classmethods become staticmethods.
  catalog/game/models.py       - adds the official_site field and lists it in METADATA_COPY_LIST.
  catalog/movie/models.py      - removes the duration field declaration from Movie.
  catalog/sites/douban_book.py - hands the work title to the linked resource through a 'content' entry; DoubanBook_Work.bypass_scrape is removed.
  catalog/templates/game.html  - replaces the other_info loop with a single official_site line.
NOTE(review): the game.html hunk appears to have lost its HTML tags in extraction - confirm against the original patch before applying.
NOTE(review): to_json calls json.dumps; no json import is visible in this patch - confirm catalog/common/sites.py imports json.
diff --git a/.github/workflows/django.yml b/.github/workflows/django.yml index 4454d434..dfe51d98 100644 --- a/.github/workflows/django.yml +++ b/.github/workflows/django.yml @@ -43,5 +43,5 @@ jobs: - name: Run Tests run: | PGPASSWORD=admin123 psql template1 -U postgres -h localhost -c 'create extension hstore;' - new_data_model=1 python manage.py makemigrations auth mastodon users books movies games music sync management collection common sync management timeline catalog journal social + new_data_model=1 python manage.py makemigrations auth mastodon users books movies games music sync management collection common sync management timeline catalog journal social legacy new_data_model=1 python manage.py test diff --git a/boofilsic/settings.py b/boofilsic/settings.py index f75610ec..7362d7c7 100644 --- a/boofilsic/settings.py +++ b/boofilsic/settings.py @@ -368,3 +368,5 @@ if ENABLE_NEW_MODEL: INSTALLED_APPS.append('catalog.apps.CatalogConfig') INSTALLED_APPS.append('journal.apps.JournalConfig') INSTALLED_APPS.append('social.apps.SocialConfig') + INSTALLED_APPS.append('legacy.apps.LegacyConfig') + diff --git a/catalog/book/utils.py b/catalog/book/utils.py index a8775ec5..6598e65b 100644 --- a/catalog/book/utils.py +++ b/catalog/book/utils.py @@ -50,7 +50,9 @@ def is_asin(asin): def detect_isbn_asin(s): - n = s.strip().upper() if s else '' + if not s: + return None, None + n = re.sub(r'[^0-9A-Z]', '', s.upper()) if is_isbn_13(n): return IdType.ISBN, n if is_isbn_10(n): diff --git a/catalog/common/models.py b/catalog/common/models.py index da5e6ba2..7cbe0825 100644 --- a/catalog/common/models.py +++ b/catalog/common/models.py @@ -318,6 +318,8 @@ class ExternalResource(models.Model): self.metadata = resource_content.metadata if resource_content.cover_image and resource_content.cover_image_extention: self.cover = SimpleUploadedFile('temp.' 
+ resource_content.cover_image_extention, resource_content.cover_image) + else: + self.cover = resource_content.metadata.get('cover_image_path') self.scraped_time = timezone.now() self.save() diff --git a/catalog/common/sites.py b/catalog/common/sites.py index 579e9061..1e9b8d25 100644 --- a/catalog/common/sites.py +++ b/catalog/common/sites.py @@ -23,6 +23,12 @@ class ResourceContent: cover_image: bytes = None cover_image_extention: str = None + def dict(self): + return {'metadata': self.metadata, 'lookup_ids': self.lookup_ids} + + def to_json(self) -> str: + return json.dumps({'metadata': self.metadata, 'lookup_ids': self.lookup_ids}) + class AbstractSite: """ @@ -67,10 +73,6 @@ class AbstractSite: self.resource = ExternalResource(id_type=self.ID_TYPE, id_value=self.id_value, url=self.url) return self.resource - def bypass_scrape(self, data_from_link) -> ResourceContent: - """subclass may implement this to use data from linked resource and bypass actual scrape""" - return None - def scrape(self) -> ResourceContent: """subclass should implement this, return ResourceContent object""" data = ResourceContent() @@ -101,7 +103,7 @@ class AbstractSite: def ready(self): return bool(self.resource and self.resource.ready) - def get_resource_ready(self, auto_save=True, auto_create=True, auto_link=True, data_from_link=None): + def get_resource_ready(self, auto_save=True, auto_create=True, auto_link=True, preloaded_content=None, reload=False): """return a resource scraped, or scrape if not yet""" if auto_link: auto_create = True @@ -111,9 +113,12 @@ class AbstractSite: resource_content = {} if not self.resource: return None - if not p.ready: - resource_content = self.bypass_scrape(data_from_link) - if not resource_content: + if not p.ready or reload: + if isinstance(preloaded_content, ResourceContent): + resource_content = preloaded_content + elif isinstance(preloaded_content, dict): + resource_content = ResourceContent(**preloaded_content) + else: resource_content = 
self.scrape() p.update_content(resource_content) if not p.ready: @@ -127,12 +132,12 @@ class AbstractSite: p.item.merge_data_from_external_resources() p.item.save() if auto_link: - for linked_resources in p.required_resources: - linked_site = SiteManager.get_site_by_url(linked_resources['url']) + for linked_resource in p.required_resources: + linked_site = SiteManager.get_site_by_url(linked_resource['url']) if linked_site: - linked_site.get_resource_ready(auto_link=False) + linked_site.get_resource_ready(auto_link=False, preloaded_content=linked_resource.get('content')) else: - _logger.error(f'unable to get site for {linked_resources["url"]}') + _logger.error(f'unable to get site for {linked_resource["url"]}') p.item.update_linked_items_from_external_resource(p) p.item.save() return p @@ -141,28 +146,28 @@ class AbstractSite: class SiteManager: registry = {} - @classmethod - def register(cls, target) -> Callable: + @staticmethod + def register(target) -> Callable: id_type = target.ID_TYPE - if id_type in cls.registry: + if id_type in SiteManager.registry: raise ValueError(f'Site for {id_type} already exists') - cls.registry[id_type] = target + SiteManager.registry[id_type] = target return target - @classmethod - def get_site_by_id_type(cls, typ: str): - return cls.registry[typ]() if typ in cls.registry else None + @staticmethod + def get_site_by_id_type(typ: str): + return SiteManager.registry[typ]() if typ in SiteManager.registry else None - @classmethod - def get_site_by_url(cls, url: str): - cls = next(filter(lambda p: p.validate_url(url), cls.registry.values()), None) + @staticmethod + def get_site_by_url(url: str): + cls = next(filter(lambda p: p.validate_url(url), SiteManager.registry.values()), None) if cls is None: - cls = next(filter(lambda p: p.validate_url_fallback(url), cls.registry.values()), None) + cls = next(filter(lambda p: p.validate_url_fallback(url), SiteManager.registry.values()), None) return cls(url) if cls else None - @classmethod - def 
get_id_by_url(cls, url: str): - site = cls.get_site_by_url(url) + @staticmethod + def get_id_by_url(url: str): + site = SiteManager.get_site_by_url(url) return site.url_to_id(url) if site else None @staticmethod diff --git a/catalog/game/models.py b/catalog/game/models.py index bd4281e0..ea6b0b51 100644 --- a/catalog/game/models.py +++ b/catalog/game/models.py @@ -13,13 +13,14 @@ class Game(Item): METADATA_COPY_LIST = [ 'title', + 'brief', 'other_title', 'developer', 'publisher', 'release_date', 'genre', 'platform', - 'brief', + 'official_site', ] other_title = jsondata.ArrayField( @@ -63,3 +64,7 @@ class Game(Item): blank=True, default=list, ) + + official_site = jsondata.CharField( + default='', + ) diff --git a/catalog/movie/models.py b/catalog/movie/models.py index 052a34b6..1d001437 100644 --- a/catalog/movie/models.py +++ b/catalog/movie/models.py @@ -9,7 +9,6 @@ class Movie(Item): imdb = PrimaryLookupIdDescriptor(IdType.IMDB) tmdb_movie = PrimaryLookupIdDescriptor(IdType.TMDB_Movie) douban_movie = PrimaryLookupIdDescriptor(IdType.DoubanMovie) - duration = jsondata.IntegerField(blank=True, default=None) demonstrative = _('这部电影') METADATA_COPY_LIST = [ diff --git a/catalog/sites/douban_book.py b/catalog/sites/douban_book.py index 8532e07e..22ae9119 100644 --- a/catalog/sites/douban_book.py +++ b/catalog/sites/douban_book.py @@ -173,6 +173,7 @@ class DoubanBook(AbstractSite): 'id_value': r[1] if r else None, 'title': data['title'], 'url': works_element[0], + 'content': {'metadata': {'title': data['title']}} }] pd = ResourceContent(metadata=data) @@ -195,14 +196,6 @@ class DoubanBook_Work(AbstractSite): def id_to_url(self, id_value): return "https://book.douban.com/works/" + id_value + "/" - def bypass_scrape(self, data_from_link): - if not data_from_link: - return None - pd = ResourceContent(metadata={ - 'title': data_from_link['title'], - }) - return pd - def scrape(self): content = DoubanDownloader(self.url).download().html() title_elem = 
content.xpath("//h1/text()") diff --git a/catalog/templates/game.html b/catalog/templates/game.html index 41690755..0bfad623 100644 --- a/catalog/templates/game.html +++ b/catalog/templates/game.html @@ -73,13 +73,10 @@ {% endif %} - {% if item.other_info %} - {% for k, v in item.other_info.items %} -
- {{ k }}:{{ v | urlize }} +
{% if item.official_site %} + {% trans '官方网站:' %}{{ item.official_site|urlizetrunc:42 }} + {% endif %}
- {% endfor %} - {% endif %}
diff --git a/catalog/urls.py b/catalog/urls.py
index 9be0f947..4f43cd6a 100644
--- a/catalog/urls.py
+++ b/catalog/urls.py
@@ -15,6 +15,8 @@ def _get_all_url_paths():
 
 
 urlpatterns = [
+    # UUID-addressed items: retrieve_by_uuid looks the item up by uid and redirects to item.url
+    re_path(r'item/(?P<item_uuid>[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12})/', retrieve_by_uuid, name='retrieve_by_uuid'),
     re_path(r'(?P<item_path>' + _get_all_url_paths() + ')/(?P<item_uid>[A-Za-z0-9]{21,22})/', retrieve, name='retrieve'),
     path("api/", api.urls),
 ]
diff --git a/catalog/views.py b/catalog/views.py
index ed2fab53..e16d23bd 100644
--- a/catalog/views.py
+++ b/catalog/views.py
@@ -27,6 +27,12 @@ from journal.models import Mark
 _logger = logging.getLogger(__name__)
 
 
+def retrieve_by_uuid(request, item_uuid):
+    """Redirect to item.url for the Item whose uid equals item_uuid (404 when none exists)."""
+    item = get_object_or_404(Item, uid=item_uuid)
+    return redirect(item.url)
+
+
 def retrieve(request, item_path, item_uid):
     if request.method == 'GET':
         item = get_object_or_404(Item, uid=base62.decode(item_uid))
diff --git a/legacy/__init__.py b/legacy/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/legacy/admin.py b/legacy/admin.py
new file mode 100644
index 00000000..8c38f3f3
--- /dev/null
+++ b/legacy/admin.py
@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.
diff --git a/legacy/apps.py b/legacy/apps.py
new file mode 100644
index 00000000..4e1c1ece
--- /dev/null
+++ b/legacy/apps.py
@@ -0,0 +1,6 @@
+from django.apps import AppConfig
+
+
+class LegacyConfig(AppConfig):
+    default_auto_field = 'django.db.models.BigAutoField'
+    name = 'legacy'
diff --git a/legacy/management/commands/migrate_catalog.py b/legacy/management/commands/migrate_catalog.py
new file mode 100644
index 00000000..44c8241a
--- /dev/null
+++ b/legacy/management/commands/migrate_catalog.py
@@ -0,0 +1,231 @@
+from books.models import Book as Legacy_Book
+from movies.models import Movie as Legacy_Movie
+from music.models import Album as Legacy_Album
+from games.models import Game as Legacy_Game
+from catalog.common import *
+from catalog.models import *
+from catalog.sites import *
+from catalog.book.utils import detect_isbn_asin
+from journal import models as journal_models
+from social import models as social_models
+from django.core.management.base import BaseCommand
+from django.core.paginator import Paginator
+import pprint
+from tqdm import tqdm
+from django.db.models import Q, Count, Sum
+from django.utils import dateparse, timezone
+import re
+from legacy.models import *
+
+
+BATCH_SIZE = 1000
+
+
+def _book_convert(entity):
+    """Convert a legacy Book into a ResourceContent (metadata plus lookup ids)."""
+    info = entity.other_info or {}
+    content = ResourceContent(metadata={
+        'title': entity.title,
+        'brief': entity.brief,
+        'cover_image_path': str(entity.cover),
+
+        'subtitle': entity.subtitle,
+        'orig_title': entity.orig_title,
+        'author': entity.author,
+        'translator': entity.translator,
+        'language': entity.language,
+        'pub_house': entity.pub_house,
+        'pub_year': entity.pub_year,
+        'pub_month': entity.pub_month,
+        'binding': entity.binding,
+        'price': entity.price,
+        'pages': entity.pages,
+        'contents': entity.contents,
+        'series': info.get('丛书'),
+        'imprint': info.get('出品方'),
+    })
+    if entity.isbn:
+        t, v = detect_isbn_asin(entity.isbn)
+        if t:
+            content.lookup_ids[t] = v
+    if info.get('统一书号'):
+        content.lookup_ids[IdType.CUBN] = info.get('统一书号')
+    return content
+
+
+def _album_convert(entity):
+    """Convert a legacy Album into a ResourceContent (metadata plus lookup ids)."""
+    info = entity.other_info or {}
+    content = ResourceContent(metadata={
+        'title': entity.title,
+        'brief': entity.brief,
+        'cover_image_path': str(entity.cover),
+
+        'other_title': info.get('又名'),
+        'album_type': info.get('专辑类型'),
+        'media': info.get('介质'),
+        'disc_count': info.get('碟片数'),
+        'artist': entity.artist,
+        'genre': entity.genre,
+        'release_date': entity.release_date.strftime('%Y-%m-%d') if entity.release_date else None,
+        'duration': entity.duration,
+        'company': entity.company,
+        'track_list': entity.track_list,
+        'bandcamp_album_id': info.get('bandcamp_album_id'),
+    })
+    if info.get('ISRC'):
+        content.lookup_ids[IdType.ISRC] = info.get('ISRC')
+    if info.get('条形码'):
+        content.lookup_ids[IdType.GTIN] = info.get('条形码')
+    # a UPC, when present, overwrites the barcode stored just above
+    if info.get('UPC'):
+        content.lookup_ids[IdType.GTIN] = info.get('UPC')
+    return content
+
+
+def _game_convert(entity):
+    """Convert a legacy Game into a ResourceContent (metadata plus lookup ids)."""
+    info = entity.other_info or {}
+    content = ResourceContent(metadata={
+        'title': entity.title,
+        'brief': entity.brief,
+        'cover_image_path': str(entity.cover),
+
+        'other_title': entity.other_title,
+        'developer': entity.developer,
+        'publisher': entity.publisher,
+        'release_date': entity.release_date.strftime('%Y-%m-%d') if entity.release_date else None,
+        'genre': entity.genre,
+        'platform': entity.platform,
+        'official_site': info.get('official_site'),
+    })
+    if info.get('steam_url'):
+        # re.search returns None for a malformed steam_url; skip the Steam id instead of failing
+        steam_id = re.search(r'store\.steampowered\.com/app/(\d+)', info.get('steam_url'))
+        if steam_id:
+            content.lookup_ids[IdType.Steam] = steam_id[1]
+    return content
+
+
+def _movie_tv_convert(entity):
+    """Convert a legacy Movie (film or TV series) into a ResourceContent."""
+    info = entity.other_info or {}
+    content = ResourceContent(metadata={
+        'title': entity.title,
+        'brief': entity.brief,
+        'cover_image_path': str(entity.cover),
+
+        'orig_title': entity.orig_title,
+        'other_title': entity.other_title,
+        'director': entity.director,
+        'playwright': entity.playwright,
+        'actor': entity.actor,
+        'genre': entity.genre,
+        'showtime': entity.showtime,
+        'site': entity.site,
+        'area': entity.area,
+        'language': entity.language,
+        'year': entity.year,
+        'duration': entity.duration,
+        'season_count': info.get('Seasons'),
+        'season_number': entity.season,
+        'episodes': entity.episodes,
+        'single_episode_length': entity.single_episode_length,
+        'is_series': entity.is_series,
+    })
+    if entity.imdb_code:
+        content.lookup_ids[IdType.IMDB] = entity.imdb_code
+    if info.get('TMDB_ID'):
+        content.lookup_ids[IdType.TMDB_TV] = info.get('TMDB_ID')
+    return content
+
+
+# each legacy model gains a convert() method that returns a ResourceContent
+Legacy_Book.convert = _book_convert
+Legacy_Movie.convert = _movie_tv_convert
+Legacy_Game.convert = _game_convert
+Legacy_Album.convert = _album_convert
+# legacy model -> new catalog model used when the site does not dictate one
+model_map = {
+    Legacy_Book: Edition,
+    Legacy_Movie: Movie,
+    Legacy_Game: Game,
+    Legacy_Album: Album,
+}
+# legacy model -> link table recording old id -> new uid
+model_link = {
+    Legacy_Book: BookLink,
+    Legacy_Movie: MovieLink,
+    Legacy_Game: GameLink,
+    Legacy_Album: AlbumLink,
+}
+
+
+class Command(BaseCommand):
+    """Copy legacy Book/Movie/Album/Game rows into the new catalog and record old-id -> new-uid links."""
+    help = 'Migrate legacy catalog items (books, movies, albums, games)'
+
+    def add_arguments(self, parser):
+        parser.add_argument('--book', dest='types', action='append_const', const=Legacy_Book)
+        parser.add_argument('--movie', dest='types', action='append_const', const=Legacy_Movie)
+        parser.add_argument('--album', dest='types', action='append_const', const=Legacy_Album)
+        parser.add_argument('--game', dest='types', action='append_const', const=Legacy_Game)
+        parser.add_argument('--id', help='id to convert; or, if using with --maxid, the min id')
+        parser.add_argument('--maxid', help='max id to convert')
+        parser.add_argument('--failstop', help='stop on fail', action='store_true')
+        parser.add_argument('--clearlink', help='clear legacy link table', action='store_true')
+        parser.add_argument('--reload', help='reload and ignore existing ExternalResource', action='store_true')
+
+    def handle(self, *args, **options):
+        """Convert the selected legacy types page by page; failures are printed and skipped unless --failstop."""
+        types = options['types'] or [Legacy_Game, Legacy_Album, Legacy_Movie, Legacy_Book]
+        reload = options['reload']
+        for typ in types:
+            print(typ)
+            LinkModel = model_link[typ]
+            if options['clearlink']:
+                LinkModel.objects.all().delete()
+            qs = typ.objects.all().order_by('id')
+            if options['id']:
+                if options['maxid']:
+                    qs = qs.filter(id__gte=int(options['id']), id__lte=int(options['maxid']))
+                else:
+                    qs = qs.filter(id=int(options['id']))
+
+            pg = Paginator(qs, BATCH_SIZE)
+            for p in tqdm(pg.page_range):
+                links = []
+                for entity in pg.get_page(p).object_list:
+                    try:
+                        content = entity.convert()
+                        site = SiteManager.get_site_by_url(entity.source_url)
+                        item = None
+                        if site:
+                            if not site.DEFAULT_MODEL and not content.metadata.get('preferred_model'):
+                                if model_map[typ] != Movie or not content.metadata.get('is_series'):
+                                    content.metadata['preferred_model'] = model_map[typ].__name__
+                                else:  # TV
+                                    # convert() stores the season under 'season_number'
+                                    content.metadata['preferred_model'] = 'TVSeason' if content.metadata.get('season_number') else 'TVShow'
+                            item = site.get_resource_ready(preloaded_content=content, reload=reload).item
+                        else:
+                            # not known site, try save item without external resource
+                            model = model_map[typ]
+                            t, v = None, None
+                            if content.lookup_ids:
+                                t, v = Item.get_best_lookup_id(content.lookup_ids)
+                                item = model.objects.filter(primary_lookup_id_type=t, primary_lookup_id_value=v).first()
+                            if not item:
+                                obj = model.copy_metadata(content.metadata)
+                                obj['primary_lookup_id_type'] = t
+                                obj['primary_lookup_id_value'] = v
+                                item = model.objects.create(**obj)
+                            item.cover = content.metadata['cover_image_path']
+                            item.save()
+                        links.append(LinkModel(old_id=entity.id, new_uid=item.uid))
+                    except Exception as e:
+                        print(f'Convert failed for {entity}: {e}')
+                        if options['failstop']:
+                            raise
+                LinkModel.objects.bulk_create(links)
+        self.stdout.write(self.style.SUCCESS('Done.'))
diff --git a/legacy/models.py b/legacy/models.py
new file mode 100644
index 00000000..f9e4fe51
--- /dev/null
+++ b/legacy/models.py
@@ -0,0 +1,22 @@
+from django.db import models
+
+
+# Link tables: old_id is the legacy row id, new_uid the uid of the migrated catalog item.
+class BookLink(models.Model):
+    old_id = models.IntegerField(db_index=True)
+    new_uid = models.UUIDField()
+
+
+class MovieLink(models.Model):
+    old_id = models.IntegerField(db_index=True)
+    new_uid = models.UUIDField()
+
+
+class AlbumLink(models.Model):
+    old_id = models.IntegerField(db_index=True)
+    new_uid = models.UUIDField()
+
+
+class GameLink(models.Model):
+    old_id = models.IntegerField(db_index=True)
+    new_uid = models.UUIDField()
diff --git a/legacy/tests.py b/legacy/tests.py
new file mode 100644
index 00000000..7ce503c2
--- /dev/null
+++ b/legacy/tests.py
@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.
diff --git a/legacy/views.py b/legacy/views.py
new file mode 100644
index 00000000..91ea44a2
--- /dev/null
+++ b/legacy/views.py
@@ -0,0 +1,3 @@
+from django.shortcuts import render
+
+# Create your views here.