From d343d6e71e6fcd27685b9b6cee3a3557c99f9eb3 Mon Sep 17 00:00:00 2001 From: mein Name Date: Thu, 6 Mar 2025 11:21:01 -0500 Subject: [PATCH] import ndjson --- journal/importers/__init__.py | 8 +- journal/importers/base.py | 197 ++++++++ journal/importers/csv.py | 203 +------- journal/importers/ndjson.py | 447 ++++++++++++++++++ journal/tests/__init__.py | 1 + journal/tests/ndjson.py | 496 ++++++++++++++++++++ users/templates/users/data.html | 190 ++++++-- users/templates/users/user_task_status.html | 27 +- users/urls.py | 2 +- users/views/data.py | 74 ++- 10 files changed, 1378 insertions(+), 267 deletions(-) create mode 100644 journal/importers/base.py create mode 100644 journal/importers/ndjson.py create mode 100644 journal/tests/ndjson.py diff --git a/journal/importers/__init__.py b/journal/importers/__init__.py index ab794d2f..67693cc3 100644 --- a/journal/importers/__init__.py +++ b/journal/importers/__init__.py @@ -5,16 +5,19 @@ from .csv import CsvImporter from .douban import DoubanImporter from .goodreads import GoodreadsImporter from .letterboxd import LetterboxdImporter +from .ndjson import NdjsonImporter from .opml import OPMLImporter -def get_neodb_importer(filename: str) -> type[CsvImporter] | None: +def get_neodb_importer( + filename: str, +) -> type[CsvImporter] | type[NdjsonImporter] | None: if not os.path.exists(filename) or not zipfile.is_zipfile(filename): return None with zipfile.ZipFile(filename, "r") as z: files = z.namelist() if any(f == "journal.ndjson" for f in files): - return None + return NdjsonImporter if any( f.endswith("_mark.csv") or f.endswith("_review.csv") @@ -26,6 +29,7 @@ def get_neodb_importer(filename: str) -> type[CsvImporter] | None: __all__ = [ "CsvImporter", + "NdjsonImporter", "LetterboxdImporter", "OPMLImporter", "DoubanImporter", diff --git a/journal/importers/base.py b/journal/importers/base.py new file mode 100644 index 00000000..8edde4f7 --- /dev/null +++ b/journal/importers/base.py @@ -0,0 +1,197 @@ +import datetime +from typing import Dict, List, Literal, Optional + +from django.conf import settings +from django.utils.dateparse import parse_datetime +from loguru import logger + +from catalog.common.sites import SiteManager +from catalog.models import Edition, IdType, Item, SiteName +from journal.models import ShelfType +from users.models import Task + +_PREFERRED_SITES = [ + SiteName.Fediverse, + SiteName.RSS, + SiteName.TMDB, + SiteName.IMDB, + SiteName.GoogleBooks, + SiteName.Goodreads, + SiteName.IGDB, +] + + +class BaseImporter(Task): + class Meta: + app_label = "journal" # workaround bug in TypedModel + + ImportResult = Literal["imported", "skipped", "failed"] + TaskQueue = "import" + DefaultMetadata = { + "total": 0, + "processed": 0, + "skipped": 0, + "imported": 0, + "failed": 0, + "failed_items": [], + "file": None, + "visibility": 0, + } + + def progress(self, result: ImportResult) -> None: + """Update import progress. 
+ + Args: + result: The import result ('imported', 'skipped', or 'failed') + """ + self.metadata["processed"] += 1 + self.metadata[result] = self.metadata.get(result, 0) + 1 + + if self.metadata["total"]: + progress_percentage = round( + self.metadata["processed"] / self.metadata["total"] * 100 + ) + self.message = f"Progress: {progress_percentage}% - " + else: + self.message = "" + self.message += ( + f"{self.metadata['imported']} imported, " + f"{self.metadata['skipped']} skipped, " + f"{self.metadata['failed']} failed" + ) + self.save(update_fields=["metadata", "message"]) + + def run(self) -> None: + raise NotImplementedError + + def get_item_by_info_and_links( + self, title: str, info_str: str, links: list[str] + ) -> Optional[Item]: + """Find an item based on information from CSV export. + + Args: + title: Item title + info_str: Item info string (space-separated key:value pairs) + links_str: Space-separated URLs + + Returns: + Item if found, None otherwise + """ + site_url = settings.SITE_INFO["site_url"] + "/" + # look for local items first + for link in links: + if link.startswith("/") or link.startswith(site_url): + item = Item.get_by_url(link, resolve_merge=True) + if item and not item.is_deleted: + return item + + sites = [ + SiteManager.get_site_by_url(link, detect_redirection=False) + for link in links + ] + sites = [site for site in sites if site] + sites.sort( + key=lambda x: _PREFERRED_SITES.index(x.SITE_NAME) + if x.SITE_NAME in _PREFERRED_SITES + else 99 + ) + + # match items without extra requests + for site in sites: + item = site.get_item() + if item: + return item + + # match items after HEAD + sites = [ + SiteManager.get_site_by_url(site.url) if site.url else site + for site in sites + ] + sites = [site for site in sites if site] + for site in sites: + item = site.get_item() + if item: + return item + + # fetch from remote + for site in sites: + try: + logger.debug(f"fetching {site.url}") + site.get_resource_ready() + item = site.get_item() + if item: + return item + except Exception as e: + logger.error(f"Error fetching item: {e}") + + # Try using the info string + if info_str: + info_dict = {} + for pair in info_str.strip().split(): + if ":" in pair: + key, value = pair.split(":", 1) + info_dict[key] = value + + # Check for ISBN, IMDB, etc. 
+ item = None + for key, value in info_dict.items(): + if key == "isbn" and value: + item = Edition.objects.filter( + primary_lookup_id_type=IdType.ISBN, + primary_lookup_id_value=value, + ).first() + elif key == "imdb" and value: + item = Item.objects.filter( + primary_lookup_id_type=IdType.IMDB, + primary_lookup_id_value=value, + ).first() + if item: + return item + return None + + def parse_tags(self, tags_str: str) -> List[str]: + """Parse space-separated tags string into a list of tags.""" + if not tags_str: + return [] + return [tag.strip() for tag in tags_str.split() if tag.strip()] + + def parse_info(self, info_str: str) -> Dict[str, str]: + """Parse info string into a dictionary.""" + info_dict = {} + if not info_str: + return info_dict + + for pair in info_str.split(): + if ":" in pair: + key, value = pair.split(":", 1) + info_dict[key] = value + + return info_dict + + def parse_datetime(self, timestamp_str: str | None) -> Optional[datetime.datetime]: + """Parse ISO format timestamp into datetime.""" + if not timestamp_str: + return None + + try: + dt = parse_datetime(timestamp_str) + if dt and dt.tzinfo is None: + dt = dt.replace(tzinfo=datetime.UTC) + return dt + except Exception as e: + logger.error(f"Error parsing datetime {timestamp_str}: {e}") + return None + + def parse_shelf_type(self, status_str: str) -> ShelfType: + """Parse shelf type string into ShelfType enum.""" + if not status_str: + return ShelfType.WISHLIST + + status_map = { + "wishlist": ShelfType.WISHLIST, + "progress": ShelfType.PROGRESS, + "complete": ShelfType.COMPLETE, + "dropped": ShelfType.DROPPED, + } + + return status_map.get(status_str.lower(), ShelfType.WISHLIST) diff --git a/journal/importers/csv.py b/journal/importers/csv.py index 93656bba..f84bc98a 100644 --- a/journal/importers/csv.py +++ b/journal/importers/csv.py @@ -1,181 +1,20 @@ import csv -import datetime import os import tempfile import zipfile -from typing import Dict, List, Optional +from typing import Dict -from django.conf import settings from django.utils import timezone -from django.utils.dateparse import parse_datetime from django.utils.translation import gettext as _ from loguru import logger -from catalog.common.sites import SiteManager -from catalog.models import Edition, IdType, Item, ItemCategory, SiteName -from journal.models import Mark, Note, Review, ShelfType -from users.models import Task +from catalog.models import ItemCategory +from journal.models import Mark, Note, Review -_PREFERRED_SITES = [ - SiteName.Fediverse, - SiteName.RSS, - SiteName.TMDB, - SiteName.IMDB, - SiteName.GoogleBooks, - SiteName.Goodreads, - SiteName.IGDB, -] +from .base import BaseImporter -class CsvImporter(Task): - class Meta: - app_label = "journal" # workaround bug in TypedModel - - TaskQueue = "import" - DefaultMetadata = { - "total": 0, - "processed": 0, - "skipped": 0, - "imported": 0, - "failed": 0, - "failed_items": [], - "file": None, - "visibility": 0, - } - - def get_item_by_info_and_links( - self, title: str, info_str: str, links_str: str - ) -> Optional[Item]: - """Find an item based on information from CSV export. 
- - Args: - title: Item title - info_str: Item info string (space-separated key:value pairs) - links_str: Space-separated URLs - - Returns: - Item if found, None otherwise - """ - site_url = settings.SITE_INFO["site_url"] + "/" - links = links_str.strip().split() - # look for local items first - for link in links: - if link.startswith("/") or link.startswith(site_url): - item = Item.get_by_url(link, resolve_merge=True) - if item and not item.is_deleted: - return item - - sites = [ - SiteManager.get_site_by_url(link, detect_redirection=False) - for link in links - ] - sites = [site for site in sites if site] - sites.sort( - key=lambda x: _PREFERRED_SITES.index(x.SITE_NAME) - if x.SITE_NAME in _PREFERRED_SITES - else 99 - ) - - # match items without extra requests - for site in sites: - item = site.get_item() - if item: - return item - - # match items after HEAD - sites = [ - SiteManager.get_site_by_url(site.url) if site.url else site - for site in sites - ] - sites = [site for site in sites if site] - for site in sites: - item = site.get_item() - if item: - return item - - # fetch from remote - for site in sites: - try: - logger.debug(f"fetching {site.url}") - site.get_resource_ready() - item = site.get_item() - if item: - return item - except Exception as e: - logger.error(f"Error fetching item: {e}") - - # Try using the info string - if info_str: - info_dict = {} - for pair in info_str.strip().split(): - if ":" in pair: - key, value = pair.split(":", 1) - info_dict[key] = value - - # Check for ISBN, IMDB, etc. - item = None - for key, value in info_dict.items(): - if key == "isbn" and value: - item = Edition.objects.filter( - primary_lookup_id_type=IdType.ISBN, - primary_lookup_id_value=value, - ).first() - elif key == "imdb" and value: - item = Item.objects.filter( - primary_lookup_id_type=IdType.IMDB, - primary_lookup_id_value=value, - ).first() - if item: - return item - return None - - def parse_tags(self, tags_str: str) -> List[str]: - """Parse space-separated tags string into a list of tags.""" - if not tags_str: - return [] - return [tag.strip() for tag in tags_str.split() if tag.strip()] - - def parse_info(self, info_str: str) -> Dict[str, str]: - """Parse info string into a dictionary.""" - info_dict = {} - if not info_str: - return info_dict - - for pair in info_str.split(): - if ":" in pair: - key, value = pair.split(":", 1) - info_dict[key] = value - - return info_dict - - def parse_datetime(self, timestamp_str: str) -> Optional[datetime.datetime]: - """Parse ISO format timestamp into datetime.""" - if not timestamp_str: - return None - - try: - dt = parse_datetime(timestamp_str) - if dt and dt.tzinfo is None: - dt = dt.replace(tzinfo=datetime.UTC) - return dt - except Exception as e: - logger.error(f"Error parsing datetime {timestamp_str}: {e}") - return None - - def parse_shelf_type(self, status_str: str) -> ShelfType: - """Parse shelf type string into ShelfType enum.""" - if not status_str: - return ShelfType.WISHLIST - - status_map = { - "wishlist": ShelfType.WISHLIST, - "progress": ShelfType.PROGRESS, - "complete": ShelfType.COMPLETE, - "dropped": ShelfType.DROPPED, - } - - return status_map.get(status_str.lower(), ShelfType.WISHLIST) - +class CsvImporter(BaseImporter): def import_mark(self, row: Dict[str, str]) -> str: """Import a mark from a CSV row. 
@@ -184,7 +23,9 @@ class CsvImporter(Task): """ try: item = self.get_item_by_info_and_links( - row.get("title", ""), row.get("info", ""), row.get("links", "") + row.get("title", ""), + row.get("info", ""), + row.get("links", "").strip().split(), ) if not item: @@ -246,7 +87,9 @@ class CsvImporter(Task): """ try: item = self.get_item_by_info_and_links( - row.get("title", ""), row.get("info", ""), row.get("links", "") + row.get("title", ""), + row.get("info", ""), + row.get("links", "").strip().split(), ) if not item: @@ -304,7 +147,9 @@ class CsvImporter(Task): """ try: item = self.get_item_by_info_and_links( - row.get("title", ""), row.get("info", ""), row.get("links", "") + row.get("title", ""), + row.get("info", ""), + row.get("links", "").strip().split(), ) if not item: @@ -361,26 +206,6 @@ class CsvImporter(Task): ) return "failed" - def progress(self, result: str) -> None: - """Update import progress. - - Args: - result: The import result ('imported', 'skipped', or 'failed') - """ - self.metadata["processed"] += 1 - self.metadata[result] = self.metadata.get(result, 0) + 1 - - progress_percentage = round( - self.metadata["processed"] / self.metadata["total"] * 100 - ) - self.message = ( - f"Progress: {progress_percentage}% - " - f"{self.metadata['imported']} imported, " - f"{self.metadata['skipped']} skipped, " - f"{self.metadata['failed']} failed" - ) - self.save(update_fields=["metadata", "message"]) - def process_csv_file(self, file_path: str, import_function) -> None: """Process a CSV file using the specified import function.""" logger.debug(f"Processing {file_path}") diff --git a/journal/importers/ndjson.py b/journal/importers/ndjson.py new file mode 100644 index 00000000..bb243923 --- /dev/null +++ b/journal/importers/ndjson.py @@ -0,0 +1,447 @@ +import json +import os +import tempfile +import zipfile +from typing import Any, Dict + +from django.utils.translation import gettext as _ +from loguru import logger + +from journal.models import ( + Collection, + Comment, + Mark, + Note, + Rating, + Review, + ShelfLogEntry, + ShelfType, + Tag, + TagMember, +) + +from .base import BaseImporter + + +class NdjsonImporter(BaseImporter): + """Importer for NDJSON files exported from NeoDB.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.items = {} + + def import_collection(self, data: Dict[str, Any]) -> BaseImporter.ImportResult: + """Import a collection from NDJSON data.""" + try: + owner = self.user.identity + visibility = data.get("visibility", self.metadata.get("visibility", 0)) + metadata = data.get("metadata", {}) + content_data = data.get("content", {}) + published_dt = self.parse_datetime(content_data.get("published")) + name = content_data.get("name", "") + content = content_data.get("content", "") + collection = Collection.objects.create( + owner=owner, + title=name, + brief=content, + visibility=visibility, + metadata=data.get("metadata", {}), + created_time=published_dt, + ) + item_data = data.get("items", []) + for item_entry in item_data: + item_url = item_entry.get("item") + if not item_url: + continue + item = self.items.get(item_url) + if not item: + logger.warning(f"Could not find item for collection: {item_url}") + continue + metadata = item_entry.get("metadata", {}) + collection.append_item(item, metadata=metadata) + return "imported" + except Exception as e: + logger.error(f"Error importing collection: {e}") + return "failed" + + def import_shelf_member(self, data: Dict[str, Any]) -> BaseImporter.ImportResult: + """Import a shelf 
member (mark) from NDJSON data.""" + try: + owner = self.user.identity + visibility = data.get("visibility", self.metadata.get("visibility", 0)) + metadata = data.get("metadata", {}) + content_data = data.get("content", {}) + published_dt = self.parse_datetime(content_data.get("published")) + item = self.items.get(content_data.get("withRegardTo", "")) + if not item: + raise KeyError(f"Could not find item: {data.get('item', '')}") + shelf_type = content_data.get("status", ShelfType.WISHLIST) + mark = Mark(owner, item) + if mark.created_time and published_dt and mark.created_time >= published_dt: + return "skipped" + mark.update( + shelf_type=shelf_type, + visibility=visibility, + metadata=metadata, + created_time=published_dt, + ) + return "imported" + except Exception as e: + logger.error(f"Error importing shelf member: {e}") + return "failed" + + def import_shelf_log(self, data: Dict[str, Any]) -> BaseImporter.ImportResult: + """Import a shelf log entry from NDJSON data.""" + try: + item = self.items.get(data.get("item", "")) + if not item: + raise KeyError(f"Could not find item: {data.get('item', '')}") + owner = self.user.identity + shelf_type = data.get("status", ShelfType.WISHLIST) + # posts = data.get("posts", []) # TODO but will be tricky + timestamp = data.get("timestamp") + timestamp_dt = self.parse_datetime(timestamp) if timestamp else None + _, created = ShelfLogEntry.objects.update_or_create( + owner=owner, + item=item, + shelf_type=shelf_type, + timestamp=timestamp_dt, + ) + # return "imported" if created else "skipped" + # count skip as success otherwise it may confuse user + return "imported" + except Exception as e: + logger.error(f"Error importing shelf log: {e}") + return "failed" + + def import_post(self, data: Dict[str, Any]) -> BaseImporter.ImportResult: + """Import a post from NDJSON data.""" + # TODO + return "skipped" + + def import_review(self, data: Dict[str, Any]) -> BaseImporter.ImportResult: + """Import a review from NDJSON data.""" + try: + owner = self.user.identity + visibility = data.get("visibility", self.metadata.get("visibility", 0)) + metadata = data.get("metadata", {}) + content_data = data.get("content", {}) + published_dt = self.parse_datetime(content_data.get("published")) + item = self.items.get(content_data.get("withRegardTo", "")) + if not item: + raise KeyError(f"Could not find item: {data.get('item', '')}") + name = content_data.get("name", "") + content = content_data.get("content", "") + existing_review = Review.objects.filter( + owner=owner, item=item, title=name + ).first() + if ( + existing_review + and existing_review.created_time + and published_dt + and existing_review.created_time >= published_dt + ): + return "skipped" + Review.objects.create( + owner=owner, + item=item, + title=name, + body=content, + created_time=published_dt, + visibility=visibility, + metadata=metadata, + ) + return "imported" + except Exception as e: + logger.error(f"Error importing review: {e}") + return "failed" + + def import_note(self, data: Dict[str, Any]) -> BaseImporter.ImportResult: + """Import a note from NDJSON data.""" + try: + owner = self.user.identity + visibility = data.get("visibility", self.metadata.get("visibility", 0)) + content_data = data.get("content", {}) + published_dt = self.parse_datetime(content_data.get("published")) + item = self.items.get(content_data.get("withRegardTo", "")) + if not item: + raise KeyError(f"Could not find item: {data.get('item', '')}") + title = content_data.get("title", "") + content = content_data.get("content", 
"") + sensitive = content_data.get("sensitive", False) + progress = content_data.get("progress", {}) + progress_type = progress.get("type", "") + progress_value = progress.get("value", "") + Note.objects.create( + item=item, + owner=owner, + title=title, + content=content, + sensitive=sensitive, + progress_type=progress_type, + progress_value=progress_value, + created_time=published_dt, + visibility=visibility, + metadata=data.get("metadata", {}), + ) + return "imported" + except Exception as e: + logger.error(f"Error importing note: {e}") + return "failed" + + def import_comment(self, data: Dict[str, Any]) -> BaseImporter.ImportResult: + """Import a comment from NDJSON data.""" + try: + owner = self.user.identity + visibility = data.get("visibility", self.metadata.get("visibility", 0)) + metadata = data.get("metadata", {}) + content_data = data.get("content", {}) + published_dt = self.parse_datetime(content_data.get("published")) + item = self.items.get(content_data.get("withRegardTo", "")) + if not item: + raise KeyError(f"Could not find item: {data.get('item', '')}") + content = content_data.get("content", "") + existing_comment = Comment.objects.filter(owner=owner, item=item).first() + if ( + existing_comment + and existing_comment.created_time + and published_dt + and existing_comment.created_time >= published_dt + ): + return "skipped" + Comment.objects.create( + owner=owner, + item=item, + text=content, + created_time=published_dt, + visibility=visibility, + metadata=metadata, + ) + return "imported" + except Exception as e: + logger.error(f"Error importing comment: {e}") + return "failed" + + def import_rating(self, data: Dict[str, Any]) -> BaseImporter.ImportResult: + """Import a rating from NDJSON data.""" + try: + owner = self.user.identity + visibility = data.get("visibility", self.metadata.get("visibility", 0)) + metadata = data.get("metadata", {}) + content_data = data.get("content", {}) + published_dt = self.parse_datetime(content_data.get("published")) + item = self.items.get(content_data.get("withRegardTo", "")) + if not item: + raise KeyError(f"Could not find item: {data.get('item', '')}") + rating_grade = int(float(content_data.get("value", 0))) + existing_rating = Comment.objects.filter(owner=owner, item=item).first() + if ( + existing_rating + and existing_rating.created_time + and published_dt + and existing_rating.created_time >= published_dt + ): + return "skipped" + Rating.objects.create( + owner=owner, + item=item, + grade=rating_grade, + created_time=published_dt, + visibility=visibility, + metadata=metadata, + ) + return "imported" + except Exception as e: + logger.error(f"Error importing rating: {e}") + return "failed" + + def import_tag(self, data: Dict[str, Any]) -> BaseImporter.ImportResult: + """Import tags from NDJSON data.""" + try: + owner = self.user.identity + visibility = data.get("visibility", self.metadata.get("visibility", 0)) + pinned = data.get("pinned", self.metadata.get("pinned", False)) + tag_title = Tag.cleanup_title(data.get("name", "")) + _, created = Tag.objects.update_or_create( + owner=owner, + title=tag_title, + defaults={ + "visibility": visibility, + "pinned": pinned, + }, + ) + return "imported" if created else "skipped" + except Exception as e: + logger.error(f"Error importing tag member: {e}") + return "failed" + + def import_tag_member(self, data: Dict[str, Any]) -> BaseImporter.ImportResult: + """Import tags from NDJSON data.""" + try: + owner = self.user.identity + visibility = data.get("visibility", 
self.metadata.get("visibility", 0)) + metadata = data.get("metadata", {}) + content_data = data.get("content", {}) + published_dt = self.parse_datetime(content_data.get("published")) + item = self.items.get(content_data.get("withRegardTo", "")) + if not item: + raise KeyError(f"Could not find item: {data.get('item', '')}") + tag_title = Tag.cleanup_title(content_data.get("tag", "")) + tag, _ = Tag.objects.get_or_create( + owner=owner, + title=tag_title, + defaults={ + "created_time": published_dt, + "visibility": visibility, + "pinned": False, + "metadata": metadata, + }, + ) + _, created = TagMember.objects.update_or_create( + owner=owner, + item=item, + parent=tag, + defaults={ + "created_time": published_dt, + "visibility": visibility, + "metadata": metadata, + "position": 0, + }, + ) + return "imported" if created else "skipped" + except Exception as e: + logger.error(f"Error importing tag member: {e}") + return "failed" + + def process_journal(self, file_path: str) -> None: + """Process a NDJSON file and import all items.""" + logger.debug(f"Processing {file_path}") + lines_error = 0 + import_funcs = { + "Tag": self.import_tag, + "TagMember": self.import_tag_member, + "Rating": self.import_rating, + "Comment": self.import_comment, + "ShelfMember": self.import_shelf_member, + "Review": self.import_review, + "Note": self.import_note, + "Collection": self.import_collection, + "ShelfLog": self.import_shelf_log, + "Post": self.import_post, + } + journal = {k: [] for k in import_funcs.keys()} + with open(file_path, "r") as jsonfile: + # Skip header line + next(jsonfile, None) + + for line in jsonfile: + try: + data = json.loads(line) + except json.JSONDecodeError: + lines_error += 1 + continue + data_type = data.get("type") + if not data_type: + continue + if data_type not in journal: + journal[data_type] = [] + journal[data_type].append(data) + + self.metadata["total"] = sum(len(items) for items in journal.values()) + logger.debug(f"Processing {self.metadata['total']} entries") + if lines_error: + logger.error(f"Error processing journal.ndjson: {lines_error} lines") + + for typ, func in import_funcs.items(): + for data in journal.get(typ, []): + result = func(data) + self.progress(result) + logger.info( + f"Imported {self.metadata['imported']}, skipped {self.metadata['skipped']}, failed {self.metadata['failed']}" + ) + + def parse_catalog(self, file_path: str) -> None: + """Parse the catalog.ndjson file and build item lookup tables.""" + logger.debug(f"Parsing catalog file: {file_path}") + item_count = 0 + try: + with open(file_path, "r") as jsonfile: + for line in jsonfile: + try: + i = json.loads(line) + except (json.JSONDecodeError, Exception) as e: + logger.error(f"Error processing catalog item: {e}") + continue + u = i.get("id") + if not u: + continue + # self.catalog_items[u] = i + item_count += 1 + links = [u] + [r["url"] for r in i.get("external_resources", [])] + self.items[u] = self.get_item_by_info_and_links("", "", links) + logger.info(f"Loaded {item_count} items from catalog") + self.metadata["catalog_processed"] = item_count + except Exception as e: + logger.error(f"Error parsing catalog file: {e}") + + def parse_header(self, file_path: str) -> Dict[str, Any]: + try: + with open(file_path, "r") as jsonfile: + first_line = jsonfile.readline().strip() + if first_line: + header = json.loads(first_line) + if header.get("server"): + return header + except (json.JSONDecodeError, IOError) as e: + logger.error(f"Error parsing NDJSON header: {e}") + return {} + + def run(self) -> 
None: + """Run the NDJSON import.""" + filename = self.metadata["file"] + logger.debug(f"Importing {filename}") + + with zipfile.ZipFile(filename, "r") as zipref: + with tempfile.TemporaryDirectory() as tmpdirname: + zipref.extractall(tmpdirname) + + catalog_path = os.path.join(tmpdirname, "catalog.ndjson") + if os.path.exists(catalog_path): + catalog_header = self.parse_header(catalog_path) + logger.debug(f"Loading catalog.ndjson with {catalog_header}") + self.parse_catalog(catalog_path) + else: + logger.warning("catalog.ndjson file not found in the archive") + + journal_path = os.path.join(tmpdirname, "journal.ndjson") + if not os.path.exists(journal_path): + logger.error("journal.ndjson file not found in the archive") + self.message = "Import failed: journal.ndjson file not found" + self.save() + return + header = self.parse_header(journal_path) + self.metadata["journal_header"] = header + logger.debug(f"Importing journal.ndjson with {header}") + self.process_journal(journal_path) + + source_info = self.metadata.get("journal_header", {}) + source_summary = f" from {source_info.get('username', 'unknown')}@{source_info.get('server', 'unknown')} ver:{source_info.get('neodb_version', 'unknown')}." + self.message = _("Import complete") + source_summary + + metadata_stats = self.metadata.get("metadata_stats", {}) + partial_updates = metadata_stats.get("partial_updates", 0) + if partial_updates > 0: + self.message += f", {partial_updates} items with partial metadata updates" + + ratings = metadata_stats.get("ratings_updated", 0) + comments = metadata_stats.get("comments_updated", 0) + tags = metadata_stats.get("tags_updated", 0) + + if ratings > 0 or comments > 0 or tags > 0: + self.message += ( + f" ({ratings} ratings, {comments} comments, {tags} tag sets)" + ) + + if self.metadata.get("failed_items", []): + self.message += f": {self.metadata['failed']} items failed ({len(self.metadata['failed_items'])} unique items)" + self.save() diff --git a/journal/tests/__init__.py b/journal/tests/__init__.py index e94df8d2..f3bf6f07 100644 --- a/journal/tests/__init__.py +++ b/journal/tests/__init__.py @@ -1,3 +1,4 @@ from .csv import * +from .ndjson import * from .piece import * from .search import * diff --git a/journal/tests/ndjson.py b/journal/tests/ndjson.py new file mode 100644 index 00000000..f882e6d0 --- /dev/null +++ b/journal/tests/ndjson.py @@ -0,0 +1,496 @@ +import json +import os +import zipfile +from tempfile import TemporaryDirectory + +from django.test import TestCase +from django.utils.dateparse import parse_datetime +from loguru import logger + +from catalog.models import ( + Edition, + IdType, + Movie, + Podcast, + PodcastEpisode, + TVEpisode, + TVSeason, + TVShow, +) +from journal.exporters import NdjsonExporter +from journal.importers import NdjsonImporter, get_neodb_importer +from users.models import User + +from ..models import * + + +class NdjsonExportImportTest(TestCase): + databases = "__all__" + maxDiff = None + + def setUp(self): + self.user1 = User.register( + email="ndjson_export@test.com", username="ndjson_exporter" + ) + self.user2 = User.register( + email="ndjson_import@test.com", username="ndjson_importer" + ) + self.tag1 = Tag.objects.create( + owner=self.user1.identity, title="favorite", pinned=True, visibility=2 + ) + self.dt = parse_datetime("2021-01-01T00:00:00Z") + self.dt2 = parse_datetime("2021-02-01T00:00:00Z") + self.dt3 = parse_datetime("2021-03-01T00:00:00Z") + self.book1 = Edition.objects.create( + localized_title=[{"lang": "en", "text": "Hyperion"}], + 
primary_lookup_id_type=IdType.ISBN, + primary_lookup_id_value="9780553283686", + author=["Dan Simmons"], + pub_year=1989, + ) + self.book2 = Edition.objects.create( + localized_title=[{"lang": "en", "text": "Dune"}], + primary_lookup_id_type=IdType.ISBN, + primary_lookup_id_value="9780441172719", + author=["Frank Herbert"], + pub_year=1965, + ) + self.movie1 = Movie.objects.create( + localized_title=[{"lang": "en", "text": "Inception"}], + primary_lookup_id_type=IdType.IMDB, + primary_lookup_id_value="tt1375666", + director=["Christopher Nolan"], + year=2010, + ) + self.movie2 = Movie.objects.create( + localized_title=[{"lang": "en", "text": "The Matrix"}], + primary_lookup_id_type=IdType.IMDB, + primary_lookup_id_value="tt0133093", + director=["Lana Wachowski", "Lilly Wachowski"], + year=1999, + ) + self.tvshow = TVShow.objects.create( + localized_title=[{"lang": "en", "text": "Breaking Bad"}], + primary_lookup_id_type=IdType.IMDB, + primary_lookup_id_value="tt0903747", + year=2008, + ) + self.tvseason = TVSeason.objects.create( + localized_title=[{"lang": "en", "text": "Breaking Bad Season 1"}], + show=self.tvshow, + season_number=1, + ) + self.tvepisode1 = TVEpisode.objects.create( + localized_title=[{"lang": "en", "text": "Pilot"}], + season=self.tvseason, + episode_number=1, + ) + self.tvepisode2 = TVEpisode.objects.create( + localized_title=[{"lang": "en", "text": "Cat's in the Bag..."}], + season=self.tvseason, + episode_number=2, + ) + # Create podcast test items + self.podcast = Podcast.objects.create( + localized_title=[{"lang": "en", "text": "Test Podcast"}], + primary_lookup_id_type=IdType.RSS, + primary_lookup_id_value="https://example.com/feed.xml", + host=["Test Host"], + ) + self.podcastepisode = PodcastEpisode.objects.create( + localized_title=[{"lang": "en", "text": "Test Episode 1"}], + program=self.podcast, + guid="111", + pub_date=self.dt, + ) + + def test_ndjson_export_import(self): + # Create marks, reviews and notes for user1 + + # Book marks with ratings and tags + mark_book1 = Mark(self.user1.identity, self.book1) + mark_book1.update( + ShelfType.COMPLETE, + "Great sci-fi classic", + 10, + ["sci-fi", "favorite", "space"], + 1, + created_time=self.dt, + ) + mark_book2 = Mark(self.user1.identity, self.book2) + mark_book2.update( + ShelfType.WISHLIST, + "Read it?", + None, + ["sci-fi", "desert"], + 1, + created_time=self.dt, + ) + mark_book2.update( + ShelfType.PROGRESS, + "Reading!", + None, + ["sci-fi", "desert"], + 0, + created_time=self.dt2, + ) + mark_book2.update( + ShelfType.COMPLETE, + "Read.", + None, + ["sci-fi", "desert"], + 0, + created_time=self.dt3, + ) + + # Movie marks with ratings + mark_movie1 = Mark(self.user1.identity, self.movie1) + mark_movie1.update( + ShelfType.COMPLETE, + "Mind-bending", + 8, + ["mindbender", "scifi"], + 1, + created_time=self.dt, + ) + + mark_movie2 = Mark(self.user1.identity, self.movie2) + mark_movie2.update( + ShelfType.WISHLIST, "Need to rewatch", None, [], 1, created_time=self.dt2 + ) + + # TV show mark + mark_tvshow = Mark(self.user1.identity, self.tvshow) + mark_tvshow.update( + ShelfType.WISHLIST, + "Heard it's good", + None, + ["drama"], + 1, + created_time=self.dt, + ) + + # TV episode marks + mark_episode1 = Mark(self.user1.identity, self.tvepisode1) + mark_episode1.update( + ShelfType.COMPLETE, + "Great start", + 9, + ["pilot", "drama"], + 1, + created_time=self.dt2, + ) + + mark_episode2 = Mark(self.user1.identity, self.tvepisode2) + mark_episode2.update( + ShelfType.COMPLETE, "It gets better", 9, [], 1, 
created_time=self.dt3 + ) + + # Podcast episode mark + mark_podcast = Mark(self.user1.identity, self.podcastepisode) + mark_podcast.update( + ShelfType.COMPLETE, + "Insightful episode", + 8, + ["tech", "interview"], + 1, + created_time=self.dt, + ) + + # Create reviews + Review.update_item_review( + self.book1, + self.user1.identity, + "My thoughts on Hyperion", + "A masterpiece of science fiction that weaves multiple storylines into a captivating narrative.", + visibility=1, + created_time=self.dt, + ) + + Review.update_item_review( + self.movie1, + self.user1.identity, + "Inception Review", + "Christopher Nolan at his best. The movie plays with reality and dreams in a fascinating way.", + visibility=1, + ) + + # Create notes + Note.objects.create( + item=self.book2, + owner=self.user1.identity, + title="Reading progress", + content="Just finished the first part. The world-building is incredible.\n\n - p 125", + progress_type=Note.ProgressType.PAGE, + progress_value="125", + visibility=1, + ) + + Note.objects.create( + item=self.tvshow, + owner=self.user1.identity, + title="Before watching", + content="Things to look out for according to friends:\n- Character development\n- Color symbolism\n\n - e 0", + progress_type=Note.ProgressType.EPISODE, + progress_value="2", + visibility=1, + ) + + # Create TV episode note + Note.objects.create( + item=self.tvepisode1, + owner=self.user1.identity, + title="Episode thoughts", + content="Great pilot episode. Sets up the character arcs really well.", + visibility=1, + ) + + # Create podcast episode note + Note.objects.create( + item=self.podcastepisode, + owner=self.user1.identity, + title="Podcast episode notes", + content="Interesting discussion about tech trends. Timestamp 23:45 has a good point about AI.", + progress_type=Note.ProgressType.TIMESTAMP, + progress_value="23:45", + visibility=1, + ) + + # Create collections + items = [self.book1, self.movie1] + collection = Collection.objects.create( + owner=self.user1.identity, + title="Favorites", + brief="My all-time favorites", + visibility=1, + ) + for i in items: + collection.append_item(i) + + # Create another collection with different items + items2 = [self.book2, self.movie2, self.tvshow] + collection2 = Collection.objects.create( + owner=self.user1.identity, + title="To Review", + brief="Items I need to review soon", + visibility=1, + ) + for i in items2: + collection2.append_item(i) + + # Create shelf log entries + logs = ShelfLogEntry.objects.filter(owner=self.user1.identity).order_by( + "timestamp", "item_id" + ) + + # Export data to NDJSON + exporter = NdjsonExporter.create(user=self.user1) + exporter.run() + export_path = exporter.metadata["file"] + logger.debug(f"exported to {export_path}") + self.assertTrue(os.path.exists(export_path)) + + # Validate the NDJSON export file structure + with TemporaryDirectory() as extract_dir: + with zipfile.ZipFile(export_path, "r") as zip_ref: + zip_ref.extractall(extract_dir) + logger.debug(f"unzipped to {extract_dir}") + + # Check journal.ndjson exists + journal_path = os.path.join(extract_dir, "journal.ndjson") + self.assertTrue( + os.path.exists(journal_path), "journal.ndjson file missing" + ) + + # Check catalog.ndjson exists + catalog_path = os.path.join(extract_dir, "catalog.ndjson") + self.assertTrue( + os.path.exists(catalog_path), "catalog.ndjson file missing" + ) + + # Check attachments directory exists + attachments_path = os.path.join(extract_dir, "attachments") + self.assertTrue( + os.path.exists(attachments_path), "attachments 
directory missing" + ) + + # Count the number of JSON objects in journal.ndjson + with open(journal_path, "r") as f: + lines = f.readlines() + # First line is header, rest are data + self.assertGreater( + len(lines), 1, "journal.ndjson has no data lines" + ) + + # Check the first line is a header + header = json.loads(lines[0]) + self.assertIn("server", header, "Missing server in header") + self.assertIn("username", header, "Missing username in header") + self.assertEqual( + header["username"], + "ndjson_exporter", + "Wrong username in header", + ) + + # Count data objects by type + type_counts = { + "ShelfMember": 0, + "Review": 0, + "Note": 0, + "Collection": 0, + "ShelfLog": 0, + "post": 0, + } + + for line in lines[1:]: + data = json.loads(line) + if "type" in data: + type_counts[data["type"]] = ( + type_counts.get(data["type"], 0) + 1 + ) + + # Verify counts + self.assertEqual( + type_counts["ShelfMember"], 8, "Expected 8 ShelfMember entries" + ) + self.assertEqual( + type_counts["Review"], 2, "Expected 2 Review entries" + ) + self.assertEqual(type_counts["Note"], 4, "Expected 4 Note entries") + self.assertEqual( + type_counts["Collection"], 2, "Expected 2 Collection entries" + ) + self.assertEqual(type_counts["ShelfLog"], logs.count()) + + # Now import the export file into a different user account + self.assertEqual(get_neodb_importer(export_path), NdjsonImporter) + importer = NdjsonImporter.create( + user=self.user2, file=export_path, visibility=2 + ) + importer.run() + self.assertIn("Import complete", importer.message) + + # Verify imported data + + # Check marks + mark_book1_imported = Mark(self.user2.identity, self.book1) + self.assertEqual(mark_book1_imported.shelf_type, ShelfType.COMPLETE) + self.assertEqual(mark_book1_imported.comment_text, "Great sci-fi classic") + self.assertEqual(mark_book1_imported.rating_grade, 10) + self.assertEqual(mark_book1_imported.visibility, 1) + self.assertEqual( + set(mark_book1_imported.tags), set(["sci-fi", "favorite", "space"]) + ) + + mark_book2_imported = Mark(self.user2.identity, self.book2) + self.assertEqual(mark_book2_imported.shelf_type, ShelfType.COMPLETE) + self.assertEqual(mark_book2_imported.comment_text, "Read.") + self.assertIsNone(mark_book2_imported.rating_grade) + self.assertEqual(set(mark_book2_imported.tags), set(["sci-fi", "desert"])) + self.assertEqual(mark_book2_imported.visibility, 0) + + mark_movie1_imported = Mark(self.user2.identity, self.movie1) + self.assertEqual(mark_movie1_imported.shelf_type, ShelfType.COMPLETE) + self.assertEqual(mark_movie1_imported.comment_text, "Mind-bending") + self.assertEqual(mark_movie1_imported.rating_grade, 8) + self.assertEqual(set(mark_movie1_imported.tags), set(["mindbender", "scifi"])) + + mark_episode1_imported = Mark(self.user2.identity, self.tvepisode1) + self.assertEqual(mark_episode1_imported.shelf_type, ShelfType.COMPLETE) + self.assertEqual(mark_episode1_imported.comment_text, "Great start") + self.assertEqual(mark_episode1_imported.rating_grade, 9) + self.assertEqual(set(mark_episode1_imported.tags), set(["pilot", "drama"])) + + # Check podcast episode mark + mark_podcast_imported = Mark(self.user2.identity, self.podcastepisode) + self.assertEqual(mark_podcast_imported.shelf_type, ShelfType.COMPLETE) + self.assertEqual(mark_podcast_imported.comment_text, "Insightful episode") + self.assertEqual(mark_podcast_imported.rating_grade, 8) + self.assertEqual(set(mark_podcast_imported.tags), set(["tech", "interview"])) + + # Check reviews + book1_reviews = Review.objects.filter( + 
owner=self.user2.identity, item=self.book1 + ) + self.assertEqual(book1_reviews.count(), 1) + self.assertEqual(book1_reviews[0].title, "My thoughts on Hyperion") + self.assertIn("masterpiece of science fiction", book1_reviews[0].body) + + movie1_reviews = Review.objects.filter( + owner=self.user2.identity, item=self.movie1 + ) + self.assertEqual(movie1_reviews.count(), 1) + self.assertEqual(movie1_reviews[0].title, "Inception Review") + self.assertIn("Christopher Nolan", movie1_reviews[0].body) + + # Check notes + book2_notes = Note.objects.filter(owner=self.user2.identity, item=self.book2) + self.assertEqual(book2_notes.count(), 1) + self.assertEqual(book2_notes[0].title, "Reading progress") + self.assertIn("world-building is incredible", book2_notes[0].content) + self.assertEqual(book2_notes[0].progress_type, Note.ProgressType.PAGE) + self.assertEqual(book2_notes[0].progress_value, "125") + + tvshow_notes = Note.objects.filter(owner=self.user2.identity, item=self.tvshow) + self.assertEqual(tvshow_notes.count(), 1) + self.assertEqual(tvshow_notes[0].title, "Before watching") + self.assertIn("Character development", tvshow_notes[0].content) + + # Check TV episode notes + tvepisode_notes = Note.objects.filter( + owner=self.user2.identity, item=self.tvepisode1 + ) + self.assertEqual(tvepisode_notes.count(), 1) + self.assertEqual(tvepisode_notes[0].title, "Episode thoughts") + self.assertIn("Sets up the character arcs", tvepisode_notes[0].content) + + # Check podcast episode notes + podcast_notes = Note.objects.filter( + owner=self.user2.identity, item=self.podcastepisode + ) + self.assertEqual(podcast_notes.count(), 1) + self.assertEqual(podcast_notes[0].title, "Podcast episode notes") + self.assertIn( + "Interesting discussion about tech trends", podcast_notes[0].content + ) + self.assertEqual(podcast_notes[0].progress_type, Note.ProgressType.TIMESTAMP) + self.assertEqual(podcast_notes[0].progress_value, "23:45") + + # Check first collection + collections = Collection.objects.filter( + owner=self.user2.identity, title="Favorites" + ) + self.assertEqual(collections.count(), 1) + self.assertEqual(collections[0].brief, "My all-time favorites") + self.assertEqual(collections[0].visibility, 1) + collection_items = list(collections[0].ordered_items) + self.assertEqual([self.book1, self.movie1], collection_items) + + # Check second collection + collections2 = Collection.objects.filter( + owner=self.user2.identity, title="To Review" + ) + self.assertEqual(collections2.count(), 1) + self.assertEqual(collections2[0].brief, "Items I need to review soon") + self.assertEqual(collections2[0].visibility, 1) + + # Check second collection items + collection2_items = [m.item for m in collections2[0].members.all()] + self.assertEqual(len(collection2_items), 3) + self.assertIn(self.book2, collection2_items) + self.assertIn(self.movie2, collection2_items) + self.assertIn(self.tvshow, collection2_items) + + tag1 = Tag.objects.filter(owner=self.user2.identity, title="favorite").first() + self.assertIsNotNone(tag1) + if tag1: + self.assertTrue(tag1.pinned) + self.assertEqual(tag1.visibility, 2) + + # Check shelf log entries + logs2 = ShelfLogEntry.objects.filter(owner=self.user2.identity).order_by( + "timestamp", "item_id" + ) + l1 = [(log.item, log.shelf_type, log.timestamp) for log in logs] + l2 = [(log.item, log.shelf_type, log.timestamp) for log in logs2] + self.assertEqual(l1, l2) diff --git a/users/templates/users/data.html b/users/templates/users/data.html index 20279ea0..631a1fe9 100644 --- 
a/users/templates/users/data.html +++ b/users/templates/users/data.html @@ -48,7 +48,7 @@
{{ ndjson_export_task.message }} {% if ndjson_export_task.metadata.file %} - {% trans 'Download' %} + {% trans 'Download' %} {% endif %} {% endif %} @@ -86,58 +86,148 @@ {% trans 'Upload a .zip file containing .csv or .ndjson files exported from NeoDB.' %}
  • {% trans 'Existing marks and reviews with newer dates will be preserved.' %}
  • +
  • + {% trans 'Both CSV and NDJSON formats exported from NeoDB are supported. The NDJSON format includes more data, such as collections.' %} +

  • - -

    - {% trans 'Visibility' %}: .csv only -
    - - - -

    + + + + - - {% if csv_import_task %} - {% trans 'Last import started' %}: {{ csv_import_task.created_time }} - {% if csv_import_task.state == 0 or csv_import_task.state == 1 %} -
    - {% else %} - {% trans 'Status' %}: {{ csv_import_task.get_state_display }}。 - {{ csv_import_task.message }} - {% endif %} - {% if csv_import_task.metadata.failed_items %} - {% trans 'Failed items' %}: -
    - - {% endif %} - {% endif %} -
    + + + {% if neodb_import_task %} + {% include "users/user_task_status.html" with task=neodb_import_task %} + {% endif %} diff --git a/users/templates/users/user_task_status.html b/users/templates/users/user_task_status.html index b4f58447..25a8c474 100644 --- a/users/templates/users/user_task_status.html +++ b/users/templates/users/user_task_status.html @@ -1,19 +1,20 @@ {% load i18n %} -
    - {% trans 'Status' %}: {{ task.get_state_display }}。 + {% trans 'Requested' %}: {{ task.created_time }} + ({{ task.get_state_display }}) {{ task.message }} -
    - {% if task.metadata.total and task.metadata.processed %} -
    - -
    - {{ task.metadata.processed }} / {{ task.metadata.total }} - ({{ task.metadata.imported }} imported, - {{ task.metadata.skipped }} skipped, - {{ task.metadata.failed }} failed) + {% if task.state == 0 or task.state == 1 %} + {% if task.metadata.total and task.metadata.processed %} +
    +
    -
    + {% endif %} + {% endif %} + {% if task.metadata.failed_items %} + {% trans 'Failed items' %}: +
    + {% endif %}
    diff --git a/users/urls.py b/users/urls.py index 8f306c6b..7751a9fa 100644 --- a/users/urls.py +++ b/users/urls.py @@ -10,7 +10,7 @@ urlpatterns = [ path("data", data, name="data"), path("info", account_info, name="info"), path("profile", account_profile, name="profile"), - path("task//status", user_task_status, name="user_task_status"), + path("task//status", user_task_status, name="user_task_status"), path("data/import/status", data_import_status, name="import_status"), path("data/import/goodreads", import_goodreads, name="import_goodreads"), path("data/import/douban", import_douban, name="import_douban"), diff --git a/users/views/data.py b/users/views/data.py index 732ff635..eb4104fa 100644 --- a/users/views/data.py +++ b/users/views/data.py @@ -18,6 +18,7 @@ from journal.importers import ( DoubanImporter, GoodreadsImporter, LetterboxdImporter, + NdjsonImporter, OPMLImporter, get_neodb_importer, ) @@ -92,6 +93,20 @@ def data(request): start_date = queryset.aggregate(Min("created_time"))["created_time__min"] start_year = start_date.year if start_date else current_year years = reversed(range(start_year, current_year + 1)) + + # Import tasks - check for both CSV and NDJSON importers + csv_import_task = CsvImporter.latest_task(request.user) + ndjson_import_task = NdjsonImporter.latest_task(request.user) + + # Use the most recent import task for display + if ndjson_import_task and ( + not csv_import_task + or ndjson_import_task.created_time > csv_import_task.created_time + ): + neodb_import_task = ndjson_import_task + else: + neodb_import_task = csv_import_task + return render( request, "users/data.html", @@ -100,7 +115,7 @@ def data(request): "import_task": DoubanImporter.latest_task(request.user), "export_task": DoufenExporter.latest_task(request.user), "csv_export_task": CsvExporter.latest_task(request.user), - "csv_import_task": CsvImporter.latest_task(request.user), + "neodb_import_task": neodb_import_task, # Use the most recent import task "ndjson_export_task": NdjsonExporter.latest_task(request.user), "letterboxd_task": LetterboxdImporter.latest_task(request.user), "goodreads_task": GoodreadsImporter.latest_task(request.user), @@ -121,19 +136,21 @@ def data_import_status(request): @login_required -def user_task_status(request, task_name: str): - match task_name: - case "csv_import": +def user_task_status(request, task_type: str): + match task_type: + case "journal.csvimporter": task_cls = CsvImporter - case "csv_export": + case "journal.ndjsonimporter": + task_cls = NdjsonImporter + case "journal.csvexporter": task_cls = CsvExporter - case "ndjson_export": + case "journal.ndjsonexporter": task_cls = NdjsonExporter - case "letterboxd": + case "journal.letterboxdimporter": task_cls = LetterboxdImporter - case "goodreads": + case "journal.goodreadsimporter": task_cls = GoodreadsImporter - case "douban": + case "journal.doubanimporter": task_cls = DoubanImporter case _: return redirect(reverse("users:data")) @@ -357,16 +374,49 @@ def import_neodb(request): with open(f, "wb+") as destination: for chunk in request.FILES["file"].chunks(): destination.write(chunk) - importer = get_neodb_importer(f) + + # Get format type hint from frontend, if provided + format_type_hint = request.POST.get("format_type", "").lower() + + # Import appropriate class based on format type or auto-detect + from journal.importers import CsvImporter, NdjsonImporter + + if format_type_hint == "csv": + importer = CsvImporter + format_type = "CSV" + elif format_type_hint == "ndjson": + importer = NdjsonImporter + 
format_type = "NDJSON" + else: + # Fall back to auto-detection if no hint provided + importer = get_neodb_importer(f) + if importer == CsvImporter: + format_type = "CSV" + elif importer == NdjsonImporter: + format_type = "NDJSON" + else: + format_type = "" + importer = None # Make sure importer is None if auto-detection fails + if not importer: - messages.add_message(request, messages.ERROR, _("Invalid file.")) + messages.add_message( + request, + messages.ERROR, + _( + "Invalid file. Expected a ZIP containing either CSV or NDJSON files exported from NeoDB." + ), + ) return redirect(reverse("users:data")) + importer.create( request.user, visibility=int(request.POST.get("visibility", 0)), file=f, ).enqueue() + messages.add_message( - request, messages.INFO, _("File is uploaded and will be imported soon.") + request, + messages.INFO, + _(f"{format_type} file is uploaded and will be imported soon."), ) return redirect(reverse("users:data"))