import json
import os
import tempfile
import zipfile
from typing import Any, Dict

from django.utils.translation import gettext as _
from loguru import logger

from journal.models import (
    Collection,
    Comment,
    Mark,
    Note,
    Rating,
    Review,
    ShelfLogEntry,
    ShelfType,
    Tag,
    TagMember,
)

from .base import BaseImporter


class NdjsonImporter(BaseImporter):
    """Importer for NDJSON files exported from NeoDB.

    ``run()`` extracts the uploaded zip archive, loads ``catalog.ndjson``
    into ``self.items`` (item URL -> resolved item) and then replays
    ``journal.ndjson`` entry by entry through the ``import_*`` methods.
    Every ``import_*`` method returns one of ``"imported"``, ``"skipped"``
    or ``"failed"`` and never raises: errors are logged and reported as
    ``"failed"``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Catalog item URL -> item returned by get_item_by_info_and_links();
        # filled by parse_catalog() and read by the import_* methods.
        self.items = {}

    def _lookup_catalog_item(self, item_url: Any) -> Any:
        """Return the catalog item registered under ``item_url``.

        Raises:
            KeyError: if the URL is unknown or resolved to a falsy item. The
                message carries the URL that was actually looked up.
        """
        item = self.items.get(item_url)
        if not item:
            raise KeyError(f"Could not find item: {item_url}")
        return item

    def import_collection(self, data: Dict[str, Any]) -> BaseImporter.ImportResult:
        """Import a collection from NDJSON data.

        Creates a new ``Collection`` from ``data["content"]`` and appends every
        entry of ``data["items"]`` whose URL is present in ``self.items``.
        Entries without an ``item`` URL or with an unknown URL are skipped
        (the latter with a warning).
        """
        try:
            owner = self.user.identity
            visibility = data.get("visibility", self.metadata.get("visibility", 0))
            metadata = data.get("metadata", {})
            content_data = data.get("content", {})
            published_dt = self.parse_datetime(content_data.get("published"))
            name = content_data.get("name", "")
            content = content_data.get("content", "")
            collection = Collection.objects.create(
                owner=owner,
                title=name,
                brief=content,
                visibility=visibility,
                metadata=metadata,
                created_time=published_dt,
            )
            for item_entry in data.get("items", []):
                item_url = item_entry.get("item")
                if not item_url:
                    continue
                item = self.items.get(item_url)
                if not item:
                    logger.warning(f"Could not find item for collection: {item_url}")
                    continue
                # Per-member metadata, distinct from the collection's own.
                item_metadata = item_entry.get("metadata", {})
                collection.append_item(item, metadata=item_metadata)
            return "imported"
        except Exception as e:
            logger.error(f"Error importing collection: {e}")
            return "failed"

    def import_shelf_member(self, data: Dict[str, Any]) -> BaseImporter.ImportResult:
        """Import a shelf member (mark) from NDJSON data.

        The mark is skipped when the existing mark for the item has a
        ``created_time`` at or after the imported ``published`` time.
        """
        try:
            owner = self.user.identity
            visibility = data.get("visibility", self.metadata.get("visibility", 0))
            metadata = data.get("metadata", {})
            content_data = data.get("content", {})
            published_dt = self.parse_datetime(content_data.get("published"))
            item = self._lookup_catalog_item(content_data.get("withRegardTo", ""))
            shelf_type = content_data.get("status", ShelfType.WISHLIST)
            mark = Mark(owner, item)
            if (
                mark.created_time
                and published_dt
                and mark.created_time >= published_dt
            ):
                return "skipped"
            mark.update(
                shelf_type=shelf_type,
                visibility=visibility,
                metadata=metadata,
                created_time=published_dt,
            )
            return "imported"
        except Exception as e:
            logger.error(f"Error importing shelf member: {e}")
            return "failed"

    def import_shelf_log(self, data: Dict[str, Any]) -> BaseImporter.ImportResult:
        """Import a shelf log entry from NDJSON data.

        Unlike the other entry types, the item URL is read from the top-level
        ``data["item"]`` key.
        """
        try:
            item = self._lookup_catalog_item(data.get("item", ""))
            owner = self.user.identity
            shelf_type = data.get("status", ShelfType.WISHLIST)
            # TODO: data.get("posts", []) is not imported yet (will be tricky).
            timestamp = data.get("timestamp")
            timestamp_dt = self.parse_datetime(timestamp) if timestamp else None
            ShelfLogEntry.objects.update_or_create(
                owner=owner,
                item=item,
                shelf_type=shelf_type,
                timestamp=timestamp_dt,
            )
            # An already-existing entry is deliberately counted as success;
            # reporting it as "skipped" may confuse the user.
            return "imported"
        except Exception as e:
            logger.error(f"Error importing shelf log: {e}")
            return "failed"

    def import_post(self, data: Dict[str, Any]) -> BaseImporter.ImportResult:
        """Import a post from NDJSON data (not implemented; always skipped)."""
        # TODO
        return "skipped"

    def import_review(self, data: Dict[str, Any]) -> BaseImporter.ImportResult:
        """Import a review from NDJSON data.

        Skipped when a review with the same owner, item and title already
        exists with a ``created_time`` at or after the imported one.
        """
        try:
            owner = self.user.identity
            visibility = data.get("visibility", self.metadata.get("visibility", 0))
            metadata = data.get("metadata", {})
            content_data = data.get("content", {})
            published_dt = self.parse_datetime(content_data.get("published"))
            item = self._lookup_catalog_item(content_data.get("withRegardTo", ""))
            name = content_data.get("name", "")
            content = content_data.get("content", "")
            existing_review = Review.objects.filter(
                owner=owner, item=item, title=name
            ).first()
            if (
                existing_review
                and existing_review.created_time
                and published_dt
                and existing_review.created_time >= published_dt
            ):
                return "skipped"
            Review.objects.create(
                owner=owner,
                item=item,
                title=name,
                body=content,
                created_time=published_dt,
                visibility=visibility,
                metadata=metadata,
            )
            return "imported"
        except Exception as e:
            logger.error(f"Error importing review: {e}")
            return "failed"

    def import_note(self, data: Dict[str, Any]) -> BaseImporter.ImportResult:
        """Import a note from NDJSON data.

        A new ``Note`` is always created; no duplicate check is performed.
        """
        try:
            owner = self.user.identity
            visibility = data.get("visibility", self.metadata.get("visibility", 0))
            content_data = data.get("content", {})
            published_dt = self.parse_datetime(content_data.get("published"))
            item = self._lookup_catalog_item(content_data.get("withRegardTo", ""))
            title = content_data.get("title", "")
            content = content_data.get("content", "")
            sensitive = content_data.get("sensitive", False)
            progress = content_data.get("progress", {})
            progress_type = progress.get("type", "")
            progress_value = progress.get("value", "")
            Note.objects.create(
                item=item,
                owner=owner,
                title=title,
                content=content,
                sensitive=sensitive,
                progress_type=progress_type,
                progress_value=progress_value,
                created_time=published_dt,
                visibility=visibility,
                metadata=data.get("metadata", {}),
            )
            return "imported"
        except Exception as e:
            logger.error(f"Error importing note: {e}")
            return "failed"

    def import_comment(self, data: Dict[str, Any]) -> BaseImporter.ImportResult:
        """Import a comment from NDJSON data.

        Skipped when the owner already has a comment on the item with a
        ``created_time`` at or after the imported one.
        """
        try:
            owner = self.user.identity
            visibility = data.get("visibility", self.metadata.get("visibility", 0))
            metadata = data.get("metadata", {})
            content_data = data.get("content", {})
            published_dt = self.parse_datetime(content_data.get("published"))
            item = self._lookup_catalog_item(content_data.get("withRegardTo", ""))
            content = content_data.get("content", "")
            existing_comment = Comment.objects.filter(owner=owner, item=item).first()
            if (
                existing_comment
                and existing_comment.created_time
                and published_dt
                and existing_comment.created_time >= published_dt
            ):
                return "skipped"
            Comment.objects.create(
                owner=owner,
                item=item,
                text=content,
                created_time=published_dt,
                visibility=visibility,
                metadata=metadata,
            )
            return "imported"
        except Exception as e:
            logger.error(f"Error importing comment: {e}")
            return "failed"

    def import_rating(self, data: Dict[str, Any]) -> BaseImporter.ImportResult:
        """Import a rating from NDJSON data.

        The grade is ``content["value"]`` truncated to an int. Skipped when the
        owner already has a rating on the item with a ``created_time`` at or
        after the imported one.
        """
        try:
            owner = self.user.identity
            visibility = data.get("visibility", self.metadata.get("visibility", 0))
            metadata = data.get("metadata", {})
            content_data = data.get("content", {})
            published_dt = self.parse_datetime(content_data.get("published"))
            item = self._lookup_catalog_item(content_data.get("withRegardTo", ""))
            # The value may arrive as a float or numeric string, e.g. "8.0".
            rating_grade = int(float(content_data.get("value", 0)))
            # Compare against existing *ratings*; querying Comment here would
            # make the skip decision depend on an unrelated object.
            existing_rating = Rating.objects.filter(owner=owner, item=item).first()
            if (
                existing_rating
                and existing_rating.created_time
                and published_dt
                and existing_rating.created_time >= published_dt
            ):
                return "skipped"
            Rating.objects.create(
                owner=owner,
                item=item,
                grade=rating_grade,
                created_time=published_dt,
                visibility=visibility,
                metadata=metadata,
            )
            return "imported"
        except Exception as e:
            logger.error(f"Error importing rating: {e}")
            return "failed"

    def import_tag(self, data: Dict[str, Any]) -> BaseImporter.ImportResult:
        """Import a tag definition from NDJSON data.

        Creates the tag or updates ``visibility``/``pinned`` of an existing tag
        with the same cleaned-up title. Returns ``"imported"`` only when a new
        tag was created, ``"skipped"`` when an existing one was updated.
        """
        try:
            owner = self.user.identity
            visibility = data.get("visibility", self.metadata.get("visibility", 0))
            pinned = data.get("pinned", self.metadata.get("pinned", False))
            tag_title = Tag.cleanup_title(data.get("name", ""))
            # Avoid unpacking into `_`, which is the module's gettext alias.
            _tag, created = Tag.objects.update_or_create(
                owner=owner,
                title=tag_title,
                defaults={
                    "visibility": visibility,
                    "pinned": pinned,
                },
            )
            return "imported" if created else "skipped"
        except Exception as e:
            logger.error(f"Error importing tag: {e}")
            return "failed"

    def import_tag_member(self, data: Dict[str, Any]) -> BaseImporter.ImportResult:
        """Import a tag membership (an item tagged with a tag) from NDJSON data.

        The tag is created on demand. Returns ``"imported"`` only when a new
        ``TagMember`` was created, ``"skipped"`` when an existing one was
        updated.
        """
        try:
            owner = self.user.identity
            visibility = data.get("visibility", self.metadata.get("visibility", 0))
            metadata = data.get("metadata", {})
            content_data = data.get("content", {})
            published_dt = self.parse_datetime(content_data.get("published"))
            item = self._lookup_catalog_item(content_data.get("withRegardTo", ""))
            tag_title = Tag.cleanup_title(content_data.get("tag", ""))
            tag, _tag_created = Tag.objects.get_or_create(
                owner=owner,
                title=tag_title,
                defaults={
                    "created_time": published_dt,
                    "visibility": visibility,
                    "pinned": False,
                    "metadata": metadata,
                },
            )
            _member, created = TagMember.objects.update_or_create(
                owner=owner,
                item=item,
                parent=tag,
                defaults={
                    "created_time": published_dt,
                    "visibility": visibility,
                    "metadata": metadata,
                    "position": 0,
                },
            )
            return "imported" if created else "skipped"
        except Exception as e:
            logger.error(f"Error importing tag member: {e}")
            return "failed"

    def process_journal(self, file_path: str) -> None:
        """Process a NDJSON file and import all items.

        The first line is treated as a header and skipped. Remaining lines are
        grouped by their ``type`` field and then imported type by type, in the
        order of ``import_funcs`` below, so that e.g. tags exist before tag
        members. Each result is reported through ``self.progress``.
        """
        logger.debug(f"Processing {file_path}")
        lines_error = 0
        import_funcs = {
            "Tag": self.import_tag,
            "TagMember": self.import_tag_member,
            "Rating": self.import_rating,
            "Comment": self.import_comment,
            "ShelfMember": self.import_shelf_member,
            "Review": self.import_review,
            "Note": self.import_note,
            "Collection": self.import_collection,
            "ShelfLog": self.import_shelf_log,
            "Post": self.import_post,
        }
        journal = {k: [] for k in import_funcs}
        # NDJSON is UTF-8; do not depend on the platform's default encoding.
        with open(file_path, "r", encoding="utf-8") as jsonfile:
            # Skip header line
            next(jsonfile, None)
            for line in jsonfile:
                if not line.strip():
                    # Blank lines (e.g. a trailing newline) are not errors.
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    lines_error += 1
                    continue
                if not isinstance(data, dict):
                    # Valid JSON but not an object: cannot carry a "type".
                    lines_error += 1
                    continue
                data_type = data.get("type")
                if not data_type:
                    continue
                # NOTE(review): entries of unknown types are kept and counted
                # in "total" but never dispatched below -- confirm intended.
                journal.setdefault(data_type, []).append(data)

        self.metadata["total"] = sum(len(items) for items in journal.values())
        logger.debug(f"Processing {self.metadata['total']} entries")
        if lines_error:
            logger.error(f"Error processing journal.ndjson: {lines_error} lines")

        for typ, func in import_funcs.items():
            for data in journal.get(typ, []):
                result = func(data)
                self.progress(result)
        logger.info(
            f"Imported {self.metadata['imported']}, skipped {self.metadata['skipped']}, failed {self.metadata['failed']}"
        )

    def parse_catalog(self, file_path: str) -> None:
        """Parse the catalog.ndjson file and build item lookup tables.

        Every line with an ``id`` is resolved through
        ``get_item_by_info_and_links`` using its own URL plus the URLs of its
        ``external_resources`` and stored in ``self.items`` under that id.
        Lines that fail to parse, or that are not JSON objects, are logged and
        skipped. Any other error aborts the parse and is logged; items loaded
        so far are kept.
        """
        logger.debug(f"Parsing catalog file: {file_path}")
        item_count = 0
        try:
            with open(file_path, "r", encoding="utf-8") as jsonfile:
                for line in jsonfile:
                    try:
                        i = json.loads(line)
                    except Exception as e:
                        logger.error(f"Error processing catalog item: {e}")
                        continue
                    if not isinstance(i, dict):
                        logger.error(
                            f"Error processing catalog item: expected object, got {type(i).__name__}"
                        )
                        continue
                    u = i.get("id")
                    if not u:
                        # Also skips the header line, assuming it has no "id"
                        # -- TODO confirm against the export format.
                        continue
                    item_count += 1
                    links = [u] + [r["url"] for r in i.get("external_resources", [])]
                    self.items[u] = self.get_item_by_info_and_links("", "", links)
            logger.info(f"Loaded {item_count} items from catalog")
            self.metadata["catalog_processed"] = item_count
        except Exception as e:
            logger.error(f"Error parsing catalog file: {e}")

    def parse_header(self, file_path: str) -> Dict[str, Any]:
        """Return the header object from the first line of an NDJSON file.

        The first line counts as a header only when it is a JSON object with a
        truthy ``server`` field. Returns ``{}`` when the line is empty, is not
        such an object, or cannot be read or parsed.
        """
        try:
            with open(file_path, "r", encoding="utf-8") as jsonfile:
                first_line = jsonfile.readline().strip()
                if first_line:
                    header = json.loads(first_line)
                    if isinstance(header, dict) and header.get("server"):
                        return header
        except (json.JSONDecodeError, IOError) as e:
            logger.error(f"Error parsing NDJSON header: {e}")
        return {}

    def run(self) -> None:
        """Run the NDJSON import.

        Extracts the archive named by ``self.metadata["file"]`` into a
        temporary directory, loads the optional ``catalog.ndjson``, imports the
        mandatory ``journal.ndjson`` and finally stores a summary in
        ``self.message`` and saves. If ``journal.ndjson`` is missing, a failure
        message is saved and nothing is imported.
        """
        filename = self.metadata["file"]
        logger.debug(f"Importing {filename}")

        with zipfile.ZipFile(filename, "r") as zipref:
            with tempfile.TemporaryDirectory() as tmpdirname:
                zipref.extractall(tmpdirname)

                catalog_path = os.path.join(tmpdirname, "catalog.ndjson")
                if os.path.exists(catalog_path):
                    catalog_header = self.parse_header(catalog_path)
                    logger.debug(f"Loading catalog.ndjson with {catalog_header}")
                    self.parse_catalog(catalog_path)
                else:
                    logger.warning("catalog.ndjson file not found in the archive")

                journal_path = os.path.join(tmpdirname, "journal.ndjson")
                if not os.path.exists(journal_path):
                    logger.error("journal.ndjson file not found in the archive")
                    self.message = "Import failed: journal.ndjson file not found"
                    self.save()
                    return
                header = self.parse_header(journal_path)
                self.metadata["journal_header"] = header
                logger.debug(f"Importing journal.ndjson with {header}")
                self.process_journal(journal_path)

        source_info = self.metadata.get("journal_header", {})
        source_summary = f" from {source_info.get('username', 'unknown')}@{source_info.get('server', 'unknown')} ver:{source_info.get('neodb_version', 'unknown')}."
        self.message = _("Import complete") + source_summary

        metadata_stats = self.metadata.get("metadata_stats", {})
        partial_updates = metadata_stats.get("partial_updates", 0)
        if partial_updates > 0:
            self.message += f", {partial_updates} items with partial metadata updates"

        ratings = metadata_stats.get("ratings_updated", 0)
        comments = metadata_stats.get("comments_updated", 0)
        tags = metadata_stats.get("tags_updated", 0)
        if ratings > 0 or comments > 0 or tags > 0:
            self.message += (
                f" ({ratings} ratings, {comments} comments, {tags} tag sets)"
            )

        if self.metadata.get("failed_items", []):
            self.message += f": {self.metadata['failed']} items failed ({len(self.metadata['failed_items'])} unique items)"
        self.save()