lib.itmens/journal/importers/ndjson.py
2025-03-07 20:23:11 -05:00

447 lines
18 KiB
Python

import json
import os
import tempfile
import zipfile
from typing import Any, Dict
from django.utils.translation import gettext as _
from loguru import logger
from journal.models import (
Collection,
Comment,
Mark,
Note,
Rating,
Review,
ShelfLogEntry,
ShelfType,
Tag,
TagMember,
)
from .base import BaseImporter
class NdjsonImporter(BaseImporter):
    """Importer for NDJSON files exported from NeoDB.

    The zip archive named by ``self.metadata["file"]`` must contain a
    ``journal.ndjson`` file and may contain a ``catalog.ndjson`` file.  The
    catalog is parsed first so journal entries can refer to items by URL.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Catalog item URL -> value returned by get_item_by_info_and_links();
        # populated by parse_catalog() and read by every import_* method.
        self.items = {}

    def _require_item(self, item_url: Any) -> Any:
        """Return the catalog item registered under *item_url*.

        Raises:
            KeyError: if nothing (or a falsy value) is stored for that URL.
        """
        item = self.items.get(item_url)
        if not item:
            raise KeyError(f"Could not find item: {item_url}")
        return item

    def import_collection(self, data: Dict[str, Any]) -> BaseImporter.ImportResult:
        """Import a collection and its members from one journal entry.

        Reads ``visibility``, ``metadata``, ``content`` (``name``, ``content``,
        ``published``) and ``items`` from *data*.  Members without a URL, or
        whose URL was not resolved from the catalog, are skipped with a
        warning instead of failing the whole collection.

        Returns:
            ``"imported"`` on success, ``"failed"`` if any exception is raised.
        """
        try:
            owner = self.user.identity
            visibility = data.get("visibility", self.metadata.get("visibility", 0))
            content_data = data.get("content", {})
            published_dt = self.parse_datetime(content_data.get("published"))
            collection = Collection.objects.create(
                owner=owner,
                title=content_data.get("name", ""),
                brief=content_data.get("content", ""),
                visibility=visibility,
                metadata=data.get("metadata", {}),
                created_time=published_dt,
            )
            for item_entry in data.get("items", []):
                item_url = item_entry.get("item")
                if not item_url:
                    continue
                item = self.items.get(item_url)
                if not item:
                    logger.warning(f"Could not find item for collection: {item_url}")
                    continue
                collection.append_item(item, metadata=item_entry.get("metadata", {}))
            return "imported"
        except Exception as e:
            logger.error(f"Error importing collection: {e}")
            return "failed"

    def import_shelf_member(self, data: Dict[str, Any]) -> BaseImporter.ImportResult:
        """Import a shelf member (mark) from one journal entry.

        The item is looked up by ``content["withRegardTo"]``.  If the existing
        mark's ``created_time`` is not older than the entry's ``published``
        time, the entry is skipped.

        Returns:
            ``"imported"``, ``"skipped"``, or ``"failed"`` if any exception
            (including an unresolved item) is raised.
        """
        try:
            owner = self.user.identity
            visibility = data.get("visibility", self.metadata.get("visibility", 0))
            metadata = data.get("metadata", {})
            content_data = data.get("content", {})
            published_dt = self.parse_datetime(content_data.get("published"))
            item = self._require_item(content_data.get("withRegardTo", ""))
            shelf_type = content_data.get("status", ShelfType.WISHLIST)
            mark = Mark(owner, item)
            if mark.created_time and published_dt and mark.created_time >= published_dt:
                return "skipped"
            mark.update(
                shelf_type=shelf_type,
                visibility=visibility,
                metadata=metadata,
                created_time=published_dt,
            )
            return "imported"
        except Exception as e:
            logger.error(f"Error importing shelf member: {e}")
            return "failed"

    def import_shelf_log(self, data: Dict[str, Any]) -> BaseImporter.ImportResult:
        """Import a shelf log entry from one journal entry.

        The item is looked up by the top-level ``item`` key; ``status``
        defaults to ``ShelfType.WISHLIST`` and ``timestamp`` is parsed only
        when present.

        Returns:
            ``"imported"`` on success (whether or not the row already
            existed), ``"failed"`` if any exception is raised.
        """
        try:
            item = self._require_item(data.get("item", ""))
            owner = self.user.identity
            shelf_type = data.get("status", ShelfType.WISHLIST)
            # posts = data.get("posts", []) # TODO but will be tricky
            timestamp = data.get("timestamp")
            timestamp_dt = self.parse_datetime(timestamp) if timestamp else None
            ShelfLogEntry.objects.update_or_create(
                owner=owner,
                item=item,
                shelf_type=shelf_type,
                timestamp=timestamp_dt,
            )
            # An entry that already existed is still reported as imported:
            # counting it as skipped may confuse the user.
            return "imported"
        except Exception as e:
            logger.error(f"Error importing shelf log: {e}")
            return "failed"

    def import_post(self, data: Dict[str, Any]) -> BaseImporter.ImportResult:
        """Handle a post entry; not implemented yet, always returns ``"skipped"``."""
        # TODO
        return "skipped"

    def import_review(self, data: Dict[str, Any]) -> BaseImporter.ImportResult:
        """Import a review from one journal entry.

        The item is looked up by ``content["withRegardTo"]``.  The entry is
        skipped when a review with the same owner, item and title exists and
        its ``created_time`` is not older than the entry's ``published`` time.

        Returns:
            ``"imported"``, ``"skipped"``, or ``"failed"`` if any exception
            (including an unresolved item) is raised.
        """
        try:
            owner = self.user.identity
            visibility = data.get("visibility", self.metadata.get("visibility", 0))
            metadata = data.get("metadata", {})
            content_data = data.get("content", {})
            published_dt = self.parse_datetime(content_data.get("published"))
            item = self._require_item(content_data.get("withRegardTo", ""))
            name = content_data.get("name", "")
            content = content_data.get("content", "")
            existing_review = Review.objects.filter(
                owner=owner, item=item, title=name
            ).first()
            if (
                existing_review
                and existing_review.created_time
                and published_dt
                and existing_review.created_time >= published_dt
            ):
                return "skipped"
            Review.objects.create(
                owner=owner,
                item=item,
                title=name,
                body=content,
                created_time=published_dt,
                visibility=visibility,
                metadata=metadata,
            )
            return "imported"
        except Exception as e:
            logger.error(f"Error importing review: {e}")
            return "failed"

    def import_note(self, data: Dict[str, Any]) -> BaseImporter.ImportResult:
        """Import a note from one journal entry.

        The item is looked up by ``content["withRegardTo"]``.  ``progress`` is
        read as a mapping with ``type`` and ``value``; a missing or null
        ``progress`` yields empty strings for both.

        Returns:
            ``"imported"`` on success, ``"failed"`` if any exception
            (including an unresolved item) is raised.
        """
        try:
            owner = self.user.identity
            visibility = data.get("visibility", self.metadata.get("visibility", 0))
            content_data = data.get("content", {})
            published_dt = self.parse_datetime(content_data.get("published"))
            item = self._require_item(content_data.get("withRegardTo", ""))
            # ``or {}`` also covers an explicit ``"progress": null``.
            progress = content_data.get("progress") or {}
            Note.objects.create(
                item=item,
                owner=owner,
                title=content_data.get("title", ""),
                content=content_data.get("content", ""),
                sensitive=content_data.get("sensitive", False),
                progress_type=progress.get("type", ""),
                progress_value=progress.get("value", ""),
                created_time=published_dt,
                visibility=visibility,
                metadata=data.get("metadata", {}),
            )
            return "imported"
        except Exception as e:
            logger.error(f"Error importing note: {e}")
            return "failed"

    def import_comment(self, data: Dict[str, Any]) -> BaseImporter.ImportResult:
        """Import a comment from one journal entry.

        The item is looked up by ``content["withRegardTo"]``.  The entry is
        skipped when the owner already has a comment on the item whose
        ``created_time`` is not older than the entry's ``published`` time.

        Returns:
            ``"imported"``, ``"skipped"``, or ``"failed"`` if any exception
            (including an unresolved item) is raised.
        """
        try:
            owner = self.user.identity
            visibility = data.get("visibility", self.metadata.get("visibility", 0))
            metadata = data.get("metadata", {})
            content_data = data.get("content", {})
            published_dt = self.parse_datetime(content_data.get("published"))
            item = self._require_item(content_data.get("withRegardTo", ""))
            content = content_data.get("content", "")
            existing_comment = Comment.objects.filter(owner=owner, item=item).first()
            if (
                existing_comment
                and existing_comment.created_time
                and published_dt
                and existing_comment.created_time >= published_dt
            ):
                return "skipped"
            # NOTE(review): create() is still called when an older comment
            # exists; confirm the model permits that or an update is intended.
            Comment.objects.create(
                owner=owner,
                item=item,
                text=content,
                created_time=published_dt,
                visibility=visibility,
                metadata=metadata,
            )
            return "imported"
        except Exception as e:
            logger.error(f"Error importing comment: {e}")
            return "failed"

    def import_rating(self, data: Dict[str, Any]) -> BaseImporter.ImportResult:
        """Import a rating from one journal entry.

        The item is looked up by ``content["withRegardTo"]`` and the grade is
        ``content["value"]`` truncated to an integer.  The entry is skipped
        when the owner already has a rating on the item whose ``created_time``
        is not older than the entry's ``published`` time.

        Returns:
            ``"imported"``, ``"skipped"``, or ``"failed"`` if any exception
            (including an unresolved item) is raised.
        """
        try:
            owner = self.user.identity
            visibility = data.get("visibility", self.metadata.get("visibility", 0))
            metadata = data.get("metadata", {})
            content_data = data.get("content", {})
            published_dt = self.parse_datetime(content_data.get("published"))
            item = self._require_item(content_data.get("withRegardTo", ""))
            rating_grade = int(float(content_data.get("value", 0)))
            # Compare against existing ratings, not comments.
            existing_rating = Rating.objects.filter(owner=owner, item=item).first()
            if (
                existing_rating
                and existing_rating.created_time
                and published_dt
                and existing_rating.created_time >= published_dt
            ):
                return "skipped"
            # NOTE(review): create() is still called when an older rating
            # exists; confirm the model permits that or an update is intended.
            Rating.objects.create(
                owner=owner,
                item=item,
                grade=rating_grade,
                created_time=published_dt,
                visibility=visibility,
                metadata=metadata,
            )
            return "imported"
        except Exception as e:
            logger.error(f"Error importing rating: {e}")
            return "failed"

    def import_tag(self, data: Dict[str, Any]) -> BaseImporter.ImportResult:
        """Import a tag definition from one journal entry.

        The tag is matched on owner and cleaned-up ``name``; ``visibility``
        and ``pinned`` are written whether the tag is new or already exists.

        Returns:
            ``"imported"`` if the tag was created, ``"skipped"`` if it already
            existed, ``"failed"`` if any exception is raised.
        """
        try:
            owner = self.user.identity
            visibility = data.get("visibility", self.metadata.get("visibility", 0))
            pinned = data.get("pinned", self.metadata.get("pinned", False))
            tag_title = Tag.cleanup_title(data.get("name", ""))
            _tag, created = Tag.objects.update_or_create(
                owner=owner,
                title=tag_title,
                defaults={
                    "visibility": visibility,
                    "pinned": pinned,
                },
            )
            return "imported" if created else "skipped"
        except Exception as e:
            logger.error(f"Error importing tag: {e}")
            return "failed"

    def import_tag_member(self, data: Dict[str, Any]) -> BaseImporter.ImportResult:
        """Import a tag member (one item tagged with one tag) from a journal entry.

        The item is looked up by ``content["withRegardTo"]``.  The tag named by
        ``content["tag"]`` is fetched or created for the owner, then the
        membership row is created or updated.

        Returns:
            ``"imported"`` if the membership was created, ``"skipped"`` if it
            already existed, ``"failed"`` if any exception (including an
            unresolved item) is raised.
        """
        try:
            owner = self.user.identity
            visibility = data.get("visibility", self.metadata.get("visibility", 0))
            metadata = data.get("metadata", {})
            content_data = data.get("content", {})
            published_dt = self.parse_datetime(content_data.get("published"))
            item = self._require_item(content_data.get("withRegardTo", ""))
            tag_title = Tag.cleanup_title(content_data.get("tag", ""))
            tag, _tag_created = Tag.objects.get_or_create(
                owner=owner,
                title=tag_title,
                defaults={
                    "created_time": published_dt,
                    "visibility": visibility,
                    "pinned": False,
                    "metadata": metadata,
                },
            )
            _member, created = TagMember.objects.update_or_create(
                owner=owner,
                item=item,
                parent=tag,
                defaults={
                    "created_time": published_dt,
                    "visibility": visibility,
                    "metadata": metadata,
                    "position": 0,
                },
            )
            return "imported" if created else "skipped"
        except Exception as e:
            logger.error(f"Error importing tag member: {e}")
            return "failed"

    def process_journal(self, file_path: str) -> None:
        """Read a journal NDJSON file and import every recognised entry.

        The first line is treated as a header and skipped.  Remaining lines
        are grouped by their ``type`` and then dispatched type by type, in the
        order of ``import_funcs`` below.  Each result is passed to
        ``self.progress``.  Blank lines are ignored; lines that are not valid
        JSON objects are counted and reported once after reading.
        """
        logger.debug(f"Processing {file_path}")
        lines_error = 0
        import_funcs = {
            "Tag": self.import_tag,
            "TagMember": self.import_tag_member,
            "Rating": self.import_rating,
            "Comment": self.import_comment,
            "ShelfMember": self.import_shelf_member,
            "Review": self.import_review,
            "Note": self.import_note,
            "Collection": self.import_collection,
            "ShelfLog": self.import_shelf_log,
            "Post": self.import_post,
        }
        journal = {k: [] for k in import_funcs}
        with open(file_path, "r", encoding="utf-8") as jsonfile:
            # Skip header line
            next(jsonfile, None)
            for line in jsonfile:
                if not line.strip():
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    lines_error += 1
                    continue
                if not isinstance(data, dict):
                    # Valid JSON but not an object: nothing to dispatch on.
                    lines_error += 1
                    continue
                data_type = data.get("type")
                if not data_type:
                    continue
                # NOTE(review): entries of a type without an import function
                # are counted in "total" below but never dispatched.
                journal.setdefault(data_type, []).append(data)
        self.metadata["total"] = sum(len(items) for items in journal.values())
        logger.debug(f"Processing {self.metadata['total']} entries")
        if lines_error:
            logger.error(f"Error processing journal.ndjson: {lines_error} lines")
        for typ, func in import_funcs.items():
            for data in journal.get(typ, []):
                result = func(data)
                self.progress(result)
        logger.info(
            f"Imported {self.metadata['imported']}, "
            f"skipped {self.metadata['skipped']}, "
            f"failed {self.metadata['failed']}"
        )

    def parse_catalog(self, file_path: str) -> None:
        """Parse a catalog NDJSON file and fill ``self.items``.

        Each line with an ``id`` is resolved through
        ``get_item_by_info_and_links`` using that id plus the ``url`` of every
        entry in ``external_resources``; the result is stored under the id.
        A line that cannot be processed is logged and skipped without
        abandoning the rest of the catalog.  The number of stored entries is
        written to ``self.metadata["catalog_processed"]``.  This method logs
        instead of raising.
        """
        logger.debug(f"Parsing catalog file: {file_path}")
        item_count = 0
        try:
            with open(file_path, "r", encoding="utf-8") as jsonfile:
                for line in jsonfile:
                    if not line.strip():
                        continue
                    try:
                        entry = json.loads(line)
                        url = entry.get("id")
                        if not url:
                            # Lines without an id (e.g. a header) carry no item.
                            continue
                        resources = entry.get("external_resources") or []
                        links = [url] + [r["url"] for r in resources]
                        self.items[url] = self.get_item_by_info_and_links(
                            "", "", links
                        )
                    except Exception as e:
                        logger.error(f"Error processing catalog item: {e}")
                        continue
                    item_count += 1
            logger.info(f"Loaded {item_count} items from catalog")
            self.metadata["catalog_processed"] = item_count
        except Exception as e:
            logger.error(f"Error parsing catalog file: {e}")

    def parse_header(self, file_path: str) -> Dict[str, Any]:
        """Return the header object on the first line of an NDJSON file.

        The first line counts as a header only when it is a JSON object with
        a truthy ``server`` value.

        Returns:
            The parsed header, or ``{}`` when the file cannot be read, the
            first line is empty or not valid JSON, or it is not such an object.
        """
        try:
            with open(file_path, "r", encoding="utf-8") as jsonfile:
                first_line = jsonfile.readline().strip()
            if first_line:
                header = json.loads(first_line)
                if isinstance(header, dict) and header.get("server"):
                    return header
        except (ValueError, OSError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            logger.error(f"Error parsing NDJSON header: {e}")
        return {}

    def run(self) -> None:
        """Run the NDJSON import.

        Extracts the archive into a temporary directory, loads
        ``catalog.ndjson`` when present, then processes ``journal.ndjson``.
        If the journal is missing, a failure message is saved and the method
        returns.  Otherwise a summary is stored in ``self.message`` and saved.
        """
        filename = self.metadata["file"]
        logger.debug(f"Importing {filename}")
        with zipfile.ZipFile(filename, "r") as zipref:
            with tempfile.TemporaryDirectory() as tmpdirname:
                # NOTE(review): every member of the archive is unpacked,
                # although only the two files below are read.
                zipref.extractall(tmpdirname)
                catalog_path = os.path.join(tmpdirname, "catalog.ndjson")
                if os.path.exists(catalog_path):
                    catalog_header = self.parse_header(catalog_path)
                    logger.debug(f"Loading catalog.ndjson with {catalog_header}")
                    self.parse_catalog(catalog_path)
                else:
                    logger.warning("catalog.ndjson file not found in the archive")
                journal_path = os.path.join(tmpdirname, "journal.ndjson")
                if not os.path.exists(journal_path):
                    logger.error("journal.ndjson file not found in the archive")
                    self.message = "Import failed: journal.ndjson file not found"
                    self.save()
                    return
                header = self.parse_header(journal_path)
                self.metadata["journal_header"] = header
                logger.debug(f"Importing journal.ndjson with {header}")
                self.process_journal(journal_path)
        source_info = self.metadata.get("journal_header", {})
        username = source_info.get("username", "unknown")
        server = source_info.get("server", "unknown")
        version = source_info.get("neodb_version", "unknown")
        source_summary = f" from {username}@{server} ver:{version}."
        self.message = _("Import complete") + source_summary
        metadata_stats = self.metadata.get("metadata_stats", {})
        partial_updates = metadata_stats.get("partial_updates", 0)
        if partial_updates > 0:
            self.message += f", {partial_updates} items with partial metadata updates"
        ratings = metadata_stats.get("ratings_updated", 0)
        comments = metadata_stats.get("comments_updated", 0)
        tags = metadata_stats.get("tags_updated", 0)
        if ratings > 0 or comments > 0 or tags > 0:
            self.message += (
                f" ({ratings} ratings, {comments} comments, {tags} tag sets)"
            )
        if self.metadata.get("failed_items", []):
            self.message += (
                f": {self.metadata['failed']} items failed "
                f"({len(self.metadata['failed_items'])} unique items)"
            )
        self.save()