# lib.itmens/journal/exporters/ndjson.py
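"""
Export a user's journal (content, collections, tags, shelf marks and logs,
posts), the catalog items they reference, and identity data as NDJSON files
bundled into a zip archive.
"""
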
import json
import os
import re
import shutil
import tempfile
import uuid

from django.conf import settings
from django.utils import timezone
from loguru import logger

from catalog.common import ProxiedImageDownloader
from common.utils import GenerateDateUUIDMediaFilePath
from journal.models import (
    Collection,
    Content,
    Note,
    Review,
    ShelfLogEntry,
    ShelfMember,
    Tag,
    TagMember,
)
from takahe.models import Post
from users.models import Task


class NdjsonExporter(Task):
    class Meta:
        app_label = "journal"  # workaround bug in TypedModel

    TaskQueue = "export"
    DefaultMetadata = {
        "file": None,
        "total": 0,
    }
    ref_items = []

    @property
    def filename(self) -> str:
        d = self.created_time.strftime("%Y%m%d%H%M%S")
        return f"neodb_{self.user.username}_{d}_ndjson"
    def ref(self, item) -> str:
        if item not in self.ref_items:
            self.ref_items.append(item)
        return item.absolute_url

    def get_header(self):
        return {
            "server": settings.SITE_DOMAIN,
            "neodb_version": settings.NEODB_VERSION,
            "username": self.user.username,
            "actor": self.user.identity.actor_uri,
            "request_time": self.created_time.isoformat(),
            "created_time": timezone.now().isoformat(),
        }

    def run(self):
        user = self.user
        self.ref_items = []  # reset per run; the class attribute is shared between instances
        temp_dir = tempfile.mkdtemp()
        temp_folder_path = os.path.join(temp_dir, self.filename)
        os.makedirs(temp_folder_path)
        attachment_path = os.path.join(temp_folder_path, "attachments")
        os.makedirs(attachment_path, exist_ok=True)
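
        # save one inline image into attachments/: remote http(s) urls are
        # downloaded, local MEDIA_ROOT paths are copied; called below purely
        # for its side effect while scanning review bodies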
        def _save_image(url):
            if url.startswith("http"):
                try:
                    raw_img, ext = ProxiedImageDownloader.download_image(url, "")
                    if raw_img:
                        file = "%s/%s.%s" % (attachment_path, uuid.uuid4(), ext)
                        with open(file, "wb") as binary_file:
                            binary_file.write(raw_img)
                        return file
                except Exception:
                    logger.debug(f"error downloading {url}")
            elif url.startswith("/"):
                p = os.path.abspath(
                    os.path.join(settings.MEDIA_ROOT, url[len(settings.MEDIA_URL) :])
                )
                if p.startswith(settings.MEDIA_ROOT):
                    try:
                        shutil.copy2(p, attachment_path)
                    except Exception:
                        logger.error(f"error copying {p} to {attachment_path}")
                return p
            return url

        filename = os.path.join(temp_folder_path, "journal.ndjson")
        total = 0
        with open(filename, "w") as f:
            f.write(json.dumps(self.get_header()) + "\n")
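            # one JSON object per line for each piece of content the user owns,
            # iterating over every direct subclass of Content (Review, Note, etc.)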
            for cls in list(Content.__subclasses__()):
                pieces = cls.objects.filter(owner=user.identity)
                for p in pieces:
                    total += 1
                    self.ref(p.item)
                    o = {
                        "type": p.__class__.__name__,
                        "content": p.ap_object,
                        "visibility": p.visibility,
                        "metadata": p.metadata,
                    }
                    f.write(json.dumps(o, default=str) + "\n")
                    if cls == Review:
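                        # the substitution result is discarded; re.sub is used
                        # only for its side effect of saving each inline
                        # markdown image in the review body via _save_image()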
                        re.sub(
                            r"(?<=!\[\]\()([^)]+)(?=\))",
                            lambda x: _save_image(x[1]),
                            p.body,  # type: ignore
                        )
                    elif cls == Note and p.latest_post:
                        for a in p.latest_post.attachments.all():
                            dest = os.path.join(
                                attachment_path, os.path.basename(a.file.name)
                            )
                            try:
                                shutil.copy2(a.file.path, dest)
                            except Exception as e:
                                logger.error(
                                    f"error copying {a.file.path} to {dest}",
                                    extra={"exception": e},
                                )
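
            # collections carry an ordered list of member items with metadata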
            collections = Collection.objects.filter(owner=user.identity)
            for c in collections:
                total += 1
                o = {
                    "type": "Collection",
                    "content": c.ap_object,
                    "visibility": c.visibility,
                    "metadata": c.metadata,
                    "items": [
                        {"item": self.ref(m.item), "metadata": m.metadata}
                        for m in c.ordered_members
                    ],
                }
                f.write(json.dumps(o, default=str) + "\n")
            tags = Tag.objects.filter(owner=user.identity)
            for t in tags:
                total += 1
                o = {
                    "type": "Tag",
                    "name": t.title,
                    "visibility": t.visibility,
                    "pinned": t.pinned,
                }
                f.write(json.dumps(o, default=str) + "\n")
            tag_members = TagMember.objects.filter(owner=user.identity)
            for t in tag_members:
                total += 1
                o = {
                    "type": "TagMember",
                    "content": t.ap_object,
                    "visibility": t.visibility,
                    "metadata": t.metadata,
                }
                f.write(json.dumps(o, default=str) + "\n")
            marks = ShelfMember.objects.filter(owner=user.identity)
            for m in marks:
                total += 1
                o = {
                    "type": "ShelfMember",
                    "content": m.ap_object,
                    "visibility": m.visibility,
                    "metadata": m.metadata,
                }
                f.write(json.dumps(o, default=str) + "\n")
            logs = ShelfLogEntry.objects.filter(owner=user.identity)
            for log in logs:
                total += 1
                o = {
                    "type": "ShelfLog",
                    "item": self.ref(log.item),
                    "status": log.shelf_type,
                    "posts": list(log.all_post_ids()),
                    "timestamp": log.timestamp,
                }
                f.write(json.dumps(o, default=str) + "\n")
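
            # standalone fediverse posts only; a post whose type_data has an
            # "object" key wraps a journal piece already exported above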
            posts = Post.objects.filter(author_id=user.identity.pk).exclude(
                type_data__has_key="object"
            )
            for p in posts:
                total += 1
                o = {"type": "post", "post": p.to_mastodon_json()}
                for a in p.attachments.all():
                    dest = os.path.join(attachment_path, os.path.basename(a.file.name))
                    try:
                        shutil.copy2(a.file.path, dest)
                    except Exception as e:
                        logger.error(
                            f"error copying {a.file.path} to {dest}",
                            extra={"exception": e},
                        )
                f.write(json.dumps(o, default=str) + "\n")
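
        # catalog.ndjson: the ActivityPub object of every item referenced above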
        filename = os.path.join(temp_folder_path, "catalog.ndjson")
        with open(filename, "w") as f:
            f.write(json.dumps(self.get_header()) + "\n")
            for item in self.ref_items:
                f.write(json.dumps(item.ap_object, default=str) + "\n")

        # Export actor.ndjson with Takahe identity data
        filename = os.path.join(temp_folder_path, "actor.ndjson")
        with open(filename, "w") as f:
            f.write(json.dumps(self.get_header()) + "\n")
            takahe_identity = self.user.identity.takahe_identity
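            # note: this includes the actor's private key; treat the resulting
            # archive as sensitive data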
            identity_data = {
                "type": "Identity",
                "username": takahe_identity.username,
                "domain": takahe_identity.domain_id,
                "actor_uri": takahe_identity.actor_uri,
                "name": takahe_identity.name,
                "summary": takahe_identity.summary,
                "metadata": takahe_identity.metadata,
                "private_key": takahe_identity.private_key,
                "public_key": takahe_identity.public_key,
                "public_key_id": takahe_identity.public_key_id,
            }
            f.write(json.dumps(identity_data, default=str) + "\n")
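
        # pack the staging folder into a zip under EXPORT_FILE_PATH_ROOT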
        filename = GenerateDateUUIDMediaFilePath(
            "f.zip", settings.MEDIA_ROOT + "/" + settings.EXPORT_FILE_PATH_ROOT
        )
        if not os.path.exists(os.path.dirname(filename)):
            os.makedirs(os.path.dirname(filename))
        # make_archive appends ".zip" itself, so strip the extension first
        shutil.make_archive(filename[:-4], "zip", temp_folder_path)
        shutil.rmtree(temp_dir, ignore_errors=True)  # clean up the staging folder
        self.metadata["file"] = filename
        self.metadata["total"] = total
        self.message = f"{total} records exported."
        self.save()
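

# Illustrative journal.ndjson layout (one JSON object per line; the first line
# is the header from get_header(), the remaining values depend on the data):
#   {"server": ..., "username": ..., "request_time": ..., ...}
#   {"type": "ShelfMember", "content": {...}, "visibility": ..., "metadata": {...}}
#   {"type": "Tag", "name": ..., "visibility": ..., "pinned": ...}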