# lib.itmens/journal/exporters/ndjson.py

import json
import os
import re
import shutil
import tempfile

from django.conf import settings
from django.utils import timezone
from loguru import logger

from catalog.common import ProxiedImageDownloader
from common.utils import GenerateDateUUIDMediaFilePath
from journal.models import (
Collection,
Content,
Note,
Review,
ShelfLogEntry,
ShelfMember,
Tag,
TagMember,
)
from takahe.models import Post
from users.models import Task


class NdjsonExporter(Task):
class Meta:
        app_label = "journal"  # workaround bug in TypedModel

    TaskQueue = "export"
DefaultMetadata = {
"file": None,
"total": 0,
}
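
    # NB: class-level mutable list shared by instances of this task class; it
    # accumulates every item referenced by the export for the catalog dump.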
    ref_items = []

    @property
def filename(self) -> str:
d = self.created_time.strftime("%Y%m%d%H%M%S")
return f"neodb_{self.user.username}_{d}_ndjson"

    def ref(self, item) -> str:
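        # Register the item for the catalog dump and return its absolute URL,
        # which journal records use as a cross-file reference.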
if item not in self.ref_items:
self.ref_items.append(item)
return item.absolute_url

    def get_header(self):
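        # Export metadata written as the first line of every .ndjson file.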
return {
"server": settings.SITE_DOMAIN,
"neodb_version": settings.NEODB_VERSION,
"username": self.user.username,
"actor": self.user.identity.actor_uri,
"request_time": self.created_time.isoformat(),
"created_time": timezone.now().isoformat(),
}

    def run(self):
user = self.user
temp_dir = tempfile.mkdtemp()
temp_folder_path = os.path.join(temp_dir, self.filename)
os.makedirs(temp_folder_path)
attachment_path = os.path.join(temp_folder_path, "attachments")
os.makedirs(attachment_path, exist_ok=True)
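
        # Everything is staged under temp_folder_path: journal.ndjson,
        # catalog.ndjson and an attachments/ folder, zipped up at the end.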

        def _save_image(url):
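            # Download a remote image, or copy a local media file, into the
            # attachments folder; returns a local path, or the URL unchanged
            # if it is neither http(s) nor a local media path.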
if url.startswith("http"):
imgdl = ProxiedImageDownloader(url)
raw_img = imgdl.download().content
                ext = imgdl.extention  # (sic) spelling from the downloader API
                file = GenerateDateUUIDMediaFilePath(f"x.{ext}", attachment_path)
                os.makedirs(os.path.dirname(file), exist_ok=True)  # dated subdir may not exist yet
                with open(file, "wb") as binary_file:
binary_file.write(raw_img)
return file
            elif url.startswith("/"):
                p = os.path.abspath(
                    os.path.join(settings.MEDIA_ROOT, url[len(settings.MEDIA_URL) :])
                )
                if p.startswith(settings.MEDIA_ROOT):
                    try:
                        shutil.copy2(p, attachment_path)
                    except Exception as e:
                        logger.error(
                            f"error copying {p} to {attachment_path}",
                            extra={"exception": e},
                        )
                return p
            return url

        filename = os.path.join(temp_folder_path, "journal.ndjson")
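        # journal.ndjson: a header object on line one, then one JSON object
        # per journal record.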
total = 0
with open(filename, "w") as f:
f.write(json.dumps(self.get_header()) + "\n")
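
            # One line per journal piece the user owns: every Content subclass
            # (e.g. Review, Note), exported via its ActivityPub representation.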
for cls in list(Content.__subclasses__()):
pieces = cls.objects.filter(owner=user.identity)
for p in pieces:
total += 1
self.ref(p.item)
o = {
"type": p.__class__.__name__,
"content": p.ap_object,
"visibility": p.visibility,
"metadata": p.metadata,
}
f.write(json.dumps(o, default=str) + "\n")
                    if cls == Review:
                        # re.sub is used only for its side effect: download
                        # each markdown image URL (the `![](...)` form) into
                        # attachments/. The substituted string is discarded.
                        re.sub(
                            r"(?<=!\[\]\()([^)]+)(?=\))",
                            lambda x: _save_image(x[1]),
                            p.body,  # type: ignore
                        )
elif cls == Note and p.latest_post:
for a in p.latest_post.attachments.all():
dest = os.path.join(
attachment_path, os.path.basename(a.file.name)
)
try:
shutil.copy2(a.file.path, dest)
except Exception as e:
logger.error(
f"error copying {a.file.path} to {dest}",
extra={"exception": e},
)

            collections = Collection.objects.filter(owner=user.identity)
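            # Collections embed their ordered members as item references
            # (absolute URLs); the items themselves go into catalog.ndjson.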
for c in collections:
total += 1
o = {
"type": "Collection",
"content": c.ap_object,
"visibility": c.visibility,
"metadata": c.metadata,
"items": [
{"item": self.ref(m.item), "metadata": m.metadata}
for m in c.ordered_members
],
}
f.write(json.dumps(o, default=str) + "\n")

            tags = Tag.objects.filter(owner=user.identity)
for t in tags:
total += 1
o = {
"type": "Tag",
"name": t.title,
"visibility": t.visibility,
"pinned": t.pinned,
}
f.write(json.dumps(o, default=str) + "\n")

            tag_members = TagMember.objects.filter(owner=user.identity)
            for t in tag_members:
total += 1
o = {
"type": "TagMember",
"content": t.ap_object,
"visibility": t.visibility,
"metadata": t.metadata,
}
f.write(json.dumps(o, default=str) + "\n")

            marks = ShelfMember.objects.filter(owner=user.identity)
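            # ShelfMember rows are the user's marks (shelf status per item).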
for m in marks:
total += 1
                o = {
                    "type": "ShelfMember",
                    "content": m.ap_object,
                    "visibility": m.visibility,
                    "metadata": m.metadata,
                }
                f.write(json.dumps(o, default=str) + "\n")

            logs = ShelfLogEntry.objects.filter(owner=user.identity)
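            # Shelf history: when each item moved to which shelf status.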
for log in logs:
total += 1
                o = {
                    "type": "ShelfLog",
                    "item": self.ref(log.item),
                    "status": log.shelf_type,
                    "posts": list(log.all_post_ids()),
                    "timestamp": log.timestamp,
                }
                f.write(json.dumps(o, default=str) + "\n")

            # Plain fediverse posts: posts generated from journal pieces carry
            # an "object" key in type_data and are skipped (exported above).
            posts = Post.objects.filter(author_id=user.identity.pk).exclude(
                type_data__has_key="object"
            )
            for p in posts:
total += 1
o = {"type": "post", "post": p.to_mastodon_json()}
                for a in p.attachments.all():
                    dest = os.path.join(attachment_path, os.path.basename(a.file.name))
                    try:
                        shutil.copy2(a.file.path, dest)
                    except Exception as e:
                        logger.error(
                            f"error copying {a.file.path} to {dest}",
                            extra={"exception": e},
                        )
                f.write(json.dumps(o, default=str) + "\n")

        # catalog.ndjson: header line, then the ap_object of every item
        # referenced by the journal records above.
        filename = os.path.join(temp_folder_path, "catalog.ndjson")
with open(filename, "w") as f:
f.write(json.dumps(self.get_header()) + "\n")
for item in self.ref_items:
f.write(json.dumps(item.ap_object, default=str) + "\n")

        filename = GenerateDateUUIDMediaFilePath(
"f.zip", settings.MEDIA_ROOT + "/" + settings.EXPORT_FILE_PATH_ROOT
)
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        shutil.make_archive(filename[:-4], "zip", temp_folder_path)
        shutil.rmtree(temp_dir, ignore_errors=True)  # clean up the scratch copy

        self.metadata["file"] = filename
self.metadata["total"] = total
self.message = f"{total} records exported."
self.save()