# journal/exporters/ndjson.py
import json
import os
import re
import shutil
import tempfile

from django.conf import settings
from django.utils import timezone

from catalog.common.downloaders import ProxiedImageDownloader
from common.utils import GenerateDateUUIDMediaFilePath
from journal.models import ShelfMember
from journal.models.collection import Collection
from journal.models.common import Content
from journal.models.note import Note
from journal.models.review import Review
from journal.models.shelf import ShelfLogEntry
from takahe.models import Post
from users.models import Task
class NdjsonExporter(Task):
    """Background task exporting a user's journal as a zip of NDJSON files.

    The produced archive contains:
      - ``journal.ndjson``: a header line followed by one JSON object per
        content piece (reviews, notes, ...), collection, shelf mark, shelf
        log entry and standalone post owned by the user;
      - ``catalog.ndjson``: a header line followed by one JSON object per
        catalog item referenced by the exported records;
      - ``attachments/``: copies of images referenced by reviews, notes
        and posts.

    On success ``metadata["file"]`` holds the path of the generated zip and
    ``metadata["total"]`` the number of exported records.
    """

    class Meta:
        app_label = "journal"  # workaround bug in TypedModel

    TaskQueue = "export"
    DefaultMetadata = {
        "file": None,
        "total": 0,
    }

    # Items referenced by exported records, dumped into catalog.ndjson.
    # NOTE: class-level default only; run() resets it per instance so the
    # mutable list is never shared between tasks.
    ref_items = []

    @property
    def filename(self) -> str:
        """Base name (without extension) of the export, unique per user and request time."""
        d = self.created_time.strftime("%Y%m%d%H%M%S")
        return f"neodb_{self.user.username}_{d}_ndjson"

    def ref(self, item) -> str:
        """Remember *item* for the catalog dump and return its absolute URL."""
        if item not in self.ref_items:
            self.ref_items.append(item)
        return item.absolute_url

    def get_header(self):
        """Return the metadata object written as the first line of each NDJSON file."""
        return {
            "server": settings.SITE_DOMAIN,
            "neodb_version": settings.NEODB_VERSION,
            "username": self.user.username,
            "actor": self.user.identity.actor_uri,
            "request_time": self.created_time.isoformat(),
            "created_time": timezone.now().isoformat(),
        }

    def run(self):
        """Collect all exportable data into a temp folder, zip it, record the result."""
        user = self.user
        # Reset per run: ref_items is declared at class level, so without this
        # the mutable list would be shared across instances and runs.
        self.ref_items = []

        temp_dir = tempfile.mkdtemp()
        try:
            temp_folder_path = os.path.join(temp_dir, self.filename)
            os.makedirs(temp_folder_path)
            attachment_path = os.path.join(temp_folder_path, "attachments")
            os.makedirs(attachment_path, exist_ok=True)

            def _save_image(url):
                """Copy a remote or MEDIA_ROOT-local image into attachments/."""
                if url.startswith("http"):
                    imgdl = ProxiedImageDownloader(url)
                    raw_img = imgdl.download().content
                    ext = imgdl.extention
                    file = GenerateDateUUIDMediaFilePath(f"x.{ext}", attachment_path)
                    # the generated path may include date-based subfolders
                    os.makedirs(os.path.dirname(file), exist_ok=True)
                    with open(file, "wb") as binary_file:
                        binary_file.write(raw_img)
                    return file
                elif url.startswith("/"):
                    p = os.path.abspath(
                        os.path.join(settings.MEDIA_ROOT, url[len(settings.MEDIA_URL) :])
                    )
                    # only copy files that truly live under MEDIA_ROOT
                    # (guards against path traversal via "..")
                    if p.startswith(settings.MEDIA_ROOT):
                        shutil.copy2(p, attachment_path)
                    return p
                return url

            filename = os.path.join(temp_folder_path, "journal.ndjson")
            total = 0
            with open(filename, "w") as f:
                f.write(json.dumps(self.get_header()) + "\n")

                # every Content subclass (reviews, notes, ratings, ...)
                for cls in list(Content.__subclasses__()):
                    pieces = cls.objects.filter(owner=user.identity)
                    for p in pieces:
                        total += 1
                        self.ref(p.item)
                        o = {
                            "type": p.__class__.__name__,
                            "content": p.ap_object,
                            "visibility": p.visibility,
                            "metadata": p.metadata,
                        }
                        f.write(json.dumps(o, default=str) + "\n")
                        if cls == Review:
                            # re.sub is used only for its side effect: copy
                            # every markdown-embedded image to attachments/
                            re.sub(
                                r"(?<=!\[\]\()([^)]+)(?=\))",
                                lambda x: _save_image(x[1]),
                                p.body,  # type: ignore
                            )
                        elif cls == Note and p.latest_post:
                            for a in p.latest_post.attachments.all():
                                dest = os.path.join(
                                    attachment_path, os.path.basename(a.file.name)
                                )
                                shutil.copy2(a.file.path, dest)

                collections = Collection.objects.filter(owner=user.identity)
                for c in collections:
                    total += 1
                    o = {
                        "type": "Collection",
                        "content": c.ap_object,
                        "visibility": c.visibility,
                        "metadata": c.metadata,
                        "items": [
                            {"item": self.ref(m.item), "metadata": m.metadata}
                            for m in c.ordered_members
                        ],
                    }
                    f.write(json.dumps(o, default=str) + "\n")

                marks = ShelfMember.objects.filter(owner=user.identity)
                for m in marks:
                    total += 1
                    o = {
                        "type": "ShelfMember",
                        "item": self.ref(m.item),
                        "status": m.shelf_type,
                        "visibility": m.visibility,
                        "metadata": m.metadata,
                        # NOTE(review): uses the export request time, not the
                        # mark's own creation time — confirm this is intended
                        "published": self.created_time.isoformat(),
                    }
                    f.write(json.dumps(o, default=str) + "\n")

                logs = ShelfLogEntry.objects.filter(owner=user.identity)
                for log in logs:
                    total += 1
                    o = {
                        "type": "ShelfLog",
                        "item": self.ref(log.item),
                        "posts": list(log.all_post_ids()),
                        "timestamp": log.created_time,
                    }
                    f.write(json.dumps(o, default=str) + "\n")

                # standalone posts only; posts wrapping a journal object
                # (type_data has an "object" key) are already covered above
                posts = Post.objects.filter(author_id=user.identity.pk).exclude(
                    type_data__has_key="object"
                )
                for p in posts:
                    total += 1
                    o = {"type": "post", "post": p.to_mastodon_json()}
                    for a in p.attachments.all():
                        dest = os.path.join(attachment_path, os.path.basename(a.file.name))
                        shutil.copy2(a.file.path, dest)
                    f.write(json.dumps(o, default=str) + "\n")

            # second file: every catalog item referenced by the records above
            filename = os.path.join(temp_folder_path, "catalog.ndjson")
            with open(filename, "w") as f:
                f.write(json.dumps(self.get_header()) + "\n")
                for item in self.ref_items:
                    f.write(json.dumps(item.ap_object, default=str) + "\n")

            filename = GenerateDateUUIDMediaFilePath(
                "f.zip", settings.MEDIA_ROOT + "/" + settings.EXPORT_FILE_PATH_ROOT
            )
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            # make_archive appends ".zip" itself, hence stripping the suffix
            shutil.make_archive(filename[:-4], "zip", temp_folder_path)
        finally:
            # always remove the scratch directory (was leaked before)
            shutil.rmtree(temp_dir, ignore_errors=True)

        self.metadata["file"] = filename
        self.metadata["total"] = total
        self.message = "Export complete."
        self.save()