tweak importer to prefer fedi item link

This commit is contained in:
mein Name 2025-03-03 22:30:08 -05:00 committed by Henri Dickson
parent 1d7816d9d7
commit 584cc9516a
10 changed files with 244 additions and 155 deletions

View file

@ -1,13 +1,34 @@
import os
import zipfile
from .csv import CsvImporter
from .douban import DoubanImporter
from .goodreads import GoodreadsImporter
from .letterboxd import LetterboxdImporter
from .opml import OPMLImporter
def get_neodb_importer(filename: str) -> type[CsvImporter] | None:
    """Pick the importer class able to handle a NeoDB export archive.

    Args:
        filename: Path to the uploaded file.

    Returns:
        ``CsvImporter`` when the archive contains at least one
        ``*_mark.csv``, ``*_review.csv`` or ``*_note.csv`` member;
        ``None`` when the path does not exist, is not a readable ZIP
        archive, contains ``journal.ndjson``, or has no recognised member.
    """
    if not os.path.exists(filename) or not zipfile.is_zipfile(filename):
        return None
    try:
        with zipfile.ZipFile(filename, "r") as z:
            files = z.namelist()
    except zipfile.BadZipFile:
        # is_zipfile() does not validate the whole central directory, so a
        # damaged archive can still fail here; report it as "no importer"
        # instead of letting the exception escape to the caller.
        return None
    # NOTE(review): archives with journal.ndjson are rejected here even if
    # they also hold CSV files - presumably NDJSON import is handled (or
    # yet to be handled) elsewhere; confirm.
    if "journal.ndjson" in files:
        return None
    if any(f.endswith(("_mark.csv", "_review.csv", "_note.csv")) for f in files):
        return CsvImporter
    return None
# Names exported by a star-import of this module: the importer classes
# imported above plus the get_neodb_importer() helper.
__all__ = [
"CsvImporter",
"LetterboxdImporter",
"OPMLImporter",
"DoubanImporter",
"GoodreadsImporter",
"get_neodb_importer",
]

View file

@ -12,10 +12,20 @@ from django.utils.translation import gettext as _
from loguru import logger
from catalog.common.sites import SiteManager
from catalog.models import Edition, IdType, Item, ItemCategory
from catalog.models import Edition, IdType, Item, ItemCategory, SiteName
from journal.models import Mark, Note, Review, ShelfType
from users.models import Task
# Priority order used when a row carries links to several external sites:
# the candidate sites are sorted by their position in this list (earlier
# entries are tried first) and sites not listed here sort after all of them.
_PREFERRED_SITES = [
SiteName.Fediverse,
SiteName.RSS,
SiteName.TMDB,
SiteName.IMDB,
SiteName.GoogleBooks,
SiteName.Goodreads,
SiteName.IGDB,
]
class CsvImporter(Task):
class Meta:
@ -49,18 +59,39 @@ class CsvImporter(Task):
site_url = settings.SITE_INFO["site_url"] + "/"
links = links_str.strip().split()
# look for local items first
for link in links:
if link.startswith("/") or link.startswith(site_url):
item = Item.get_by_url(link)
if item:
return item
for link in links:
site = SiteManager.get_site_by_url(link)
if site:
sites = [SiteManager.get_site_by_url(link) for link in links]
sites = [site for site in sites if site]
sites.sort(
key=lambda x: _PREFERRED_SITES.index(x.SITE_NAME)
if x.SITE_NAME in _PREFERRED_SITES
else 99
)
# look for external items that already matched
for site in sites:
logger.debug(f"matching {site.url}")
item = site.get_item()
if item:
return item
# fetch external item if possible
for site in sites:
try:
logger.debug(f"fetching {site.url}")
site.get_resource_ready()
item = site.get_item()
if item:
return item
except Exception as e:
logger.error(f"Error fetching item: {e}")
# Try using the info string
if info_str:
info_dict = {}
@ -304,8 +335,6 @@ class CsvImporter(Task):
return True
except Exception as e:
logger.error(f"Error importing note: {e}")
if "failed_items" not in self.metadata:
self.metadata["failed_items"] = []
self.metadata["failed_items"].append(
f"Error importing note for {row.get('title', '')}: {str(e)}"
)
@ -333,32 +362,11 @@ class CsvImporter(Task):
success = import_function(row)
self.progress(success)
@classmethod
def validate_file(cls, filename: str) -> bool:
"""Validate that the given file is a valid CSV export ZIP file.
Args:
filename: Path to the file to validate
Returns:
bool: True if the file is valid, False otherwise
"""
return os.path.exists(filename) and zipfile.is_zipfile(filename)
def run(self) -> None:
"""Run the CSV import."""
# Ensure failed_items is initialized
if "failed_items" not in self.metadata:
self.metadata["failed_items"] = []
filename = self.metadata["file"]
logger.debug(f"Importing {filename}")
# Validate the file before processing
if not self.validate_file(filename):
self.save()
return
with zipfile.ZipFile(filename, "r") as zipref:
with tempfile.TemporaryDirectory() as tmpdirname:
logger.debug(f"Extracting {filename} to {tmpdirname}")

View file

@ -0,0 +1,23 @@
# Generated by Django 4.2.18 on 2025-03-03 23:16
from django.db import migrations
# Registers ``CsvImporter`` as a proxy model based on ``users.task``: the
# CreateModel operation declares no fields, indexes or constraints and sets
# "proxy": True. Depends on users 0008_alter_task_type and journal
# 0005_csvexporter.
class Migration(migrations.Migration):
dependencies = [
("users", "0008_alter_task_type"),
("journal", "0005_csvexporter"),
]
operations = [
migrations.CreateModel(
name="CsvImporter",
fields=[],
options={
"proxy": True,
"indexes": [],
"constraints": [],
},
bases=("users.task",),
),
]

View file

@ -9,7 +9,7 @@ from loguru import logger
from catalog.models import Edition, IdType, Movie, TVEpisode, TVSeason, TVShow
from journal.exporters import CsvExporter
from journal.importers import CsvImporter
from journal.importers import CsvImporter, get_neodb_importer
from users.models import User
from ..models import *
@ -219,6 +219,7 @@ class CsvExportImportTest(TestCase):
f"Expected file {filename} with {expected_data_count} data rows, but file not found"
)
self.assertEqual(get_neodb_importer(export_path), CsvImporter)
importer = CsvImporter.create(user=self.user2, file=export_path, visibility=2)
importer.run()
self.assertEqual(importer.message, "Import complete")

View file

@ -28,6 +28,8 @@ attrs==25.1.0
# via aiohttp
babel==2.17.0
# via mkdocs-material
backrefs==5.8
# via mkdocs-material
beautifulsoup4==4.13.3
# via markdownify
bleach==5.0.1
@ -54,7 +56,7 @@ click==8.1.8
colorama==0.4.6
# via djlint
# via mkdocs-material
cryptography==44.0.1
cryptography==44.0.2
# via atproto
cssbeautifier==1.15.4
# via djlint
@ -169,7 +171,7 @@ mkdocs==1.6.1
# via mkdocs-material
mkdocs-get-deps==0.2.0
# via mkdocs
mkdocs-material==9.6.5
mkdocs-material==9.6.7
mkdocs-material-extensions==1.3.1
# via mkdocs-material
multidict==6.1.0
@ -213,7 +215,7 @@ pygments==2.19.1
# via mkdocs-material
pymdown-extensions==10.14.3
# via mkdocs-material
pyright==1.1.395
pyright==1.1.396
python-dateutil==2.9.0.post0
# via dateparser
# via django-auditlog
@ -241,7 +243,6 @@ redis==5.2.1
regex==2024.11.6
# via dateparser
# via djlint
# via mkdocs-material
requests==2.32.3
# via django-anymail
# via igdb-api-v4
@ -251,7 +252,7 @@ rjsmin==1.2.2
# via django-compressor
rq==2.1.0
# via django-rq
ruff==0.9.8
ruff==0.9.9
sentry-sdk==2.22.0
setproctitle==1.3.5
six==1.17.0

View file

@ -44,7 +44,7 @@ charset-normalizer==3.4.1
click==8.1.8
# via atproto
# via rq
cryptography==44.0.1
cryptography==44.0.2
# via atproto
dateparser==1.2.1
deepmerge==2.0

View file

@ -0,0 +1,29 @@
# Generated by Django 4.2.18 on 2025-03-03 23:16
from django.db import migrations, models
# Redefines ``Task.type`` as an indexed CharField (max_length 255) whose
# choices are the task type identifiers listed below, including
# "journal.csvimporter". Depends on users 0007_alter_task_type.
class Migration(migrations.Migration):
dependencies = [
("users", "0007_alter_task_type"),
]
operations = [
migrations.AlterField(
model_name="task",
name="type",
field=models.CharField(
choices=[
("journal.csvexporter", "csv exporter"),
("journal.csvimporter", "csv importer"),
("journal.doubanimporter", "douban importer"),
("journal.doufenexporter", "doufen exporter"),
("journal.goodreadsimporter", "goodreads importer"),
("journal.letterboxdimporter", "letterboxd importer"),
("journal.ndjsonexporter", "ndjson exporter"),
],
db_index=True,
max_length=255,
),
),
]

View file

@ -15,6 +15,127 @@
{% include "_header.html" %}
<main>
<div class="grid__main">
<article>
<details>
<summary>{% trans 'Export Data' %}</summary>
<form action="{% url 'users:export_csv' %}"
method="post"
enctype="multipart/form-data">
{% csrf_token %}
<input type="submit"
value="{% trans 'Export marks, reviews and notes in CSV' %}" />
{% if csv_export_task %}
<br>
{% trans 'Last export' %}: {{ csv_export_task.created_time }}
{% trans 'Status' %}: {{ csv_export_task.get_state_display }}
<br>
{{ csv_export_task.message }}
{% if csv_export_task.metadata.file %}
<a href="{% url 'users:export_csv' %}" download>{% trans 'Download' %}</a>
{% endif %}
{% endif %}
</form>
<hr>
<form action="{% url 'users:export_ndjson' %}"
method="post"
enctype="multipart/form-data">
{% csrf_token %}
<input type="submit" value="{% trans 'Export everything in NDJSON' %}" />
{% if ndjson_export_task %}
<br>
{% trans 'Last export' %}: {{ ndjson_export_task.created_time }}
{% trans 'Status' %}: {{ ndjson_export_task.get_state_display }}
<br>
{{ ndjson_export_task.message }}
{% if ndjson_export_task.metadata.file %}
<a href="{% url 'users:export_ndjson' %}" download>{% trans 'Download' %}</a>
{% endif %}
{% endif %}
</form>
<hr>
<form action="{% url 'users:export_marks' %}"
method="post"
enctype="multipart/form-data">
{% csrf_token %}
<input type="submit"
class="secondary"
value="{% trans 'Export marks and reviews in XLSX (Doufen format)' %}" />
<small>exporting to this format will be deprecated soon.</small>
{% if export_task %}
<br>
{% trans 'Last export' %}: {{ export_task.created_time }}
{% trans 'Status' %}: {{ export_task.get_state_display }}
<br>
{{ export_task.message }}
{% if export_task.metadata.file %}
<a href="{% url 'users:export_marks' %}" download>{% trans 'Download' %}</a>
{% endif %}
{% endif %}
</form>
</details>
</article>
<article>
<details>
<summary>{% trans 'Import Data' %}</summary>
<form action="{% url 'users:import_neodb' %}"
method="post"
enctype="multipart/form-data">
{% csrf_token %}
<ul>
<li>
{% trans 'Upload a <code>.zip</code> file containing <code>.csv</code> or <code>.ndjson</code> files exported from NeoDB.' %}
</li>
<li>{% trans 'Existing marks and reviews with newer dates will be preserved.' %}</li>
</ul>
<br>
<input type="file" name="file" required accept=".zip">
<p>
{% trans 'Visibility' %}: <small><code>.csv</code> only</small>
<br>
<label for="csv_visibility_0">
<input type="radio"
name="visibility"
value="0"
required=""
id="csv_visibility_0"
checked>
{% trans 'Public' %}
</label>
<label for="csv_visibility_1">
<input type="radio"
name="visibility"
value="1"
required=""
id="csv_visibility_1">
{% trans 'Followers Only' %}
</label>
<label for="csv_visibility_2">
<input type="radio"
name="visibility"
value="2"
required=""
id="csv_visibility_2">
{% trans 'Mentioned Only' %}
</label>
</p>
<input type="submit" value="{% trans 'Import' %}" />
<small>
{% if csv_import_task %}
<br>
{% trans 'Last import started' %}: {{ csv_import_task.created_time }}
{% trans 'Status' %}: {{ csv_import_task.get_state_display }}。
<br>
{{ csv_import_task.message }}
{% if csv_import_task.metadata.failed_items %}
{% trans 'Failed items' %}:
<br>
<textarea readonly>{% for item in csv_import_task.metadata.failed_items %}{{item}}&#10;{% endfor %}</textarea>
{% endif %}
{% endif %}
</small>
</form>
</details>
</article>
<article>
<details>
<summary>{% trans 'Import Marks and Reviews from Douban' %}</summary>
@ -213,123 +334,6 @@
</form>
</details>
</article>
<article>
<details>
<summary>{% trans 'Import marks, reviews and notes from CSV' %}</summary>
<form action="{% url 'users:import_csv' %}"
method="post"
enctype="multipart/form-data">
{% csrf_token %}
<ul>
<li>{% trans 'Upload a ZIP file containing CSV files exported from NeoDB.' %}</li>
<li>{% trans 'Existing marks and reviews with newer dates will be preserved.' %}</li>
</ul>
<br>
<input type="file" name="file" required accept=".zip">
<p>
{% trans 'Visibility' %}:
<br>
<label for="csv_visibility_0">
<input type="radio"
name="visibility"
value="0"
required=""
id="csv_visibility_0"
checked>
{% trans 'Public' %}
</label>
<label for="csv_visibility_1">
<input type="radio"
name="visibility"
value="1"
required=""
id="csv_visibility_1">
{% trans 'Followers Only' %}
</label>
<label for="csv_visibility_2">
<input type="radio"
name="visibility"
value="2"
required=""
id="csv_visibility_2">
{% trans 'Mentioned Only' %}
</label>
</p>
<input type="submit" value="{% trans 'Import' %}" />
<small>
{% if csv_import_task %}
<br>
{% trans 'Last import started' %}: {{ csv_import_task.created_time }}
{% trans 'Status' %}: {{ csv_import_task.get_state_display }}。
<br>
{{ csv_import_task.message }}
{% if csv_import_task.metadata.failed_items %}
{% trans 'Failed items' %}:
<br>
<textarea readonly>{% for item in csv_import_task.metadata.failed_items %}{{item}}&#10;{% endfor %}</textarea>
{% endif %}
{% endif %}
</small>
</form>
</details>
</article>
<article>
<details>
<summary>{% trans 'Export Data' %}</summary>
<form action="{% url 'users:export_marks' %}"
method="post"
enctype="multipart/form-data">
{% csrf_token %}
<input type="submit"
value="{% trans 'Export marks and reviews in XLSX (Doufen format)' %}" />
{% if export_task %}
<br>
{% trans 'Last export' %}: {{ export_task.created_time }}
{% trans 'Status' %}: {{ export_task.get_state_display }}
<br>
{{ export_task.message }}
{% if export_task.metadata.file %}
<a href="{% url 'users:export_marks' %}" download>{% trans 'Download' %}</a>
{% endif %}
{% endif %}
</form>
<hr>
<form action="{% url 'users:export_csv' %}"
method="post"
enctype="multipart/form-data">
{% csrf_token %}
<input type="submit"
value="{% trans 'Export marks, reviews and notes in CSV' %}" />
{% if csv_export_task %}
<br>
{% trans 'Last export' %}: {{ csv_export_task.created_time }}
{% trans 'Status' %}: {{ csv_export_task.get_state_display }}
<br>
{{ csv_export_task.message }}
{% if csv_export_task.metadata.file %}
<a href="{% url 'users:export_csv' %}" download>{% trans 'Download' %}</a>
{% endif %}
{% endif %}
</form>
<hr>
<form action="{% url 'users:export_ndjson' %}"
method="post"
enctype="multipart/form-data">
{% csrf_token %}
<input type="submit" value="{% trans 'Export everything in NDJSON' %}" />
{% if ndjson_export_task %}
<br>
{% trans 'Last export' %}: {{ ndjson_export_task.created_time }}
{% trans 'Status' %}: {{ ndjson_export_task.get_state_display }}
<br>
{{ ndjson_export_task.message }}
{% if ndjson_export_task.metadata.file %}
<a href="{% url 'users:export_ndjson' %}" download>{% trans 'Download' %}</a>
{% endif %}
{% endif %}
</form>
</details>
</article>
<article>
<details>
<summary>{% trans 'View Annual Summary' %}</summary>

View file

@ -15,7 +15,7 @@ urlpatterns = [
path("data/import/douban", import_douban, name="import_douban"),
path("data/import/letterboxd", import_letterboxd, name="import_letterboxd"),
path("data/import/opml", import_opml, name="import_opml"),
path("data/import/csv", import_csv, name="import_csv"),
path("data/import/neodb", import_neodb, name="import_neodb"),
path("data/export/reviews", export_reviews, name="export_reviews"),
path("data/export/marks", export_marks, name="export_marks"),
path("data/export/csv", export_csv, name="export_csv"),

View file

@ -19,6 +19,7 @@ from journal.importers import (
GoodreadsImporter,
LetterboxdImporter,
OPMLImporter,
get_neodb_importer,
)
from journal.models import ShelfType
from takahe.utils import Takahe
@ -324,7 +325,7 @@ def import_opml(request):
@login_required
def import_csv(request):
def import_neodb(request):
if request.method == "POST":
f = (
settings.MEDIA_ROOT
@ -335,10 +336,11 @@ def import_csv(request):
with open(f, "wb+") as destination:
for chunk in request.FILES["file"].chunks():
destination.write(chunk)
if not CsvImporter.validate_file(f):
importer = get_neodb_importer(f)
if not importer:
messages.add_message(request, messages.ERROR, _("Invalid file."))
return redirect(reverse("users:data"))
CsvImporter.create(
importer.create(
request.user,
visibility=int(request.POST.get("visibility", 0)),
file=f,