tweak importer to prefer fedi item link

This commit is contained in:
mein Name 2025-03-03 22:30:08 -05:00 committed by Henri Dickson
parent 1d7816d9d7
commit 584cc9516a
10 changed files with 244 additions and 155 deletions

View file

@ -1,13 +1,34 @@
import os
import zipfile
from .csv import CsvImporter from .csv import CsvImporter
from .douban import DoubanImporter from .douban import DoubanImporter
from .goodreads import GoodreadsImporter from .goodreads import GoodreadsImporter
from .letterboxd import LetterboxdImporter from .letterboxd import LetterboxdImporter
from .opml import OPMLImporter from .opml import OPMLImporter
def get_neodb_importer(filename: str) -> type[CsvImporter] | None:
    """Pick the importer class able to handle a NeoDB export archive.

    Args:
        filename: Path to the uploaded file.

    Returns:
        ``CsvImporter`` when the archive has at least one member whose name
        ends with ``_mark.csv``, ``_review.csv`` or ``_note.csv``.
        ``None`` when the path does not exist, is not a readable ZIP file,
        contains a ``journal.ndjson`` member (no importer is returned for
        that layout here), or has no recognised CSV member.
    """
    if not os.path.exists(filename) or not zipfile.is_zipfile(filename):
        return None
    try:
        with zipfile.ZipFile(filename, "r") as z:
            files = z.namelist()
    except zipfile.BadZipFile:
        # is_zipfile() can accept a file whose central directory is still
        # unreadable; treat that as "not importable" rather than crashing.
        return None
    # The NDJSON marker is checked first, so an archive that carries both
    # layouts is not routed to the CSV importer.
    if "journal.ndjson" in files:
        return None
    csv_suffixes = ("_mark.csv", "_review.csv", "_note.csv")
    if any(f.endswith(csv_suffixes) for f in files):
        return CsvImporter
    return None
__all__ = [ __all__ = [
"CsvImporter", "CsvImporter",
"LetterboxdImporter", "LetterboxdImporter",
"OPMLImporter", "OPMLImporter",
"DoubanImporter", "DoubanImporter",
"GoodreadsImporter", "GoodreadsImporter",
"get_neodb_importer",
] ]

View file

@ -12,10 +12,20 @@ from django.utils.translation import gettext as _
from loguru import logger from loguru import logger
from catalog.common.sites import SiteManager from catalog.common.sites import SiteManager
from catalog.models import Edition, IdType, Item, ItemCategory from catalog.models import Edition, IdType, Item, ItemCategory, SiteName
from journal.models import Mark, Note, Review, ShelfType from journal.models import Mark, Note, Review, ShelfType
from users.models import Task from users.models import Task
# Priority order for external links when a row lists several of them.
# The item lookup sorts candidate sites by their index in this list, so
# earlier entries are tried first; sites not listed here are ranked 99 and
# therefore sort after every listed site.
_PREFERRED_SITES = [
    SiteName.Fediverse,
    SiteName.RSS,
    SiteName.TMDB,
    SiteName.IMDB,
    SiteName.GoogleBooks,
    SiteName.Goodreads,
    SiteName.IGDB,
]
class CsvImporter(Task): class CsvImporter(Task):
class Meta: class Meta:
@ -49,18 +59,39 @@ class CsvImporter(Task):
site_url = settings.SITE_INFO["site_url"] + "/" site_url = settings.SITE_INFO["site_url"] + "/"
links = links_str.strip().split() links = links_str.strip().split()
# look for local items first
for link in links: for link in links:
if link.startswith("/") or link.startswith(site_url): if link.startswith("/") or link.startswith(site_url):
item = Item.get_by_url(link) item = Item.get_by_url(link)
if item: if item:
return item return item
for link in links:
site = SiteManager.get_site_by_url(link) sites = [SiteManager.get_site_by_url(link) for link in links]
if site: sites = [site for site in sites if site]
sites.sort(
key=lambda x: _PREFERRED_SITES.index(x.SITE_NAME)
if x.SITE_NAME in _PREFERRED_SITES
else 99
)
# look for external items that already matched
for site in sites:
logger.debug(f"matching {site.url}")
item = site.get_item()
if item:
return item
# fetch external item if possible
for site in sites:
try:
logger.debug(f"fetching {site.url}")
site.get_resource_ready() site.get_resource_ready()
item = site.get_item() item = site.get_item()
if item: if item:
return item return item
except Exception as e:
logger.error(f"Error fetching item: {e}")
# Try using the info string # Try using the info string
if info_str: if info_str:
info_dict = {} info_dict = {}
@ -304,8 +335,6 @@ class CsvImporter(Task):
return True return True
except Exception as e: except Exception as e:
logger.error(f"Error importing note: {e}") logger.error(f"Error importing note: {e}")
if "failed_items" not in self.metadata:
self.metadata["failed_items"] = []
self.metadata["failed_items"].append( self.metadata["failed_items"].append(
f"Error importing note for {row.get('title', '')}: {str(e)}" f"Error importing note for {row.get('title', '')}: {str(e)}"
) )
@ -333,32 +362,11 @@ class CsvImporter(Task):
success = import_function(row) success = import_function(row)
self.progress(success) self.progress(success)
@classmethod
def validate_file(cls, filename: str) -> bool:
    """Check that the given path exists and is a ZIP archive.

    Only the file's existence and ZIP format are tested; the archive's
    members are not inspected, so a ZIP without any CSV files also passes.

    Args:
        filename: Path to the file to validate

    Returns:
        bool: True if the path exists and is a ZIP file, False otherwise
    """
    return os.path.exists(filename) and zipfile.is_zipfile(filename)
def run(self) -> None: def run(self) -> None:
"""Run the CSV import.""" """Run the CSV import."""
# Ensure failed_items is initialized
if "failed_items" not in self.metadata:
self.metadata["failed_items"] = []
filename = self.metadata["file"] filename = self.metadata["file"]
logger.debug(f"Importing {filename}") logger.debug(f"Importing {filename}")
# Validate the file before processing
if not self.validate_file(filename):
self.save()
return
with zipfile.ZipFile(filename, "r") as zipref: with zipfile.ZipFile(filename, "r") as zipref:
with tempfile.TemporaryDirectory() as tmpdirname: with tempfile.TemporaryDirectory() as tmpdirname:
logger.debug(f"Extracting {filename} to {tmpdirname}") logger.debug(f"Extracting {filename} to {tmpdirname}")

View file

@ -0,0 +1,23 @@
# Generated by Django 4.2.18 on 2025-03-03 23:16
from django.db import migrations
class Migration(migrations.Migration):
    # Migrations that must be applied before this one.
    dependencies = [
        ("users", "0008_alter_task_type"),
        ("journal", "0005_csvexporter"),
    ]

    # Registers CsvImporter as a model based on users.task. It is declared
    # as a proxy ("proxy": True) and defines no fields, indexes or
    # constraints of its own.
    operations = [
        migrations.CreateModel(
            name="CsvImporter",
            fields=[],
            options={
                "proxy": True,
                "indexes": [],
                "constraints": [],
            },
            bases=("users.task",),
        ),
    ]

View file

@ -9,7 +9,7 @@ from loguru import logger
from catalog.models import Edition, IdType, Movie, TVEpisode, TVSeason, TVShow from catalog.models import Edition, IdType, Movie, TVEpisode, TVSeason, TVShow
from journal.exporters import CsvExporter from journal.exporters import CsvExporter
from journal.importers import CsvImporter from journal.importers import CsvImporter, get_neodb_importer
from users.models import User from users.models import User
from ..models import * from ..models import *
@ -219,6 +219,7 @@ class CsvExportImportTest(TestCase):
f"Expected file {filename} with {expected_data_count} data rows, but file not found" f"Expected file {filename} with {expected_data_count} data rows, but file not found"
) )
self.assertEqual(get_neodb_importer(export_path), CsvImporter)
importer = CsvImporter.create(user=self.user2, file=export_path, visibility=2) importer = CsvImporter.create(user=self.user2, file=export_path, visibility=2)
importer.run() importer.run()
self.assertEqual(importer.message, "Import complete") self.assertEqual(importer.message, "Import complete")

View file

@ -28,6 +28,8 @@ attrs==25.1.0
# via aiohttp # via aiohttp
babel==2.17.0 babel==2.17.0
# via mkdocs-material # via mkdocs-material
backrefs==5.8
# via mkdocs-material
beautifulsoup4==4.13.3 beautifulsoup4==4.13.3
# via markdownify # via markdownify
bleach==5.0.1 bleach==5.0.1
@ -54,7 +56,7 @@ click==8.1.8
colorama==0.4.6 colorama==0.4.6
# via djlint # via djlint
# via mkdocs-material # via mkdocs-material
cryptography==44.0.1 cryptography==44.0.2
# via atproto # via atproto
cssbeautifier==1.15.4 cssbeautifier==1.15.4
# via djlint # via djlint
@ -169,7 +171,7 @@ mkdocs==1.6.1
# via mkdocs-material # via mkdocs-material
mkdocs-get-deps==0.2.0 mkdocs-get-deps==0.2.0
# via mkdocs # via mkdocs
mkdocs-material==9.6.5 mkdocs-material==9.6.7
mkdocs-material-extensions==1.3.1 mkdocs-material-extensions==1.3.1
# via mkdocs-material # via mkdocs-material
multidict==6.1.0 multidict==6.1.0
@ -213,7 +215,7 @@ pygments==2.19.1
# via mkdocs-material # via mkdocs-material
pymdown-extensions==10.14.3 pymdown-extensions==10.14.3
# via mkdocs-material # via mkdocs-material
pyright==1.1.395 pyright==1.1.396
python-dateutil==2.9.0.post0 python-dateutil==2.9.0.post0
# via dateparser # via dateparser
# via django-auditlog # via django-auditlog
@ -241,7 +243,6 @@ redis==5.2.1
regex==2024.11.6 regex==2024.11.6
# via dateparser # via dateparser
# via djlint # via djlint
# via mkdocs-material
requests==2.32.3 requests==2.32.3
# via django-anymail # via django-anymail
# via igdb-api-v4 # via igdb-api-v4
@ -251,7 +252,7 @@ rjsmin==1.2.2
# via django-compressor # via django-compressor
rq==2.1.0 rq==2.1.0
# via django-rq # via django-rq
ruff==0.9.8 ruff==0.9.9
sentry-sdk==2.22.0 sentry-sdk==2.22.0
setproctitle==1.3.5 setproctitle==1.3.5
six==1.17.0 six==1.17.0

View file

@ -44,7 +44,7 @@ charset-normalizer==3.4.1
click==8.1.8 click==8.1.8
# via atproto # via atproto
# via rq # via rq
cryptography==44.0.1 cryptography==44.0.2
# via atproto # via atproto
dateparser==1.2.1 dateparser==1.2.1
deepmerge==2.0 deepmerge==2.0

View file

@ -0,0 +1,29 @@
# Generated by Django 4.2.18 on 2025-03-03 23:16
from django.db import migrations, models
class Migration(migrations.Migration):
    # Migration that must be applied before this one.
    dependencies = [
        ("users", "0007_alter_task_type"),
    ]

    # Redefines the "type" field of the task model: an indexed CharField
    # (max_length 255) restricted to the choices listed below.
    # NOTE(review): presumably "journal.csvimporter" is the newly added
    # choice; confirm against the previous users migration.
    operations = [
        migrations.AlterField(
            model_name="task",
            name="type",
            field=models.CharField(
                choices=[
                    ("journal.csvexporter", "csv exporter"),
                    ("journal.csvimporter", "csv importer"),
                    ("journal.doubanimporter", "douban importer"),
                    ("journal.doufenexporter", "doufen exporter"),
                    ("journal.goodreadsimporter", "goodreads importer"),
                    ("journal.letterboxdimporter", "letterboxd importer"),
                    ("journal.ndjsonexporter", "ndjson exporter"),
                ],
                db_index=True,
                max_length=255,
            ),
        ),
    ]

View file

@ -15,6 +15,127 @@
{% include "_header.html" %} {% include "_header.html" %}
<main> <main>
<div class="grid__main"> <div class="grid__main">
<article>
<details>
<summary>{% trans 'Export Data' %}</summary>
<form action="{% url 'users:export_csv' %}"
method="post"
enctype="multipart/form-data">
{% csrf_token %}
<input type="submit"
value="{% trans 'Export marks, reviews and notes in CSV' %}" />
{% if csv_export_task %}
<br>
{% trans 'Last export' %}: {{ csv_export_task.created_time }}
{% trans 'Status' %}: {{ csv_export_task.get_state_display }}
<br>
{{ csv_export_task.message }}
{% if csv_export_task.metadata.file %}
<a href="{% url 'users:export_csv' %}" download>{% trans 'Download' %}</a>
{% endif %}
{% endif %}
</form>
<hr>
<form action="{% url 'users:export_ndjson' %}"
method="post"
enctype="multipart/form-data">
{% csrf_token %}
<input type="submit" value="{% trans 'Export everything in NDJSON' %}" />
{% if ndjson_export_task %}
<br>
{% trans 'Last export' %}: {{ ndjson_export_task.created_time }}
{% trans 'Status' %}: {{ ndjson_export_task.get_state_display }}
<br>
{{ ndjson_export_task.message }}
{% if ndjson_export_task.metadata.file %}
<a href="{% url 'users:export_ndjson' %}" download>{% trans 'Download' %}</a>
{% endif %}
{% endif %}
</form>
<hr>
<form action="{% url 'users:export_marks' %}"
method="post"
enctype="multipart/form-data">
{% csrf_token %}
<input type="submit"
class="secondary"
value="{% trans 'Export marks and reviews in XLSX (Doufen format)' %}" />
<small>{% trans 'Exporting to this format will be deprecated soon.' %}</small>
{% if export_task %}
<br>
{% trans 'Last export' %}: {{ export_task.created_time }}
{% trans 'Status' %}: {{ export_task.get_state_display }}
<br>
{{ export_task.message }}
{% if export_task.metadata.file %}
<a href="{% url 'users:export_marks' %}" download>{% trans 'Download' %}</a>
{% endif %}
{% endif %}
</form>
</details>
</article>
<article>
<details>
<summary>{% trans 'Import Data' %}</summary>
<form action="{% url 'users:import_neodb' %}"
method="post"
enctype="multipart/form-data">
{% csrf_token %}
<ul>
<li>
{% trans 'Upload a <code>.zip</code> file containing <code>.csv</code> or <code>.ndjson</code> files exported from NeoDB.' %}
</li>
<li>{% trans 'Existing marks and reviews with newer dates will be preserved.' %}</li>
</ul>
<br>
<input type="file" name="file" required accept=".zip">
<p>
{% trans 'Visibility' %}: <small><code>.csv</code> only</small>
<br>
<label for="csv_visibility_0">
<input type="radio"
name="visibility"
value="0"
required=""
id="csv_visibility_0"
checked>
{% trans 'Public' %}
</label>
<label for="csv_visibility_1">
<input type="radio"
name="visibility"
value="1"
required=""
id="csv_visibility_1">
{% trans 'Followers Only' %}
</label>
<label for="csv_visibility_2">
<input type="radio"
name="visibility"
value="2"
required=""
id="csv_visibility_2">
{% trans 'Mentioned Only' %}
</label>
</p>
<input type="submit" value="{% trans 'Import' %}" />
<small>
{% if csv_import_task %}
<br>
{% trans 'Last import started' %}: {{ csv_import_task.created_time }}
{% trans 'Status' %}: {{ csv_import_task.get_state_display }}
<br>
{{ csv_import_task.message }}
{% if csv_import_task.metadata.failed_items %}
{% trans 'Failed items' %}:
<br>
<textarea readonly>{% for item in csv_import_task.metadata.failed_items %}{{item}}&#10;{% endfor %}</textarea>
{% endif %}
{% endif %}
</small>
</form>
</details>
</article>
<article> <article>
<details> <details>
<summary>{% trans 'Import Marks and Reviews from Douban' %}</summary> <summary>{% trans 'Import Marks and Reviews from Douban' %}</summary>
@ -213,123 +334,6 @@
</form> </form>
</details> </details>
</article> </article>
<article>
<details>
<summary>{% trans 'Import marks, reviews and notes from CSV' %}</summary>
<form action="{% url 'users:import_csv' %}"
method="post"
enctype="multipart/form-data">
{% csrf_token %}
<ul>
<li>{% trans 'Upload a ZIP file containing CSV files exported from NeoDB.' %}</li>
<li>{% trans 'Existing marks and reviews with newer dates will be preserved.' %}</li>
</ul>
<br>
<input type="file" name="file" required accept=".zip">
<p>
{% trans 'Visibility' %}:
<br>
<label for="csv_visibility_0">
<input type="radio"
name="visibility"
value="0"
required=""
id="csv_visibility_0"
checked>
{% trans 'Public' %}
</label>
<label for="csv_visibility_1">
<input type="radio"
name="visibility"
value="1"
required=""
id="csv_visibility_1">
{% trans 'Followers Only' %}
</label>
<label for="csv_visibility_2">
<input type="radio"
name="visibility"
value="2"
required=""
id="csv_visibility_2">
{% trans 'Mentioned Only' %}
</label>
</p>
<input type="submit" value="{% trans 'Import' %}" />
<small>
{% if csv_import_task %}
<br>
{% trans 'Last import started' %}: {{ csv_import_task.created_time }}
{% trans 'Status' %}: {{ csv_import_task.get_state_display }}。
<br>
{{ csv_import_task.message }}
{% if csv_import_task.metadata.failed_items %}
{% trans 'Failed items' %}:
<br>
<textarea readonly>{% for item in csv_import_task.metadata.failed_items %}{{item}}&#10;{% endfor %}</textarea>
{% endif %}
{% endif %}
</small>
</form>
</details>
</article>
<article>
<details>
<summary>{% trans 'Export Data' %}</summary>
<form action="{% url 'users:export_marks' %}"
method="post"
enctype="multipart/form-data">
{% csrf_token %}
<input type="submit"
value="{% trans 'Export marks and reviews in XLSX (Doufen format)' %}" />
{% if export_task %}
<br>
{% trans 'Last export' %}: {{ export_task.created_time }}
{% trans 'Status' %}: {{ export_task.get_state_display }}
<br>
{{ export_task.message }}
{% if export_task.metadata.file %}
<a href="{% url 'users:export_marks' %}" download>{% trans 'Download' %}</a>
{% endif %}
{% endif %}
</form>
<hr>
<form action="{% url 'users:export_csv' %}"
method="post"
enctype="multipart/form-data">
{% csrf_token %}
<input type="submit"
value="{% trans 'Export marks, reviews and notes in CSV' %}" />
{% if csv_export_task %}
<br>
{% trans 'Last export' %}: {{ csv_export_task.created_time }}
{% trans 'Status' %}: {{ csv_export_task.get_state_display }}
<br>
{{ csv_export_task.message }}
{% if csv_export_task.metadata.file %}
<a href="{% url 'users:export_csv' %}" download>{% trans 'Download' %}</a>
{% endif %}
{% endif %}
</form>
<hr>
<form action="{% url 'users:export_ndjson' %}"
method="post"
enctype="multipart/form-data">
{% csrf_token %}
<input type="submit" value="{% trans 'Export everything in NDJSON' %}" />
{% if ndjson_export_task %}
<br>
{% trans 'Last export' %}: {{ ndjson_export_task.created_time }}
{% trans 'Status' %}: {{ ndjson_export_task.get_state_display }}
<br>
{{ ndjson_export_task.message }}
{% if ndjson_export_task.metadata.file %}
<a href="{% url 'users:export_ndjson' %}" download>{% trans 'Download' %}</a>
{% endif %}
{% endif %}
</form>
</details>
</article>
<article> <article>
<details> <details>
<summary>{% trans 'View Annual Summary' %}</summary> <summary>{% trans 'View Annual Summary' %}</summary>

View file

@ -15,7 +15,7 @@ urlpatterns = [
path("data/import/douban", import_douban, name="import_douban"), path("data/import/douban", import_douban, name="import_douban"),
path("data/import/letterboxd", import_letterboxd, name="import_letterboxd"), path("data/import/letterboxd", import_letterboxd, name="import_letterboxd"),
path("data/import/opml", import_opml, name="import_opml"), path("data/import/opml", import_opml, name="import_opml"),
path("data/import/csv", import_csv, name="import_csv"), path("data/import/neodb", import_neodb, name="import_neodb"),
path("data/export/reviews", export_reviews, name="export_reviews"), path("data/export/reviews", export_reviews, name="export_reviews"),
path("data/export/marks", export_marks, name="export_marks"), path("data/export/marks", export_marks, name="export_marks"),
path("data/export/csv", export_csv, name="export_csv"), path("data/export/csv", export_csv, name="export_csv"),

View file

@ -19,6 +19,7 @@ from journal.importers import (
GoodreadsImporter, GoodreadsImporter,
LetterboxdImporter, LetterboxdImporter,
OPMLImporter, OPMLImporter,
get_neodb_importer,
) )
from journal.models import ShelfType from journal.models import ShelfType
from takahe.utils import Takahe from takahe.utils import Takahe
@ -324,7 +325,7 @@ def import_opml(request):
@login_required @login_required
def import_csv(request): def import_neodb(request):
if request.method == "POST": if request.method == "POST":
f = ( f = (
settings.MEDIA_ROOT settings.MEDIA_ROOT
@ -335,10 +336,11 @@ def import_csv(request):
with open(f, "wb+") as destination: with open(f, "wb+") as destination:
for chunk in request.FILES["file"].chunks(): for chunk in request.FILES["file"].chunks():
destination.write(chunk) destination.write(chunk)
if not CsvImporter.validate_file(f): importer = get_neodb_importer(f)
if not importer:
messages.add_message(request, messages.ERROR, _("Invalid file.")) messages.add_message(request, messages.ERROR, _("Invalid file."))
return redirect(reverse("users:data")) return redirect(reverse("users:data"))
CsvImporter.create( importer.create(
request.user, request.user,
visibility=int(request.POST.get("visibility", 0)), visibility=int(request.POST.get("visibility", 0)),
file=f, file=f,