add csv importer

Authored by mein Name on 2025-03-03 00:15:53 -05:00; committed by Henri Dickson
parent e586d660a8
commit 5911d422f5
7 changed files with 719 additions and 23 deletions


@@ -447,6 +447,9 @@ LANGUAGE_CODE, PREFERRED_LANGUAGES = _init_language_settings(
     env("NEODB_PREFERRED_LANGUAGES")
 )
+if TESTING:  # force en if testing
+    LANGUAGE_CODE = "en"
+
 LOCALE_PATHS = [os.path.join(BASE_DIR, "locale")]
 TIME_ZONE = env("NEODB_TIMEZONE", default="Asia/Shanghai")  # type: ignore


@@ -1,6 +1,13 @@
+from .csv import CsvImporter
 from .douban import DoubanImporter
 from .goodreads import GoodreadsImporter
 from .letterboxd import LetterboxdImporter
 from .opml import OPMLImporter
 
-__all__ = ["LetterboxdImporter", "OPMLImporter", "DoubanImporter", "GoodreadsImporter"]
+__all__ = [
+    "CsvImporter",
+    "LetterboxdImporter",
+    "OPMLImporter",
+    "DoubanImporter",
+    "GoodreadsImporter",
+]
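With this change the new importer is exposed from the package root like the existing ones; a minimal sketch of downstream usage (module path as in this repo):

    from journal.importers import CsvImporter  # re-exported via __all__ above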

journal/importers/csv.py (new file, 394 lines)

@@ -0,0 +1,394 @@
import csv
import datetime
import os
import tempfile
import zipfile
from typing import Dict, List, Optional

from django.conf import settings
from django.utils import timezone
from django.utils.dateparse import parse_datetime
from django.utils.translation import gettext as _
from loguru import logger

from catalog.common.sites import SiteManager
from catalog.models import Edition, IdType, Item, ItemCategory
from journal.models import Mark, Note, Review, ShelfType
from users.models import Task


class CsvImporter(Task):
    class Meta:
        app_label = "journal"  # workaround bug in TypedModel

    TaskQueue = "import"
    DefaultMetadata = {
        "total": 0,
        "processed": 0,
        "skipped": 0,
        "imported": 0,
        "failed": 0,
        "failed_items": [],
        "file": None,
        "visibility": 0,
    }

    def get_item_by_info_and_links(
        self, title: str, info_str: str, links_str: str
    ) -> Optional[Item]:
        """Find an item based on information from CSV export.

        Args:
            title: Item title
            info_str: Item info string (space-separated key:value pairs)
            links_str: Space-separated URLs

        Returns:
            Item if found, None otherwise
        """
        site_url = settings.SITE_INFO["site_url"] + "/"
        links = links_str.strip().split()
        for link in links:
            if link.startswith("/") or link.startswith(site_url):
                item = Item.get_by_url(link)
                if item:
                    return item
        for link in links:
            site = SiteManager.get_site_by_url(link)
            if site:
                site.get_resource_ready()
                item = site.get_item()
                if item:
                    return item
        # Try using the info string
        if info_str:
            info_dict = {}
            for pair in info_str.strip().split():
                if ":" in pair:
                    key, value = pair.split(":", 1)
                    info_dict[key] = value
            # Check for ISBN, IMDB, etc.
            item = None
            for key, value in info_dict.items():
                if key == "isbn" and value:
                    item = Edition.objects.filter(
                        primary_lookup_id_type=IdType.ISBN,
                        primary_lookup_id_value=value,
                    ).first()
                elif key == "imdb" and value:
                    item = Item.objects.filter(
                        primary_lookup_id_type=IdType.IMDB,
                        primary_lookup_id_value=value,
                    ).first()
                if item:
                    return item
        return None

    def parse_tags(self, tags_str: str) -> List[str]:
        """Parse space-separated tags string into a list of tags."""
        if not tags_str:
            return []
        return [tag.strip() for tag in tags_str.split() if tag.strip()]

    def parse_info(self, info_str: str) -> Dict[str, str]:
        """Parse info string into a dictionary."""
        info_dict = {}
        if not info_str:
            return info_dict
        for pair in info_str.split():
            if ":" in pair:
                key, value = pair.split(":", 1)
                info_dict[key] = value
        return info_dict

    def parse_datetime(self, timestamp_str: str) -> Optional[datetime.datetime]:
        """Parse ISO format timestamp into datetime."""
        if not timestamp_str:
            return None
        try:
            dt = parse_datetime(timestamp_str)
            if dt and dt.tzinfo is None:
                dt = dt.replace(tzinfo=datetime.UTC)
            return dt
        except Exception as e:
            logger.error(f"Error parsing datetime {timestamp_str}: {e}")
            return None

    def parse_shelf_type(self, status_str: str) -> ShelfType:
        """Parse shelf type string into ShelfType enum."""
        if not status_str:
            return ShelfType.WISHLIST
        status_map = {
            "wishlist": ShelfType.WISHLIST,
            "progress": ShelfType.PROGRESS,
            "complete": ShelfType.COMPLETE,
            "dropped": ShelfType.DROPPED,
        }
        return status_map.get(status_str.lower(), ShelfType.WISHLIST)

    def import_mark(self, row: Dict[str, str]) -> bool:
        """Import a mark from a CSV row."""
        try:
            item = self.get_item_by_info_and_links(
                row.get("title", ""), row.get("info", ""), row.get("links", "")
            )
            if not item:
                logger.error(f"Could not find item: {row.get('links', '')}")
                self.metadata["failed_items"].append(
                    f"Could not find item: {row.get('links', '')}"
                )
                return False
            owner = self.user.identity
            mark = Mark(owner, item)
            shelf_type = self.parse_shelf_type(row.get("status", ""))
            rating_grade = None
            if "rating" in row and row["rating"]:
                try:
                    rating_grade = int(float(row["rating"]))
                except (ValueError, TypeError):
                    pass
            comment_text = row.get("comment", "")
            tags = self.parse_tags(row.get("tags", ""))
            # Parse timestamp
            created_time = (
                self.parse_datetime(row.get("timestamp", "")) or timezone.now()
            )
            if (
                mark.shelf_type
                and mark.created_time
                and mark.created_time >= created_time
            ):
                # skip if existing mark is newer
                self.metadata["skipped"] = self.metadata.get("skipped", 0) + 1
                return True
            # Update the mark
            mark.update(
                shelf_type,
                comment_text=comment_text,
                rating_grade=rating_grade,
                tags=tags,
                created_time=created_time,
                visibility=self.metadata.get("visibility", 0),
            )
            return True
        except Exception as e:
            logger.error(f"Error importing mark: {e}")
            self.metadata["failed_items"].append(
                f"Error importing mark for {row.get('title', '')}"
            )
            return False

    def import_review(self, row: Dict[str, str]) -> bool:
        """Import a review from a CSV row."""
        try:
            item = self.get_item_by_info_and_links(
                row.get("title", ""), row.get("info", ""), row.get("links", "")
            )
            if not item:
                logger.error(f"Could not find item for review: {row.get('links', '')}")
                self.metadata["failed_items"].append(
                    f"Could not find item for review: {row.get('links', '')}"
                )
                return False
            owner = self.user.identity
            review_title = row.get("title", "")  # Second "title" field is review title
            review_content = row.get("content", "")
            # Parse timestamp
            created_time = self.parse_datetime(row.get("timestamp", ""))
            # Check if there's an existing review with the same or newer timestamp
            existing_review = Review.objects.filter(
                owner=owner, item=item, title=review_title
            ).first()
            # Skip if existing review is newer or same age
            if (
                existing_review
                and existing_review.created_time
                and created_time
                and existing_review.created_time >= created_time
            ):
                logger.debug(
                    f"Skipping review import for {item.display_title}: existing review is newer or same age"
                )
                self.metadata["skipped"] = self.metadata.get("skipped", 0) + 1
                return True
            # Create/update the review
            Review.update_item_review(
                item,
                owner,
                review_title,
                review_content,
                created_time=created_time,
                visibility=self.metadata.get("visibility", 0),
            )
            return True
        except Exception as e:
            logger.error(f"Error importing review: {e}")
            self.metadata["failed_items"].append(
                f"Error importing review for {row.get('title', '')}: {str(e)}"
            )
            return False

    def import_note(self, row: Dict[str, str]) -> bool:
        """Import a note from a CSV row."""
        try:
            item = self.get_item_by_info_and_links(
                row.get("title", ""), row.get("info", ""), row.get("links", "")
            )
            if not item:
                logger.error(f"Could not find item for note: {row.get('links', '')}")
                self.metadata["failed_items"].append(
                    f"Could not find item for note: {row.get('links', '')}"
                )
                return False
            owner = self.user.identity
            title = row.get("title", "")  # Second "title" field is note title
            content = row.get("content", "")
            progress = row.get("progress", "")
            # Parse timestamp
            created_time = self.parse_datetime(row.get("timestamp", ""))
            # Extract progress information
            pt, pv = Note.extract_progress(progress)
            # Check if a note with the same attributes already exists
            existing_notes = Note.objects.filter(
                item=item,
                owner=owner,
                title=title,
                progress_type=pt,
                progress_value=pv,
            )
            # If we have an exact content match, skip this import
            for existing_note in existing_notes:
                if existing_note.content == content:
                    logger.debug(
                        f"Skipping note import for {item.display_title}: duplicate note found"
                    )
                    self.metadata["skipped"] = self.metadata.get("skipped", 0) + 1
                    return True
            # Create the note if no duplicate is found
            Note.objects.create(
                item=item,
                owner=owner,
                title=title,
                content=content,
                progress_type=pt,
                progress_value=pv,
                created_time=created_time,
                visibility=self.metadata.get("visibility", 0),
            )
            return True
        except Exception as e:
            logger.error(f"Error importing note: {e}")
            if "failed_items" not in self.metadata:
                self.metadata["failed_items"] = []
            self.metadata["failed_items"].append(
                f"Error importing note for {row.get('title', '')}: {str(e)}"
            )
            return False

    def progress(self, success: bool) -> None:
        """Update import progress."""
        self.metadata["total"] += 1
        self.metadata["processed"] += 1
        if success:
            self.metadata["imported"] += 1
        else:
            self.metadata["failed"] += 1
        self.message = f"{self.metadata['imported']} imported, {self.metadata['skipped']} skipped, {self.metadata['failed']} failed"
        self.save(update_fields=["metadata", "message"])

    def process_csv_file(self, file_path: str, import_function) -> None:
        """Process a CSV file using the specified import function."""
        logger.debug(f"Processing {file_path}")
        with open(file_path, "r") as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                success = import_function(row)
                self.progress(success)

    def validate_file(self, filename: str) -> bool:
        """Validate that the given file is a valid CSV export ZIP file.

        Args:
            filename: Path to the file to validate

        Returns:
            bool: True if the file is valid, False otherwise
        """
        return os.path.exists(filename) and zipfile.is_zipfile(filename)

    def run(self) -> None:
        """Run the CSV import."""
        # Ensure failed_items is initialized
        if "failed_items" not in self.metadata:
            self.metadata["failed_items"] = []
        filename = self.metadata["file"]
        logger.debug(f"Importing {filename}")
        # Validate the file before processing
        if not self.validate_file(filename):
            self.save()
            return
        with zipfile.ZipFile(filename, "r") as zipref:
            with tempfile.TemporaryDirectory() as tmpdirname:
                logger.debug(f"Extracting {filename} to {tmpdirname}")
                zipref.extractall(tmpdirname)
                # Look for mark, review, and note CSV files
                for category in [
                    ItemCategory.Movie,
                    ItemCategory.TV,
                    ItemCategory.Music,
                    ItemCategory.Book,
                    ItemCategory.Game,
                    ItemCategory.Podcast,
                    ItemCategory.Performance,
                ]:
                    # Import marks
                    mark_file = os.path.join(tmpdirname, f"{category}_mark.csv")
                    if os.path.exists(mark_file):
                        self.process_csv_file(mark_file, self.import_mark)
                    # Import reviews
                    review_file = os.path.join(tmpdirname, f"{category}_review.csv")
                    if os.path.exists(review_file):
                        self.process_csv_file(review_file, self.import_review)
                    # Import notes
                    note_file = os.path.join(tmpdirname, f"{category}_note.csv")
                    if os.path.exists(note_file):
                        self.process_csv_file(note_file, self.import_note)
        self.message = _("Import complete")
        if self.metadata.get("failed_items", []):
            self.message += f": {self.metadata['failed']} items failed ({len(self.metadata['failed_items'])} unique items)"
        self.save()
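For context, a sketch of how the task is driven, using only names defined in this file and the create(...) signature exercised by the test below; the user object and archive path are hypothetical:

    # Hypothetical driver for CsvImporter; `some_user` and the zip path are placeholders.
    importer = CsvImporter.create(user=some_user, file="/tmp/neodb_export.zip", visibility=1)
    importer.run()  # extracts the zip, then walks {category}_mark/review/note.csv per category
    print(importer.message)  # e.g. "3 imported, 1 skipped, 0 failed" or "Import complete"
    print(importer.metadata["imported"], importer.metadata["skipped"], importer.metadata["failed"])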

journal/tests/__init__.py (new file, 3 lines)

@@ -0,0 +1,3 @@
from .csv import *
from .piece import *
from .search import *

journal/tests/csv.py (new file, 283 lines)

@@ -0,0 +1,283 @@
import csv
import os
import zipfile
from tempfile import TemporaryDirectory

from django.test import TestCase
from django.utils.dateparse import parse_datetime
from loguru import logger

from catalog.models import Edition, IdType, Movie, TVEpisode, TVSeason, TVShow
from journal.exporters import CsvExporter
from journal.importers import CsvImporter
from users.models import User

from ..models import *


class CsvExportImportTest(TestCase):
    databases = "__all__"

    def setUp(self):
        # Create test items of different types
        self.book1 = Edition.objects.create(
            localized_title=[{"lang": "en", "text": "Hyperion"}],
            primary_lookup_id_type=IdType.ISBN,
            primary_lookup_id_value="9780553283686",
            author=["Dan Simmons"],
            pub_year=1989,
        )
        self.book2 = Edition.objects.create(
            localized_title=[{"lang": "en", "text": "Dune"}],
            primary_lookup_id_type=IdType.ISBN,
            primary_lookup_id_value="9780441172719",
            author=["Frank Herbert"],
            pub_year=1965,
        )
        self.movie1 = Movie.objects.create(
            localized_title=[{"lang": "en", "text": "Inception"}],
            primary_lookup_id_type=IdType.IMDB,
            primary_lookup_id_value="tt1375666",
            director=["Christopher Nolan"],
            year=2010,
        )
        self.movie2 = Movie.objects.create(
            localized_title=[{"lang": "en", "text": "The Matrix"}],
            primary_lookup_id_type=IdType.IMDB,
            primary_lookup_id_value="tt0133093",
            director=["Lana Wachowski", "Lilly Wachowski"],
            year=1999,
        )
        self.tvshow = TVShow.objects.create(
            localized_title=[{"lang": "en", "text": "Breaking Bad"}],
            primary_lookup_id_type=IdType.IMDB,
            primary_lookup_id_value="tt0903747",
            year=2008,
        )
        self.tvseason = TVSeason.objects.create(
            localized_title=[{"lang": "en", "text": "Breaking Bad Season 1"}],
            show=self.tvshow,
            season_number=1,
        )
        self.tvepisode1 = TVEpisode.objects.create(
            localized_title=[{"lang": "en", "text": "Pilot"}],
            season=self.tvseason,
            episode_number=1,
        )
        self.tvepisode2 = TVEpisode.objects.create(
            localized_title=[{"lang": "en", "text": "Cat's in the Bag..."}],
            season=self.tvseason,
            episode_number=2,
        )
        # Create user for testing
        self.user1 = User.register(email="export@test.com", username="exporter")
        self.user2 = User.register(email="import@test.com", username="importer")
        self.dt = parse_datetime("2021-01-01T00:00:00Z")

    def test_csv_export_import(self):
        # Create marks, reviews and notes for user1
        # Book marks with ratings and tags
        mark_book1 = Mark(self.user1.identity, self.book1)
        mark_book1.update(
            ShelfType.COMPLETE,
            "Great sci-fi classic",
            10,
            ["sci-fi", "favorite", "space"],
            1,
            created_time=self.dt,
        )
        mark_book2 = Mark(self.user1.identity, self.book2)
        mark_book2.update(
            ShelfType.PROGRESS, "Reading it now", None, ["sci-fi", "desert"], 1
        )
        # Movie marks with ratings
        mark_movie1 = Mark(self.user1.identity, self.movie1)
        mark_movie1.update(
            ShelfType.COMPLETE, "Mind-bending", 8, ["mindbender", "scifi"], 1
        )
        mark_movie2 = Mark(self.user1.identity, self.movie2)
        mark_movie2.update(ShelfType.WISHLIST, "Need to rewatch", None, [], 1)
        # TV show mark
        mark_tvshow = Mark(self.user1.identity, self.tvshow)
        mark_tvshow.update(ShelfType.WISHLIST, "Heard it's good", None, ["drama"], 1)
        # TV episode marks
        mark_episode1 = Mark(self.user1.identity, self.tvepisode1)
        mark_episode1.update(ShelfType.COMPLETE, "Great start", 9, [], 1)
        mark_episode2 = Mark(self.user1.identity, self.tvepisode2)
        mark_episode2.update(ShelfType.COMPLETE, "It gets better", 9, [], 1)
        # Create reviews
        Review.update_item_review(
            self.book1,
            self.user1.identity,
            "My thoughts on Hyperion",
            "A masterpiece of science fiction that weaves multiple storylines into a captivating narrative.",
            visibility=1,
            created_time=self.dt,
        )
        Review.update_item_review(
            self.movie1,
            self.user1.identity,
            "Inception Review",
            "Christopher Nolan at his best. The movie plays with reality and dreams in a fascinating way.",
            visibility=1,
        )
        # Create notes
        Note.objects.create(
            item=self.book2,
            owner=self.user1.identity,
            title="Reading progress",
            content="Just finished the first part. The world-building is incredible.\n\n - p 125",
            progress_type=Note.ProgressType.PAGE,
            progress_value="p 125",
            visibility=1,
        )
        Note.objects.create(
            item=self.tvshow,
            owner=self.user1.identity,
            title="Before watching",
            content="Things to look out for according to friends:\n- Character development\n- Color symbolism\n\n - e 0",
            progress_type=Note.ProgressType.EPISODE,
            progress_value="2",
            visibility=1,
        )
        # Export data to CSV
        exporter = CsvExporter.create(user=self.user1)
        exporter.run()
        export_path = exporter.metadata["file"]
        logger.debug(f"exported to {export_path}")
        self.assertTrue(os.path.exists(export_path))
        # Validate the number of CSV rows in the export files
        with TemporaryDirectory() as extract_dir:
            with zipfile.ZipFile(export_path, "r") as zip_ref:
                zip_ref.extractall(extract_dir)
                logger.debug(f"unzipped to {extract_dir}")
            # Expected row counts (data rows, excluding header)
            expected_data_rows = {
                "book_mark.csv": 2,  # 2 book marks
                "book_review.csv": 1,  # 1 book review
                "book_note.csv": 1,  # 1 book note
                "movie_mark.csv": 2,  # 2 movie marks
                "movie_review.csv": 1,  # 1 movie review
                "movie_note.csv": 0,  # No movie notes
                "tv_mark.csv": 3,  # 3 TV marks (show + 2 episodes)
                "tv_note.csv": 1,  # 1 TV note
                "tv_review.csv": 0,  # No TV reviews
                "music_mark.csv": 0,  # No music marks
                "music_review.csv": 0,  # No music reviews
                "music_note.csv": 0,  # No music notes
                "game_mark.csv": 0,  # No game marks
                "game_review.csv": 0,  # No game reviews
                "game_note.csv": 0,  # No game notes
                "podcast_mark.csv": 0,  # No podcast marks
                "podcast_review.csv": 0,  # No podcast reviews
                "podcast_note.csv": 0,  # No podcast notes
                "performance_mark.csv": 0,  # No performance marks
                "performance_review.csv": 0,  # No performance reviews
                "performance_note.csv": 0,  # No performance notes
            }
            # Check each file
            for filename, expected_data_count in expected_data_rows.items():
                file_path = os.path.join(extract_dir, filename)
                if os.path.exists(file_path):
                    with open(file_path, "r") as file:
                        csv_reader = csv.reader(file)
                        # Skip header row
                        next(csv_reader, None)
                        # Count data rows
                        row_count = sum(1 for _ in csv_reader)
                        self.assertEqual(
                            row_count,
                            expected_data_count,
                            f"File {filename} has {row_count} data rows, expected {expected_data_count}",
                        )
                    # Check header row is present by reopening the file
                    with open(file_path, "r") as header_check:
                        first_line = next(header_check, "")
                        self.assertTrue(
                            first_line.strip(),
                            f"File {filename} has no header row",
                        )
                elif expected_data_count > 0:
                    self.fail(
                        f"Expected file {filename} with {expected_data_count} data rows, but file not found"
                    )
        importer = CsvImporter.create(user=self.user2, file=export_path, visibility=2)
        importer.run()
        self.assertEqual(importer.message, "Import complete")
        # Verify imported data
        # Check marks
        mark_book1_imported = Mark(self.user2.identity, self.book1)
        self.assertEqual(mark_book1_imported.shelf_type, ShelfType.COMPLETE)
        self.assertEqual(mark_book1_imported.comment_text, "Great sci-fi classic")
        self.assertEqual(mark_book1_imported.rating_grade, 10)
        self.assertEqual(mark_book1_imported.visibility, 2)
        self.assertEqual(mark_book1_imported.created_time, self.dt)
        self.assertEqual(
            set(mark_book1_imported.tags), set(["sci-fi", "favorite", "space"])
        )
        mark_book2_imported = Mark(self.user2.identity, self.book2)
        self.assertEqual(mark_book2_imported.shelf_type, ShelfType.PROGRESS)
        self.assertEqual(mark_book2_imported.comment_text, "Reading it now")
        self.assertIsNone(mark_book2_imported.rating_grade)
        self.assertEqual(set(mark_book2_imported.tags), set(["sci-fi", "desert"]))
        mark_movie1_imported = Mark(self.user2.identity, self.movie1)
        self.assertEqual(mark_movie1_imported.shelf_type, ShelfType.COMPLETE)
        self.assertEqual(mark_movie1_imported.comment_text, "Mind-bending")
        self.assertEqual(mark_movie1_imported.rating_grade, 8)
        self.assertEqual(set(mark_movie1_imported.tags), set(["mindbender", "scifi"]))
        mark_episode1_imported = Mark(self.user2.identity, self.tvepisode1)
        self.assertEqual(mark_episode1_imported.shelf_type, ShelfType.COMPLETE)
        self.assertEqual(mark_episode1_imported.comment_text, "Great start")
        self.assertEqual(mark_episode1_imported.rating_grade, 9)
        # Check reviews
        book1_reviews = Review.objects.filter(
            owner=self.user2.identity, item=self.book1
        )
        self.assertEqual(book1_reviews.count(), 1)
        self.assertEqual(book1_reviews[0].title, "My thoughts on Hyperion")
        self.assertEqual(book1_reviews[0].created_time, self.dt)
        self.assertIn("masterpiece of science fiction", book1_reviews[0].body)
        movie1_reviews = Review.objects.filter(
            owner=self.user2.identity, item=self.movie1
        )
        self.assertEqual(movie1_reviews.count(), 1)
        self.assertEqual(movie1_reviews[0].title, "Inception Review")
        self.assertIn("Christopher Nolan", movie1_reviews[0].body)
        # Check notes
        book2_notes = Note.objects.filter(owner=self.user2.identity, item=self.book2)
        self.assertEqual(book2_notes.count(), 1)
        self.assertEqual(book2_notes[0].title, "Reading progress")
        self.assertIn("world-building is incredible", book2_notes[0].content)
        self.assertEqual(book2_notes[0].progress_type, Note.ProgressType.PAGE)
        self.assertEqual(book2_notes[0].progress_value, "125")
        tvshow_notes = Note.objects.filter(owner=self.user2.identity, item=self.tvshow)
        self.assertEqual(tvshow_notes.count(), 1)
        self.assertEqual(tvshow_notes[0].title, "Before watching")
        self.assertIn("Character development", tvshow_notes[0].content)


@@ -6,7 +6,7 @@ from catalog.models import Edition
 from journal.models.common import Debris
 from users.models import User
-from .models import *
+from ..models import *
 
 
 class CollectionTest(TestCase):
@@ -300,24 +300,3 @@ class NoteTest(TestCase):
         self.assertEqual(c, "test ")
         self.assertEqual(t, Note.ProgressType.CHAPTER)
         self.assertEqual(v, "2")
-
-
-class SearchTest(TestCase):
-    databases = "__all__"
-
-    def setUp(self):
-        self.book1 = Edition.objects.create(title="Hyperion")
-        self.book2 = Edition.objects.create(title="Andymion")
-        self.user1 = User.register(email="x@y.com", username="userx")
-        self.index = JournalIndex.instance()
-        self.index.delete_by_owner([self.user1.identity.pk])
-
-    def test_post(self):
-        mark = Mark(self.user1.identity, self.book1)
-        mark.update(ShelfType.WISHLIST, "a gentle comment", 9, ["Sci-Fi", "fic"], 0)
-        mark = Mark(self.user1.identity, self.book2)
-        mark.update(ShelfType.WISHLIST, "a gentle comment", None, ["nonfic"], 1)
-
-        q = JournalQueryParser("gentle")
-        q.filter_by_owner(self.user1.identity)
-        r = self.index.search(q)
-        self.assertEqual(r.total, 2)

journal/tests/search.py (new file, 27 lines)

@@ -0,0 +1,27 @@
from django.test import TestCase

from catalog.models import Edition
from users.models import User

from ..models import *


class SearchTest(TestCase):
    databases = "__all__"

    def setUp(self):
        self.book1 = Edition.objects.create(title="Hyperion")
        self.book2 = Edition.objects.create(title="Andymion")
        self.user1 = User.register(email="x@y.com", username="userx")
        self.index = JournalIndex.instance()
        self.index.delete_by_owner([self.user1.identity.pk])

    def test_post(self):
        mark = Mark(self.user1.identity, self.book1)
        mark.update(ShelfType.WISHLIST, "a gentle comment", 9, ["Sci-Fi", "fic"], 0)
        mark = Mark(self.user1.identity, self.book2)
        mark.update(ShelfType.WISHLIST, "a gentle comment", None, ["nonfic"], 1)

        q = JournalQueryParser("gentle")
        q.filter_by_owner(self.user1.identity)
        r = self.index.search(q)
        self.assertEqual(r.total, 2)