From 0e41a1e5efa0518639223e564403b39ca576c659 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Thu, 26 Dec 2024 10:54:25 -0500
Subject: [PATCH] modernize importers: convert Douban and Goodreads importers to background Tasks
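
Convert DoubanImporter and GoodreadsImporter from ad-hoc classes that
enqueued django_rq jobs directly and tracked progress in
Preference.import_status into subclasses of users.models.Task, the same
base the existing letterboxd importer task already uses:

- progress counters (total / processed / skipped / imported /
  failed_urls) now live in Task.metadata and are persisted via save()
- user_messages notifications are replaced by the Task.message field
- input checking is split out into DoubanImporter.validate_file() and
  GoodreadsImporter.validate_url() classmethods
- Preference.import_status / export_status remain but are marked
  deprecated
- the --douban-import-redo / --douban-import-reset management command
  options and the redo()/reset() helpers they called are removed
- proxy-model migrations and Task.type choices are added for the two
  importer tasks

The call sites that actually schedule the tasks live in
users/views/data.py (see diffstat; not reproduced in this excerpt). As a
rough sketch of the intended flow -- the create()/enqueue() helpers below
are assumed base-class conveniences named only for illustration, not
confirmed by this diff:

    # hypothetical sketch; the real call site is users/views/data.py
    from journal.importers.goodreads import GoodreadsImporter

    def start_goodreads_import(user, raw_url, visibility=0):
        # validate_url() is the classmethod added in this patch
        if not GoodreadsImporter.validate_url(raw_url):
            return False
        task = GoodreadsImporter.create(      # assumed Task helper
            user=user,
            metadata={"url": raw_url, "visibility": visibility},
        )
        task.enqueue()                        # assumed Task helper
        return True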
---
journal/importers/douban.py | 178 +++++++-----------
journal/importers/goodreads.py | 163 ++++++++--------
journal/management/commands/journal.py | 20 --
journal/migrations/0004_tasks.py | 21 ++-
users/migrations/0006_alter_task_type.py | 2 +
users/models/preference.py | 4 +-
users/templates/users/data.html | 11 +-
users/templates/users/data_import_status.html | 30 ++-
users/views/data.py | 47 +++--
9 files changed, 233 insertions(+), 243 deletions(-)
diff --git a/journal/importers/douban.py b/journal/importers/douban.py
index b89a8059..0e4c0eb7 100644
--- a/journal/importers/douban.py
+++ b/journal/importers/douban.py
@@ -2,14 +2,11 @@ import os
import re
from datetime import datetime
-import django_rq
import openpyxl
import pytz
-from auditlog.context import set_actor
from django.conf import settings
from loguru import logger
from markdownify import markdownify as md
-from user_messages import api as msg
from catalog.common import *
from catalog.common.downloaders import *
@@ -17,6 +14,7 @@ from catalog.models import *
from catalog.sites.douban import DoubanDownloader
from common.utils import GenerateDateUUIDMediaFilePath
from journal.models import *
+from users.models import Task
_tz_sh = pytz.timezone("Asia/Shanghai")
@@ -40,77 +38,22 @@ def _fetch_remote_image(url):
return url
-class DoubanImporter:
- total = 0
- processed = 0
- skipped = 0
- imported = 0
- failed = []
- visibility = 0
- mode = 0
- file = ""
+class DoubanImporter(Task):
+ class Meta:
+ app_label = "journal" # workaround for a bug in TypedModel
- def __init__(self, user, visibility, mode):
- self.user = user
- self.visibility = visibility
- self.mode = mode
-
- def update_user_import_status(self, status):
- self.user.preference.import_status["douban_pending"] = status
- self.user.preference.import_status["douban_file"] = self.file
- self.user.preference.import_status["douban_visibility"] = self.visibility
- self.user.preference.import_status["douban_mode"] = self.mode
- self.user.preference.import_status["douban_total"] = self.total
- self.user.preference.import_status["douban_processed"] = self.processed
- self.user.preference.import_status["douban_skipped"] = self.skipped
- self.user.preference.import_status["douban_imported"] = self.imported
- self.user.preference.import_status["douban_failed"] = self.failed
- self.user.preference.save(update_fields=["import_status"])
-
- @classmethod
- def reset(cls, user):
- user.preference.import_status["douban_pending"] = 0
- user.preference.save(update_fields=["import_status"])
-
- @classmethod
- def redo(cls, user):
- file = user.preference.import_status["douban_file"]
- imp = cls(
- user,
- user.preference.import_status["douban_visibility"],
- user.preference.import_status["douban_mode"],
- )
- imp.file = file
- jid = f"Douban_{user.id}_{os.path.basename(file)}_redo"
- django_rq.get_queue("import").enqueue(imp.import_from_file_task, job_id=jid)
-
- def import_from_file(self, uploaded_file):
- try:
- wb = openpyxl.open(
- uploaded_file, read_only=True, data_only=True, keep_links=False
- )
- wb.close()
- file = (
- settings.MEDIA_ROOT
- + "/"
- + GenerateDateUUIDMediaFilePath("x.xlsx", settings.SYNC_FILE_PATH_ROOT)
- )
- os.makedirs(os.path.dirname(file), exist_ok=True)
- with open(file, "wb") as destination:
- for chunk in uploaded_file.chunks():
- destination.write(chunk)
- self.file = file
- self.update_user_import_status(2)
- jid = f"Douban_{self.user.id}_{os.path.basename(self.file)}"
- django_rq.get_queue("import").enqueue(
- self.import_from_file_task, job_id=jid
- )
- except Exception as e:
- logger.error(
- f"unable to enqueue import {uploaded_file}", extra={"exception": e}
- )
- return False
- return True
+ TaskQueue = "import"
+ DefaultMetadata = {
+ "total": 0,
+ "processed": 0,
+ "skipped": 0,
+ "imported": 0,
+ "failed": 0,
+ "mode": 0,
+ "visibility": 0,
+ "failed_urls": [],
+ "file": None,
+ }
mark_sheet_config = {
"想读": [ShelfType.WISHLIST],
@@ -135,13 +78,30 @@ class DoubanImporter:
"剧评": [Performance],
"游戏评论&攻略": [Game],
}
+
+ @classmethod
+ def validate_file(cls, uploaded_file):
+ try:
+ wb = openpyxl.open(
+ uploaded_file, read_only=True, data_only=True, keep_links=False
+ )
+ sheets = cls.mark_sheet_config.keys() | cls.review_sheet_config.keys()
+ for name in sheets:
+ if name in wb:
+ return True
+ except Exception as e:
+ logger.error(
+ f"unable to validate excel file {uploaded_file}", extra={"exception": e}
+ )
+ return False
+
mark_data = {}
review_data = {}
entity_lookup = {}
def load_sheets(self):
"""Load data into mark_data / review_data / entity_lookup"""
- f = open(self.file, "rb")
+ f = open(self.metadata["file"], "rb")
wb = openpyxl.load_workbook(f, read_only=True, data_only=True, keep_links=False)
for data, config in [
(self.mark_data, self.mark_sheet_config),
@@ -164,8 +124,9 @@ class DoubanImporter:
self.entity_lookup[k].append(v)
else:
self.entity_lookup[k] = [v]
- self.total = sum(map(lambda a: len(a), self.mark_data.values()))
- self.total += sum(map(lambda a: len(a), self.review_data.values()))
+ self.metadata["total"] = sum(map(lambda a: len(a), self.mark_data.values()))
+ self.metadata["total"] += sum(map(lambda a: len(a), self.review_data.values()))
+ self.save()
def guess_entity_url(self, title, rating, timestamp):
k = f"{title}|{rating}"
@@ -189,28 +150,20 @@ class DoubanImporter:
# if cells[0] == title and cells[5] == rating:
# return cells[3]
- def import_from_file_task(self):
+ def run(self):
logger.info(f"{self.user} import start")
- msg.info(self.user, f"开始导入豆瓣标记和评论")
- self.update_user_import_status(1)
- with set_actor(self.user):
- self.load_sheets()
- logger.info(f"{self.user} sheet loaded, {self.total} lines total")
- self.update_user_import_status(1)
- for name, param in self.mark_sheet_config.items():
- self.import_mark_sheet(self.mark_data[name], param[0], name)
- for name, param in self.review_sheet_config.items():
- self.import_review_sheet(self.review_data[name], name)
- self.update_user_import_status(0)
- msg.success(
- self.user,
- f"豆瓣标记和评论导入完成,共处理{self.total}篇,已存在{self.skipped}篇,新增{self.imported}篇。",
- )
- if len(self.failed):
- msg.error(
- self.user,
- f'豆瓣评论导入时未能处理以下网址:\n{" , ".join(self.failed)}',
+ self.load_sheets()
+ logger.info(f"{self.user} sheet loaded, {self.metadata['total']} lines total")
+ for name, param in self.mark_sheet_config.items():
+ self.import_mark_sheet(self.mark_data[name], param[0], name)
+ for name, param in self.review_sheet_config.items():
+ self.import_review_sheet(self.review_data[name], name)
+ self.message = f"豆瓣标记和评论导入完成,共处理{self.metadata['total']}篇,已存在{self.metadata['skipped']}篇,新增{self.metadata['imported']}篇。"
+ if len(self.metadata["failed_urls"]) > 0:
+ self.message += (
+ f'导入时未能处理以下网址:\n{" , ".join(self.metadata["failed_urls"])}'
)
+ self.save()
def import_mark_sheet(self, worksheet, shelf_type, sheet_name):
prefix = f"{self.user} {sheet_name}|"
@@ -234,7 +187,7 @@ class DoubanImporter:
except Exception:
tags = []
comment = cells[7] if len(cells) >= 8 else None
- self.processed += 1
+ self.metadata["processed"] += 1
try:
if type(time) == str:
time = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
@@ -243,10 +196,10 @@ class DoubanImporter:
time = None
r = self.import_mark(url, shelf_type, comment, rating_grade, tags, time)
if r == 1:
- self.imported += 1
+ self.metadata["imported"] += 1
elif r == 2:
- self.skipped += 1
- self.update_user_import_status(1)
+ self.metadata["skipped"] += 1
+ self.save()
def import_mark(self, url, shelf_type, comment, rating_grade, tags, time):
"""
@@ -257,7 +210,7 @@ class DoubanImporter:
logger.warning(f"{self.user} | match/fetch {url} failed")
return
mark = Mark(self.user.identity, item)
- if self.mode == 0 and (
+ if self.metadata["mode"] == 0 and (
mark.shelf_type == shelf_type
or mark.shelf_type == ShelfType.COMPLETE
or (
@@ -268,7 +221,12 @@ class DoubanImporter:
print("-", end="", flush=True)
return 2
mark.update(
- shelf_type, comment, rating_grade, tags, self.visibility, created_time=time
+ shelf_type,
+ comment,
+ rating_grade,
+ tags,
+ self.metadata["visibility"],
+ created_time=time,
)
print("+", end="", flush=True)
return 1
@@ -289,7 +247,7 @@ class DoubanImporter:
time = cells[3]
rating = cells[4]
content = cells[6]
- self.processed += 1
+ self.metadata["processed"] += 1
if time:
if type(time) == str:
time = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
@@ -304,12 +262,12 @@ class DoubanImporter:
entity_title, rating, title, review_url, content, time
)
if r == 1:
- self.imported += 1
+ self.metadata["imported"] += 1
elif r == 2:
- self.skipped += 1
+ self.metadata["skipped"] += 1
else:
- self.failed.append(review_url)
- self.update_user_import_status(1)
+ self.metadata["failed_urls"].append(review_url)
+ self.save()
def get_item_by_url(self, url):
item = None
@@ -337,7 +295,7 @@ class DoubanImporter:
except Exception as e:
logger.error(f"fetching error: {url}", extra={"exception": e})
if item is None:
- self.failed.append(str(url))
+ self.metadata["failed_urls"].append(str(url))
return item
def import_review(self, entity_title, rating, title, review_url, content, time):
@@ -367,7 +325,7 @@ class DoubanImporter:
logger.warning(f"{prefix} match/fetch {url} failed")
return
if (
- self.mode == 1
+ self.metadata["mode"] == 1
and Review.objects.filter(owner=self.user.identity, item=item).exists()
):
return 2
@@ -387,7 +345,7 @@ class DoubanImporter:
"edited_time": time,
"title": title,
"body": content,
- "visibility": self.visibility,
+ "visibility": self.metadata["visibility"],
}
try:
Review.objects.update_or_create(
diff --git a/journal/importers/goodreads.py b/journal/importers/goodreads.py
index ca994364..7a68b362 100644
--- a/journal/importers/goodreads.py
+++ b/journal/importers/goodreads.py
@@ -1,16 +1,14 @@
import re
from datetime import datetime
-import django_rq
-from auditlog.context import set_actor
from django.utils import timezone
from django.utils.timezone import make_aware
-from user_messages import api as msg
from catalog.common import *
from catalog.common.downloaders import *
from catalog.models import *
from journal.models import *
+from users.models import Task
re_list = r"^https://www\.goodreads\.com/list/show/\d+"
re_shelf = r"^https://www\.goodreads\.com/review/list/\d+[^\?]*\?shelf=[^&]+"
@@ -24,93 +22,104 @@ gr_rating = {
}
-class GoodreadsImporter:
+class GoodreadsImporter(Task):
+ class Meta:
+ app_label = "journal" # workaround for a bug in TypedModel
+
+ TaskQueue = "import"
+ DefaultMetadata = {
+ "total": 0,
+ "processed": 0,
+ "skipped": 0,
+ "imported": 0,
+ "failed": 0,
+ "visibility": 0,
+ "failed_urls": [],
+ "url": None,
+ }
+
@classmethod
- def import_from_url(cls, raw_url, user):
+ def validate_url(cls, raw_url):
match_list = re.match(re_list, raw_url)
match_shelf = re.match(re_shelf, raw_url)
match_profile = re.match(re_profile, raw_url)
if match_profile or match_shelf or match_list:
- django_rq.get_queue("import").enqueue(
- cls.import_from_url_task, raw_url, user
- )
return True
else:
return False
- @classmethod
- def import_from_url_task(cls, url, user):
+ def run(self):
+ url = self.metadata["url"]
+ user = self.user
match_list = re.match(re_list, url)
match_shelf = re.match(re_shelf, url)
match_profile = re.match(re_profile, url)
total = 0
visibility = user.preference.default_visibility
- with set_actor(user):
- shelf = None
- if match_shelf:
- shelf = cls.parse_shelf(match_shelf[0], user)
- elif match_list:
- shelf = cls.parse_list(match_list[0], user)
- if shelf:
- if shelf["title"] and shelf["books"]:
- collection = Collection.objects.create(
- title=shelf["title"],
- brief=shelf["description"]
- + "\n\nImported from [Goodreads]("
- + url
- + ")",
- owner=user.identity,
- )
- for book in shelf["books"]:
- collection.append_item(book["book"], note=book["review"])
- total += 1
- collection.save()
- msg.success(
- user,
- f'Imported {total} books from Goodreads as a Collection {shelf["title"]}.',
+ shelf = None
+ if match_shelf:
+ shelf = self.parse_shelf(match_shelf[0])
+ elif match_list:
+ shelf = self.parse_list(match_list[0])
+ if shelf:
+ if shelf["title"] and shelf["books"]:
+ collection = Collection.objects.create(
+ title=shelf["title"],
+ brief=shelf["description"]
+ + "\n\nImported from [Goodreads]("
+ + url
+ + ")",
+ owner=user.identity,
)
- elif match_profile:
- uid = match_profile[1]
- shelves = {
- ShelfType.WISHLIST: f"https://www.goodreads.com/review/list/{uid}?shelf=to-read",
- ShelfType.PROGRESS: f"https://www.goodreads.com/review/list/{uid}?shelf=currently-reading",
- ShelfType.COMPLETE: f"https://www.goodreads.com/review/list/{uid}?shelf=read",
- }
- for shelf_type in shelves:
- shelf_url = shelves.get(shelf_type)
- shelf = cls.parse_shelf(shelf_url, user)
- for book in shelf["books"]:
- mark = Mark(user.identity, book["book"])
- if (
- (
- mark.shelf_type == shelf_type
- and mark.comment_text == book["review"]
- )
- or (
- mark.shelf_type == ShelfType.COMPLETE
- and shelf_type != ShelfType.COMPLETE
- )
- or (
- mark.shelf_type == ShelfType.PROGRESS
- and shelf_type == ShelfType.WISHLIST
- )
- ):
- print(
- f'Skip {shelf_type}/{book["book"]} bc it was marked {mark.shelf_type}'
- )
- else:
- mark.update(
- shelf_type,
- book["review"],
- book["rating"],
- visibility=visibility,
- created_time=book["last_updated"] or timezone.now(),
- )
- total += 1
- msg.success(user, f"Imported {total} records from Goodreads profile.")
+ for book in shelf["books"]:
+ collection.append_item(book["book"], note=book["review"])
+ total += 1
+ collection.save()
+ self.message = f'Imported {total} books from Goodreads as a Collection {shelf["title"]}.'
+ elif match_profile:
+ uid = match_profile[1]
+ shelves = {
+ ShelfType.WISHLIST: f"https://www.goodreads.com/review/list/{uid}?shelf=to-read",
+ ShelfType.PROGRESS: f"https://www.goodreads.com/review/list/{uid}?shelf=currently-reading",
+ ShelfType.COMPLETE: f"https://www.goodreads.com/review/list/{uid}?shelf=read",
+ }
+ for shelf_type in shelves:
+ shelf_url = shelves.get(shelf_type)
+ shelf = self.parse_shelf(shelf_url)
+ for book in shelf["books"]:
+ mark = Mark(user.identity, book["book"])
+ if (
+ (
+ mark.shelf_type == shelf_type
+ and mark.comment_text == book["review"]
+ )
+ or (
+ mark.shelf_type == ShelfType.COMPLETE
+ and shelf_type != ShelfType.COMPLETE
+ )
+ or (
+ mark.shelf_type == ShelfType.PROGRESS
+ and shelf_type == ShelfType.WISHLIST
+ )
+ ):
+ print(
+ f'Skip {shelf_type}/{book["book"]} bc it was marked {mark.shelf_type}'
+ )
+ else:
+ mark.update(
+ shelf_type,
+ book["review"],
+ book["rating"],
+ visibility=visibility,
+ created_time=book["last_updated"] or timezone.now(),
+ )
+ total += 1
+ self.message = f"Imported {total} records from Goodreads profile."
+ self.metadata["total"] = total
+ self.save()
@classmethod
- def get_book(cls, url, user):
+ def get_book(cls, url):
site = SiteManager.get_site_by_url(url)
if site:
book = site.get_item()
@@ -121,7 +130,7 @@ class GoodreadsImporter:
return book
@classmethod
- def parse_shelf(cls, url, user):
+ def parse_shelf(cls, url):
# return {'title': 'abc', books: [{'book': obj, 'rating': 10, 'review': 'txt'}, ...]}
title = ""
books = []
@@ -194,7 +203,7 @@ class GoodreadsImporter:
except Exception:
print(f"Error loading/parsing review{url_review}, ignored")
try:
- book = cls.get_book(url_book, user)
+ book = cls.get_book(url_book)
books.append(
{
"url": url_book,
@@ -216,7 +225,7 @@ class GoodreadsImporter:
return {"title": title, "description": "", "books": books}
@classmethod
- def parse_list(cls, url, user):
+ def parse_list(cls, url):
# return {'title': 'abc', books: [{'book': obj, 'rating': 10, 'review': 'txt'}, ...]}
title = ""
description = ""
@@ -237,7 +246,7 @@ class GoodreadsImporter:
for link in links: # type:ignore
url_book = "https://www.goodreads.com" + link
try:
- book = cls.get_book(url_book, user)
+ book = cls.get_book(url_book)
books.append(
{
"url": url_book,
diff --git a/journal/management/commands/journal.py b/journal/management/commands/journal.py
index fd8645a0..b9f49dc4 100644
--- a/journal/management/commands/journal.py
+++ b/journal/management/commands/journal.py
@@ -25,16 +25,6 @@ class Command(BaseCommand):
action="store_true",
help="purge invalid data (visibility=99)",
)
- parser.add_argument(
- "--douban-import-redo",
- action="store",
- help="reimport for user id",
- )
- parser.add_argument(
- "--douban-import-reset",
- action="store",
- help="reset for user id",
- )
parser.add_argument(
"--integrity",
action="store_true",
@@ -66,14 +56,4 @@ class Command(BaseCommand):
self.stdout.write(f"Cleaning up {cls}...")
cls.objects.filter(visibility=99).delete()
- if options["douban_import_redo"]:
- user = User.objects.get(pk=options["douban_import_redo"])
- self.stdout.write(f"Redo import for {user}...")
- DoubanImporter.redo(user)
-
- if options["douban_import_reset"]:
- user = User.objects.get(pk=options["douban_import_reset"])
- self.stdout.write(f"Reset import for {user}...")
- DoubanImporter.reset(user)
-
self.stdout.write(self.style.SUCCESS(f"Done."))
diff --git a/journal/migrations/0004_tasks.py b/journal/migrations/0004_tasks.py
index bcb2c854..682282b2 100644
--- a/journal/migrations/0004_tasks.py
+++ b/journal/migrations/0004_tasks.py
@@ -4,7 +4,6 @@ from django.db import migrations
class Migration(migrations.Migration):
-
dependencies = [
("users", "0006_alter_task_type"),
("journal", "0003_note_progress"),
@@ -21,4 +20,24 @@ class Migration(migrations.Migration):
},
bases=("users.task",),
),
+ migrations.CreateModel(
+ name="DoubanImporter",
+ fields=[],
+ options={
+ "proxy": True,
+ "indexes": [],
+ "constraints": [],
+ },
+ bases=("users.task",),
+ ),
+ migrations.CreateModel(
+ name="GoodreadsImporter",
+ fields=[],
+ options={
+ "proxy": True,
+ "indexes": [],
+ "constraints": [],
+ },
+ bases=("users.task",),
+ ),
]
diff --git a/users/migrations/0006_alter_task_type.py b/users/migrations/0006_alter_task_type.py
index fabfc827..8fadd54a 100644
--- a/users/migrations/0006_alter_task_type.py
+++ b/users/migrations/0006_alter_task_type.py
@@ -14,7 +14,9 @@ class Migration(migrations.Migration):
name="type",
field=models.CharField(
choices=[
+ ("journal.doubanimporter", "douban importer"),
("journal.doufenexporter", "doufen exporter"),
+ ("journal.goodreadsimporter", "goodreads importer"),
("journal.letterboxdimporter", "letterboxd importer"),
],
db_index=True,
diff --git a/users/models/preference.py b/users/models/preference.py
index 21944a71..a0c7a25a 100644
--- a/users/models/preference.py
+++ b/users/models/preference.py
@@ -21,10 +21,10 @@ class Preference(models.Model):
)
export_status = models.JSONField(
blank=True, null=True, encoder=DjangoJSONEncoder, default=dict
- )
+ ) # deprecated
import_status = models.JSONField(
blank=True, null=True, encoder=DjangoJSONEncoder, default=dict
- )
+ ) # deprecated
# 0: public, 1: follower only, 2: private
default_visibility = models.PositiveSmallIntegerField(null=False, default=0)
# 0: public, 1: unlisted, 4: local
diff --git a/users/templates/users/data.html b/users/templates/users/data.html
index 4ccafede..8277cba6 100644
--- a/users/templates/users/data.html
+++ b/users/templates/users/data.html
@@ -64,7 +64,7 @@
+ {% if import_task.status == "pending" %} onclick="return confirm('{% trans "Another import is in progress, starting a new import may cause issues, sure to import?" %}')" value="{% trans "Import in progress, please wait" %}" {% else %} value="{% trans 'Import' %}" {% endif %} />