modernize importers

This commit is contained in:
Your Name 2024-12-26 10:54:25 -05:00 committed by Henri Dickson
parent c5434b44eb
commit 0e41a1e5ef
9 changed files with 233 additions and 243 deletions

View file

@ -2,14 +2,11 @@ import os
import re
from datetime import datetime
import django_rq
import openpyxl
import pytz
from auditlog.context import set_actor
from django.conf import settings
from loguru import logger
from markdownify import markdownify as md
from user_messages import api as msg
from catalog.common import *
from catalog.common.downloaders import *
@ -17,6 +14,7 @@ from catalog.models import *
from catalog.sites.douban import DoubanDownloader
from common.utils import GenerateDateUUIDMediaFilePath
from journal.models import *
from users.models import Task
_tz_sh = pytz.timezone("Asia/Shanghai")
@ -40,77 +38,22 @@ def _fetch_remote_image(url):
return url
class DoubanImporter:
total = 0
processed = 0
skipped = 0
imported = 0
failed = []
visibility = 0
mode = 0
file = ""
class DoubanImporter(Task):
class Meta:
app_label = "journal" # workaround bug in TypedModel
def __init__(self, user, visibility, mode):
self.user = user
self.visibility = visibility
self.mode = mode
def update_user_import_status(self, status):
self.user.preference.import_status["douban_pending"] = status
self.user.preference.import_status["douban_file"] = self.file
self.user.preference.import_status["douban_visibility"] = self.visibility
self.user.preference.import_status["douban_mode"] = self.mode
self.user.preference.import_status["douban_total"] = self.total
self.user.preference.import_status["douban_processed"] = self.processed
self.user.preference.import_status["douban_skipped"] = self.skipped
self.user.preference.import_status["douban_imported"] = self.imported
self.user.preference.import_status["douban_failed"] = self.failed
self.user.preference.save(update_fields=["import_status"])
@classmethod
def reset(cls, user):
user.preference.import_status["douban_pending"] = 0
user.preference.save(update_fields=["import_status"])
@classmethod
def redo(cls, user):
file = user.preference.import_status["douban_file"]
imp = cls(
user,
user.preference.import_status["douban_visibility"],
user.preference.import_status["douban_mode"],
)
imp.file = file
jid = f"Douban_{user.id}_{os.path.basename(file)}_redo"
django_rq.get_queue("import").enqueue(imp.import_from_file_task, job_id=jid)
def import_from_file(self, uploaded_file):
try:
wb = openpyxl.open(
uploaded_file, read_only=True, data_only=True, keep_links=False
)
wb.close()
file = (
settings.MEDIA_ROOT
+ "/"
+ GenerateDateUUIDMediaFilePath("x.xlsx", settings.SYNC_FILE_PATH_ROOT)
)
os.makedirs(os.path.dirname(file), exist_ok=True)
with open(file, "wb") as destination:
for chunk in uploaded_file.chunks():
destination.write(chunk)
self.file = file
self.update_user_import_status(2)
jid = f"Douban_{self.user.id}_{os.path.basename(self.file)}"
django_rq.get_queue("import").enqueue(
self.import_from_file_task, job_id=jid
)
except Exception as e:
logger.error(
f"unable to enqueue import {uploaded_file}", extra={"exception": e}
)
return False
return True
# Queue name for this task type; presumably consumed by the Task base class
# when the task is enqueued -- confirm in users.models.Task.
TaskQueue = "import"
# Initial contents of the task's `metadata` dict. The importer updates these
# counters as it runs and saves the task so progress can be displayed.
# NOTE(review): "failed_urls" is a mutable list inside a class-level dict;
# confirm Task.create() gives every task its own copy rather than sharing it.
DefaultMetadata = {
# number of rows found across all mark and review sheets (set by load_sheets)
"total": 0,
# rows handled so far, incremented once per row by the import_*_sheet methods
"processed": 0,
# rows for which import_mark / import_review returned 2 (already present)
"skipped": 0,
# rows for which import_mark / import_review returned 1 (newly written)
"imported": 0,
# NOTE(review): never updated in the visible code; failures are collected in
# "failed_urls" instead -- confirm whether this counter is still needed.
"failed": 0,
# import mode from the upload form ("import_mode"); compared against 0 in
# import_mark and against 1 in import_review to decide whether to skip
"mode": 0,
# visibility applied to the marks and reviews written by this import
"visibility": 0,
# URLs that could not be matched/fetched or whose review was not imported
"failed_urls": [],
# filesystem path of the uploaded workbook, opened by load_sheets
"file": None,
}
mark_sheet_config = {
"想读": [ShelfType.WISHLIST],
@ -135,13 +78,30 @@ class DoubanImporter:
"剧评": [Performance],
"游戏评论&攻略": [Game],
}
@classmethod
def validate_file(cls, uploaded_file):
    """Return True if ``uploaded_file`` looks like a Douban export workbook.

    The file is accepted when it can be opened by openpyxl and contains at
    least one sheet whose name is a key of ``mark_sheet_config`` or
    ``review_sheet_config``.

    Args:
        uploaded_file: a path or binary file-like object accepted by
            ``openpyxl.open``.

    Returns:
        bool: True when a known sheet is present; False when none is found
        or the file cannot be read as a workbook (the error is logged).
    """
    wb = None
    try:
        wb = openpyxl.open(
            uploaded_file, read_only=True, data_only=True, keep_links=False
        )
        sheets = cls.mark_sheet_config.keys() | cls.review_sheet_config.keys()
        return any(name in wb for name in sheets)
    except Exception as e:
        # Any failure to open or inspect the file means it is not importable.
        logger.error(
            f"unable to validate excel file {uploaded_file}", extra={"exception": e}
        )
        return False
    finally:
        # Read-only workbooks load lazily and keep the archive open until
        # explicitly closed; release it on every exit path.
        if wb is not None:
            wb.close()
mark_data = {}
review_data = {}
entity_lookup = {}
def load_sheets(self):
"""Load data into mark_data / review_data / entity_lookup"""
f = open(self.file, "rb")
f = open(self.metadata["file"], "rb")
wb = openpyxl.load_workbook(f, read_only=True, data_only=True, keep_links=False)
for data, config in [
(self.mark_data, self.mark_sheet_config),
@ -164,8 +124,9 @@ class DoubanImporter:
self.entity_lookup[k].append(v)
else:
self.entity_lookup[k] = [v]
self.total = sum(map(lambda a: len(a), self.mark_data.values()))
self.total += sum(map(lambda a: len(a), self.review_data.values()))
self.metadata["total"] = sum(map(lambda a: len(a), self.mark_data.values()))
self.metadata["total"] += sum(map(lambda a: len(a), self.review_data.values()))
self.save()
def guess_entity_url(self, title, rating, timestamp):
k = f"{title}|{rating}"
@ -189,28 +150,20 @@ class DoubanImporter:
# if cells[0] == title and cells[5] == rating:
# return cells[3]
def import_from_file_task(self):
def run(self):
logger.info(f"{self.user} import start")
msg.info(self.user, f"开始导入豆瓣标记和评论")
self.update_user_import_status(1)
with set_actor(self.user):
self.load_sheets()
logger.info(f"{self.user} sheet loaded, {self.total} lines total")
self.update_user_import_status(1)
for name, param in self.mark_sheet_config.items():
self.import_mark_sheet(self.mark_data[name], param[0], name)
for name, param in self.review_sheet_config.items():
self.import_review_sheet(self.review_data[name], name)
self.update_user_import_status(0)
msg.success(
self.user,
f"豆瓣标记和评论导入完成,共处理{self.total}篇,已存在{self.skipped}篇,新增{self.imported}篇。",
)
if len(self.failed):
msg.error(
self.user,
f'豆瓣评论导入时未能处理以下网址:\n{" , ".join(self.failed)}',
self.load_sheets()
logger.info(f"{self.user} sheet loaded, {self.metadata['total']} lines total")
for name, param in self.mark_sheet_config.items():
self.import_mark_sheet(self.mark_data[name], param[0], name)
for name, param in self.review_sheet_config.items():
self.import_review_sheet(self.review_data[name], name)
self.message = f"豆瓣标记和评论导入完成,共处理{self.metadata['total']}篇,已存在{self.metadata['skipped']}篇,新增{self.metadata['imported']}篇。"
if len(self.metadata["failed_urls"]) > 0:
self.message += (
f'导入时未能处理以下网址:\n{" , ".join(self.metadata["failed_urls"])}'
)
self.save()
def import_mark_sheet(self, worksheet, shelf_type, sheet_name):
prefix = f"{self.user} {sheet_name}|"
@ -234,7 +187,7 @@ class DoubanImporter:
except Exception:
tags = []
comment = cells[7] if len(cells) >= 8 else None
self.processed += 1
self.metadata["processed"] += 1
try:
if type(time) == str:
time = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
@ -243,10 +196,10 @@ class DoubanImporter:
time = None
r = self.import_mark(url, shelf_type, comment, rating_grade, tags, time)
if r == 1:
self.imported += 1
self.metadata["imported"] += 1
elif r == 2:
self.skipped += 1
self.update_user_import_status(1)
self.metadata["skipped"] += 1
self.save()
def import_mark(self, url, shelf_type, comment, rating_grade, tags, time):
"""
@ -257,7 +210,7 @@ class DoubanImporter:
logger.warning(f"{self.user} | match/fetch {url} failed")
return
mark = Mark(self.user.identity, item)
if self.mode == 0 and (
if self.metadata["mode"] == 0 and (
mark.shelf_type == shelf_type
or mark.shelf_type == ShelfType.COMPLETE
or (
@ -268,7 +221,12 @@ class DoubanImporter:
print("-", end="", flush=True)
return 2
mark.update(
shelf_type, comment, rating_grade, tags, self.visibility, created_time=time
shelf_type,
comment,
rating_grade,
tags,
self.metadata["visibility"],
created_time=time,
)
print("+", end="", flush=True)
return 1
@ -289,7 +247,7 @@ class DoubanImporter:
time = cells[3]
rating = cells[4]
content = cells[6]
self.processed += 1
self.metadata["processed"] += 1
if time:
if type(time) == str:
time = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
@ -304,12 +262,12 @@ class DoubanImporter:
entity_title, rating, title, review_url, content, time
)
if r == 1:
self.imported += 1
self.metadata["imported"] += 1
elif r == 2:
self.skipped += 1
self.metadata["skipped"] += 1
else:
self.failed.append(review_url)
self.update_user_import_status(1)
self.metadata["failed_urls"].append(review_url)
self.save()
def get_item_by_url(self, url):
item = None
@ -337,7 +295,7 @@ class DoubanImporter:
except Exception as e:
logger.error(f"fetching error: {url}", extra={"exception": e})
if item is None:
self.failed.append(str(url))
self.metadata["failed_urls"].append(str(url))
return item
def import_review(self, entity_title, rating, title, review_url, content, time):
@ -367,7 +325,7 @@ class DoubanImporter:
logger.warning(f"{prefix} match/fetch {url} failed")
return
if (
self.mode == 1
self.metadata["mode"] == 1
and Review.objects.filter(owner=self.user.identity, item=item).exists()
):
return 2
@ -387,7 +345,7 @@ class DoubanImporter:
"edited_time": time,
"title": title,
"body": content,
"visibility": self.visibility,
"visibility": self.metadata["visibility"],
}
try:
Review.objects.update_or_create(

View file

@ -1,16 +1,14 @@
import re
from datetime import datetime
import django_rq
from auditlog.context import set_actor
from django.utils import timezone
from django.utils.timezone import make_aware
from user_messages import api as msg
from catalog.common import *
from catalog.common.downloaders import *
from catalog.models import *
from journal.models import *
from users.models import Task
re_list = r"^https://www\.goodreads\.com/list/show/\d+"
re_shelf = r"^https://www\.goodreads\.com/review/list/\d+[^\?]*\?shelf=[^&]+"
@ -24,93 +22,104 @@ gr_rating = {
}
class GoodreadsImporter:
class GoodreadsImporter(Task):
class Meta:
app_label = "journal" # workaround bug in TypedModel
# Queue name for this task type; presumably consumed by the Task base class
# when the task is enqueued -- confirm in users.models.Task.
TaskQueue = "import"
# Initial contents of the task's `metadata` dict.
# NOTE(review): "failed_urls" is a mutable list inside a class-level dict;
# confirm Task.create() gives every task its own copy rather than sharing it.
DefaultMetadata = {
# number of books/records imported; written once at the end of run()
"total": 0,
# NOTE(review): "processed", "skipped", "imported", "failed" and
# "failed_urls" are not written by the visible run(); confirm they are needed.
"processed": 0,
"skipped": 0,
"imported": 0,
"failed": 0,
# NOTE(review): run() takes visibility from user.preference.default_visibility
# and does not read this value -- confirm which one is intended.
"visibility": 0,
"failed_urls": [],
# Goodreads list, shelf or profile URL to import; read at the start of run()
"url": None,
}
@classmethod
def import_from_url(cls, raw_url, user):
def validate_url(cls, raw_url):
match_list = re.match(re_list, raw_url)
match_shelf = re.match(re_shelf, raw_url)
match_profile = re.match(re_profile, raw_url)
if match_profile or match_shelf or match_list:
django_rq.get_queue("import").enqueue(
cls.import_from_url_task, raw_url, user
)
return True
else:
return False
@classmethod
def import_from_url_task(cls, url, user):
def run(self):
url = self.metadata["url"]
user = self.user
match_list = re.match(re_list, url)
match_shelf = re.match(re_shelf, url)
match_profile = re.match(re_profile, url)
total = 0
visibility = user.preference.default_visibility
with set_actor(user):
shelf = None
if match_shelf:
shelf = cls.parse_shelf(match_shelf[0], user)
elif match_list:
shelf = cls.parse_list(match_list[0], user)
if shelf:
if shelf["title"] and shelf["books"]:
collection = Collection.objects.create(
title=shelf["title"],
brief=shelf["description"]
+ "\n\nImported from [Goodreads]("
+ url
+ ")",
owner=user.identity,
)
for book in shelf["books"]:
collection.append_item(book["book"], note=book["review"])
total += 1
collection.save()
msg.success(
user,
f'Imported {total} books from Goodreads as a Collection {shelf["title"]}.',
shelf = None
if match_shelf:
shelf = self.parse_shelf(match_shelf[0])
elif match_list:
shelf = self.parse_list(match_list[0])
if shelf:
if shelf["title"] and shelf["books"]:
collection = Collection.objects.create(
title=shelf["title"],
brief=shelf["description"]
+ "\n\nImported from [Goodreads]("
+ url
+ ")",
owner=user.identity,
)
elif match_profile:
uid = match_profile[1]
shelves = {
ShelfType.WISHLIST: f"https://www.goodreads.com/review/list/{uid}?shelf=to-read",
ShelfType.PROGRESS: f"https://www.goodreads.com/review/list/{uid}?shelf=currently-reading",
ShelfType.COMPLETE: f"https://www.goodreads.com/review/list/{uid}?shelf=read",
}
for shelf_type in shelves:
shelf_url = shelves.get(shelf_type)
shelf = cls.parse_shelf(shelf_url, user)
for book in shelf["books"]:
mark = Mark(user.identity, book["book"])
if (
(
mark.shelf_type == shelf_type
and mark.comment_text == book["review"]
)
or (
mark.shelf_type == ShelfType.COMPLETE
and shelf_type != ShelfType.COMPLETE
)
or (
mark.shelf_type == ShelfType.PROGRESS
and shelf_type == ShelfType.WISHLIST
)
):
print(
f'Skip {shelf_type}/{book["book"]} bc it was marked {mark.shelf_type}'
)
else:
mark.update(
shelf_type,
book["review"],
book["rating"],
visibility=visibility,
created_time=book["last_updated"] or timezone.now(),
)
total += 1
msg.success(user, f"Imported {total} records from Goodreads profile.")
for book in shelf["books"]:
collection.append_item(book["book"], note=book["review"])
total += 1
collection.save()
self.message = f'Imported {total} books from Goodreads as a Collection {shelf["title"]}.'
elif match_profile:
uid = match_profile[1]
shelves = {
ShelfType.WISHLIST: f"https://www.goodreads.com/review/list/{uid}?shelf=to-read",
ShelfType.PROGRESS: f"https://www.goodreads.com/review/list/{uid}?shelf=currently-reading",
ShelfType.COMPLETE: f"https://www.goodreads.com/review/list/{uid}?shelf=read",
}
for shelf_type in shelves:
shelf_url = shelves.get(shelf_type)
shelf = self.parse_shelf(shelf_url)
for book in shelf["books"]:
mark = Mark(user.identity, book["book"])
if (
(
mark.shelf_type == shelf_type
and mark.comment_text == book["review"]
)
or (
mark.shelf_type == ShelfType.COMPLETE
and shelf_type != ShelfType.COMPLETE
)
or (
mark.shelf_type == ShelfType.PROGRESS
and shelf_type == ShelfType.WISHLIST
)
):
print(
f'Skip {shelf_type}/{book["book"]} bc it was marked {mark.shelf_type}'
)
else:
mark.update(
shelf_type,
book["review"],
book["rating"],
visibility=visibility,
created_time=book["last_updated"] or timezone.now(),
)
total += 1
self.message = f"Imported {total} records from Goodreads profile."
self.metadata["total"] = total
self.save()
@classmethod
def get_book(cls, url, user):
def get_book(cls, url):
site = SiteManager.get_site_by_url(url)
if site:
book = site.get_item()
@ -121,7 +130,7 @@ class GoodreadsImporter:
return book
@classmethod
def parse_shelf(cls, url, user):
def parse_shelf(cls, url):
# return {'title': 'abc', books: [{'book': obj, 'rating': 10, 'review': 'txt'}, ...]}
title = ""
books = []
@ -194,7 +203,7 @@ class GoodreadsImporter:
except Exception:
print(f"Error loading/parsing review{url_review}, ignored")
try:
book = cls.get_book(url_book, user)
book = cls.get_book(url_book)
books.append(
{
"url": url_book,
@ -216,7 +225,7 @@ class GoodreadsImporter:
return {"title": title, "description": "", "books": books}
@classmethod
def parse_list(cls, url, user):
def parse_list(cls, url):
# return {'title': 'abc', books: [{'book': obj, 'rating': 10, 'review': 'txt'}, ...]}
title = ""
description = ""
@ -237,7 +246,7 @@ class GoodreadsImporter:
for link in links: # type:ignore
url_book = "https://www.goodreads.com" + link
try:
book = cls.get_book(url_book, user)
book = cls.get_book(url_book)
books.append(
{
"url": url_book,

View file

@ -25,16 +25,6 @@ class Command(BaseCommand):
action="store_true",
help="purge invalid data (visibility=99)",
)
parser.add_argument(
"--douban-import-redo",
action="store",
help="reimport for user id",
)
parser.add_argument(
"--douban-import-reset",
action="store",
help="reset for user id",
)
parser.add_argument(
"--integrity",
action="store_true",
@ -66,14 +56,4 @@ class Command(BaseCommand):
self.stdout.write(f"Cleaning up {cls}...")
cls.objects.filter(visibility=99).delete()
if options["douban_import_redo"]:
user = User.objects.get(pk=options["douban_import_redo"])
self.stdout.write(f"Redo import for {user}...")
DoubanImporter.redo(user)
if options["douban_import_reset"]:
user = User.objects.get(pk=options["douban_import_reset"])
self.stdout.write(f"Reset import for {user}...")
DoubanImporter.reset(user)
self.stdout.write(self.style.SUCCESS(f"Done."))

View file

@ -4,7 +4,6 @@ from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
("users", "0006_alter_task_type"),
("journal", "0003_note_progress"),
@ -21,4 +20,24 @@ class Migration(migrations.Migration):
},
bases=("users.task",),
),
migrations.CreateModel(
name="DoubanImporter",
fields=[],
options={
"proxy": True,
"indexes": [],
"constraints": [],
},
bases=("users.task",),
),
migrations.CreateModel(
name="GoodreadsImporter",
fields=[],
options={
"proxy": True,
"indexes": [],
"constraints": [],
},
bases=("users.task",),
),
]

View file

@ -14,7 +14,9 @@ class Migration(migrations.Migration):
name="type",
field=models.CharField(
choices=[
("journal.doubanimporter", "douban importer"),
("journal.doufenexporter", "doufen exporter"),
("journal.goodreadsimporter", "goodreads importer"),
("journal.letterboxdimporter", "letterboxd importer"),
],
db_index=True,

View file

@ -21,10 +21,10 @@ class Preference(models.Model):
)
export_status = models.JSONField(
blank=True, null=True, encoder=DjangoJSONEncoder, default=dict
)
) # deprecated
import_status = models.JSONField(
blank=True, null=True, encoder=DjangoJSONEncoder, default=dict
)
) # deprecated
# 0: public, 1: follower only, 2: private
default_visibility = models.PositiveSmallIntegerField(null=False, default=0)
# 0: public, 1: unlisted, 4: local

View file

@ -64,7 +64,7 @@
</label>
</p>
<input type="submit"
{% if import_status.douban_pending %} onclick="return confirm('{% trans "Another import is in progress, starting a new import may cause issues, sure to import?" %}')" value="{% trans "Import in progress, please wait" %}" {% else %} value="{% trans 'Import' %}" {% endif %} />
{% if import_task.status == "pending" %} onclick="return confirm('{% trans "Another import is in progress, starting a new import may cause issues, sure to import?" %}')" value="{% trans "Import in progress, please wait" %}" {% else %} value="{% trans 'Import' %}" {% endif %} />
</form>
<div hx-get="{% url 'users:import_status' %}"
hx-trigger="load delay:1s"
@ -84,6 +84,15 @@
placeholder="https://www.goodreads.com/user/show/12345-janedoe"
required>
<input type="submit" value="{% trans 'Import' %}" />
<small>
{% if goodreads_task %}
<br>
{% trans 'Last import started' %}: {{ goodreads_task.created_time }}
{% trans 'Status' %}: {{ goodreads_task.get_state_display }}。
<br>
{{ goodreads_task.message }}
{% endif %}
</small>
</div>
<ul>
<li>

View file

@ -1,19 +1,15 @@
{% load i18n %}
{% if import_status.douban_pending == 2 %}
正在等待
{% elif import_status.douban_pending == 1 %}
<div hx-get="{% url 'users:import_status' %} "
hx-trigger="every 15s"
hx-swap="outerHTML">
正在导入
{% if import_status.douban_total %}
<br>
<progress value="{{ import_status.douban_processed }}"
max="{{ import_status.douban_total }}"></progress>
共{{ import_status.douban_total }}篇,目前已处理{{ import_status.douban_processed }}篇,其中已存在{{ import_status.douban_skipped }}篇,新增{{ import_status.douban_imported }}篇
{% endif %}
</div>
{% elif import_status.douban_file %}
上次结果
共计{{ import_status.douban_total }}篇,处理{{ import_status.douban_processed }}篇,其中已存在{{ import_status.douban_skipped }}篇,新增{{ import_status.douban_imported }}篇
{% trans 'Last import started' %}: {{ import_task.created_time }}
{% trans 'Status' %}: {{ import_task.get_state_display }}。
{% if import_task.metadata.total %}
<br>
<progress value="{{ import_task.metadata.processed }}"
max="{{ import_task.metadata.total }}"></progress>
共{{ import_task.metadata.total }}篇,已处理{{ import_task.metadata.processed }}篇,其中已存在{{ import_task.metadata.skipped }}篇,新增{{ import_task.metadata.imported }}篇
<br>
{% if import_task.metadata.failed_urls %}
{% trans 'Failed links, you may have to mark them manually' %}
<br>
<textarea readonly>{% for url in import_task.metadata.failed_urls %}{{url}}&#10;{% endfor %}</textarea>
{% endif %}
{% endif %}

View file

@ -95,9 +95,10 @@ def data(request):
"users/data.html",
{
"allow_any_site": settings.MASTODON_ALLOW_ANY_SITE,
"import_status": request.user.preference.import_status,
"import_task": DoubanImporter.latest_task(request.user),
"export_task": DoufenExporter.latest_task(request.user),
"letterboxd_task": LetterboxdImporter.latest_task(request.user),
"goodreads_task": GoodreadsImporter.latest_task(request.user),
"years": years,
},
)
@ -109,7 +110,7 @@ def data_import_status(request):
request,
"users/data_import_status.html",
{
"import_status": request.user.preference.import_status,
"import_task": DoubanImporter.latest_task(request.user),
},
)
@ -185,7 +186,12 @@ def reset_visibility(request):
def import_goodreads(request):
if request.method == "POST":
raw_url = request.POST.get("url")
if GoodreadsImporter.import_from_url(raw_url, request.user):
if GoodreadsImporter.validate_url(raw_url):
GoodreadsImporter.create(
request.user,
visibility=int(request.POST.get("visibility", 0)),
url=raw_url,
).enqueue()
messages.add_message(request, messages.INFO, _("Import in progress."))
else:
messages.add_message(request, messages.ERROR, _("Invalid URL."))
@ -194,18 +200,29 @@ def import_goodreads(request):
@login_required
def import_douban(request):
if request.method == "POST":
importer = DoubanImporter(
request.user,
int(request.POST.get("visibility", 0)),
int(request.POST.get("import_mode", 0)),
)
if importer.import_from_file(request.FILES["file"]):
messages.add_message(
request, messages.INFO, _("File is uploaded and will be imported soon.")
)
else:
messages.add_message(request, messages.ERROR, _("Invalid file."))
if request.method != "POST":
return redirect(reverse("users:data"))
f = (
settings.MEDIA_ROOT
+ "/"
+ GenerateDateUUIDMediaFilePath("x.zip", settings.SYNC_FILE_PATH_ROOT)
)
os.makedirs(os.path.dirname(f), exist_ok=True)
with open(f, "wb+") as destination:
for chunk in request.FILES["file"].chunks():
destination.write(chunk)
if not DoubanImporter.validate_file(request.FILES["file"]):
messages.add_message(request, messages.ERROR, _("Invalid file."))
return redirect(reverse("users:data"))
DoubanImporter.create(
request.user,
visibility=int(request.POST.get("visibility", 0)),
mode=int(request.POST.get("import_mode", 0)),
file=f,
).enqueue()
messages.add_message(
request, messages.INFO, _("File is uploaded and will be imported soon.")
)
return redirect(reverse("users:data"))