diff --git a/journal/importers/douban.py b/journal/importers/douban.py index b89a8059..0e4c0eb7 100644 --- a/journal/importers/douban.py +++ b/journal/importers/douban.py @@ -2,14 +2,11 @@ import os import re from datetime import datetime -import django_rq import openpyxl import pytz -from auditlog.context import set_actor from django.conf import settings from loguru import logger from markdownify import markdownify as md -from user_messages import api as msg from catalog.common import * from catalog.common.downloaders import * @@ -17,6 +14,7 @@ from catalog.models import * from catalog.sites.douban import DoubanDownloader from common.utils import GenerateDateUUIDMediaFilePath from journal.models import * +from users.models import Task _tz_sh = pytz.timezone("Asia/Shanghai") @@ -40,77 +38,22 @@ def _fetch_remote_image(url): return url -class DoubanImporter: - total = 0 - processed = 0 - skipped = 0 - imported = 0 - failed = [] - visibility = 0 - mode = 0 - file = "" +class DoubanImporter(Task): + class Meta: + app_label = "journal" # workaround bug in TypedModel - def __init__(self, user, visibility, mode): - self.user = user - self.visibility = visibility - self.mode = mode - - def update_user_import_status(self, status): - self.user.preference.import_status["douban_pending"] = status - self.user.preference.import_status["douban_file"] = self.file - self.user.preference.import_status["douban_visibility"] = self.visibility - self.user.preference.import_status["douban_mode"] = self.mode - self.user.preference.import_status["douban_total"] = self.total - self.user.preference.import_status["douban_processed"] = self.processed - self.user.preference.import_status["douban_skipped"] = self.skipped - self.user.preference.import_status["douban_imported"] = self.imported - self.user.preference.import_status["douban_failed"] = self.failed - self.user.preference.save(update_fields=["import_status"]) - - @classmethod - def reset(cls, user): - user.preference.import_status["douban_pending"] = 0 - user.preference.save(update_fields=["import_status"]) - - @classmethod - def redo(cls, user): - file = user.preference.import_status["douban_file"] - imp = cls( - user, - user.preference.import_status["douban_visibility"], - user.preference.import_status["douban_mode"], - ) - imp.file = file - jid = f"Douban_{user.id}_{os.path.basename(file)}_redo" - django_rq.get_queue("import").enqueue(imp.import_from_file_task, job_id=jid) - - def import_from_file(self, uploaded_file): - try: - wb = openpyxl.open( - uploaded_file, read_only=True, data_only=True, keep_links=False - ) - wb.close() - file = ( - settings.MEDIA_ROOT - + "/" - + GenerateDateUUIDMediaFilePath("x.xlsx", settings.SYNC_FILE_PATH_ROOT) - ) - os.makedirs(os.path.dirname(file), exist_ok=True) - with open(file, "wb") as destination: - for chunk in uploaded_file.chunks(): - destination.write(chunk) - self.file = file - self.update_user_import_status(2) - jid = f"Douban_{self.user.id}_{os.path.basename(self.file)}" - django_rq.get_queue("import").enqueue( - self.import_from_file_task, job_id=jid - ) - except Exception as e: - logger.error( - f"unable to enqueue import {uploaded_file}", extra={"exception": e} - ) - return False - return True + TaskQueue = "import" + DefaultMetadata = { + "total": 0, + "processed": 0, + "skipped": 0, + "imported": 0, + "failed": 0, + "mode": 0, + "visibility": 0, + "failed_urls": [], + "file": None, + } mark_sheet_config = { "想读": [ShelfType.WISHLIST], @@ -135,13 +78,30 @@ class DoubanImporter: "剧评": [Performance], "游戏评论&攻略": [Game], } + + @classmethod + def validate_file(cls, uploaded_file): + try: + wb = openpyxl.open( + uploaded_file, read_only=True, data_only=True, keep_links=False + ) + sheets = cls.mark_sheet_config.keys() | cls.review_sheet_config.keys() + for name in sheets: + if name in wb: + return True + except Exception as e: + logger.error( + f"unable to validate excel file {uploaded_file}", extra={"exception": e} + ) + return False + mark_data = {} review_data = {} entity_lookup = {} def load_sheets(self): """Load data into mark_data / review_data / entity_lookup""" - f = open(self.file, "rb") + f = open(self.metadata["file"], "rb") wb = openpyxl.load_workbook(f, read_only=True, data_only=True, keep_links=False) for data, config in [ (self.mark_data, self.mark_sheet_config), @@ -164,8 +124,9 @@ class DoubanImporter: self.entity_lookup[k].append(v) else: self.entity_lookup[k] = [v] - self.total = sum(map(lambda a: len(a), self.mark_data.values())) - self.total += sum(map(lambda a: len(a), self.review_data.values())) + self.metadata["total"] = sum(map(lambda a: len(a), self.mark_data.values())) + self.metadata["total"] += sum(map(lambda a: len(a), self.review_data.values())) + self.save() def guess_entity_url(self, title, rating, timestamp): k = f"{title}|{rating}" @@ -189,28 +150,20 @@ class DoubanImporter: # if cells[0] == title and cells[5] == rating: # return cells[3] - def import_from_file_task(self): + def run(self): logger.info(f"{self.user} import start") - msg.info(self.user, f"开始导入豆瓣标记和评论") - self.update_user_import_status(1) - with set_actor(self.user): - self.load_sheets() - logger.info(f"{self.user} sheet loaded, {self.total} lines total") - self.update_user_import_status(1) - for name, param in self.mark_sheet_config.items(): - self.import_mark_sheet(self.mark_data[name], param[0], name) - for name, param in self.review_sheet_config.items(): - self.import_review_sheet(self.review_data[name], name) - self.update_user_import_status(0) - msg.success( - self.user, - f"豆瓣标记和评论导入完成,共处理{self.total}篇,已存在{self.skipped}篇,新增{self.imported}篇。", - ) - if len(self.failed): - msg.error( - self.user, - f'豆瓣评论导入时未能处理以下网址:\n{" , ".join(self.failed)}', + self.load_sheets() + logger.info(f"{self.user} sheet loaded, {self.metadata['total']} lines total") + for name, param in self.mark_sheet_config.items(): + self.import_mark_sheet(self.mark_data[name], param[0], name) + for name, param in self.review_sheet_config.items(): + self.import_review_sheet(self.review_data[name], name) + self.message = f"豆瓣标记和评论导入完成,共处理{self.metadata['total']}篇,已存在{self.metadata['skipped']}篇,新增{self.metadata['imported']}篇。" + if len(self.metadata["failed_urls"]) > 0: + self.message += ( + f'导入时未能处理以下网址:\n{" , ".join(self.metadata["failed_urls"])}' ) + self.save() def import_mark_sheet(self, worksheet, shelf_type, sheet_name): prefix = f"{self.user} {sheet_name}|" @@ -234,7 +187,7 @@ class DoubanImporter: except Exception: tags = [] comment = cells[7] if len(cells) >= 8 else None - self.processed += 1 + self.metadata["processed"] += 1 try: if type(time) == str: time = datetime.strptime(time, "%Y-%m-%d %H:%M:%S") @@ -243,10 +196,10 @@ class DoubanImporter: time = None r = self.import_mark(url, shelf_type, comment, rating_grade, tags, time) if r == 1: - self.imported += 1 + self.metadata["imported"] += 1 elif r == 2: - self.skipped += 1 - self.update_user_import_status(1) + self.metadata["skipped"] += 1 + self.save() def import_mark(self, url, shelf_type, comment, rating_grade, tags, time): """ @@ -257,7 +210,7 @@ class DoubanImporter: logger.warning(f"{self.user} | match/fetch {url} failed") return mark = Mark(self.user.identity, item) - if self.mode == 0 and ( + if self.metadata["mode"] == 0 and ( mark.shelf_type == shelf_type or mark.shelf_type == ShelfType.COMPLETE or ( @@ -268,7 +221,12 @@ class DoubanImporter: print("-", end="", flush=True) return 2 mark.update( - shelf_type, comment, rating_grade, tags, self.visibility, created_time=time + shelf_type, + comment, + rating_grade, + tags, + self.metadata["visibility"], + created_time=time, ) print("+", end="", flush=True) return 1 @@ -289,7 +247,7 @@ class DoubanImporter: time = cells[3] rating = cells[4] content = cells[6] - self.processed += 1 + self.metadata["processed"] += 1 if time: if type(time) == str: time = datetime.strptime(time, "%Y-%m-%d %H:%M:%S") @@ -304,12 +262,12 @@ class DoubanImporter: entity_title, rating, title, review_url, content, time ) if r == 1: - self.imported += 1 + self.metadata["imported"] += 1 elif r == 2: - self.skipped += 1 + self.metadata["skipped"] += 1 else: - self.failed.append(review_url) - self.update_user_import_status(1) + self.metadata["failed_urls"].append(review_url) + self.save() def get_item_by_url(self, url): item = None @@ -337,7 +295,7 @@ class DoubanImporter: except Exception as e: logger.error(f"fetching error: {url}", extra={"exception": e}) if item is None: - self.failed.append(str(url)) + self.metadata["failed_urls"].append(str(url)) return item def import_review(self, entity_title, rating, title, review_url, content, time): @@ -367,7 +325,7 @@ class DoubanImporter: logger.warning(f"{prefix} match/fetch {url} failed") return if ( - self.mode == 1 + self.metadata["mode"] == 1 and Review.objects.filter(owner=self.user.identity, item=item).exists() ): return 2 @@ -387,7 +345,7 @@ class DoubanImporter: "edited_time": time, "title": title, "body": content, - "visibility": self.visibility, + "visibility": self.metadata["visibility"], } try: Review.objects.update_or_create( diff --git a/journal/importers/goodreads.py b/journal/importers/goodreads.py index ca994364..7a68b362 100644 --- a/journal/importers/goodreads.py +++ b/journal/importers/goodreads.py @@ -1,16 +1,14 @@ import re from datetime import datetime -import django_rq -from auditlog.context import set_actor from django.utils import timezone from django.utils.timezone import make_aware -from user_messages import api as msg from catalog.common import * from catalog.common.downloaders import * from catalog.models import * from journal.models import * +from users.models import Task re_list = r"^https://www\.goodreads\.com/list/show/\d+" re_shelf = r"^https://www\.goodreads\.com/review/list/\d+[^\?]*\?shelf=[^&]+" @@ -24,93 +22,104 @@ gr_rating = { } -class GoodreadsImporter: +class GoodreadsImporter(Task): + class Meta: + app_label = "journal" # workaround bug in TypedModel + + TaskQueue = "import" + DefaultMetadata = { + "total": 0, + "processed": 0, + "skipped": 0, + "imported": 0, + "failed": 0, + "visibility": 0, + "failed_urls": [], + "url": None, + } + @classmethod - def import_from_url(cls, raw_url, user): + def validate_url(cls, raw_url): match_list = re.match(re_list, raw_url) match_shelf = re.match(re_shelf, raw_url) match_profile = re.match(re_profile, raw_url) if match_profile or match_shelf or match_list: - django_rq.get_queue("import").enqueue( - cls.import_from_url_task, raw_url, user - ) return True else: return False - @classmethod - def import_from_url_task(cls, url, user): + def run(self): + url = self.metadata["url"] + user = self.user match_list = re.match(re_list, url) match_shelf = re.match(re_shelf, url) match_profile = re.match(re_profile, url) total = 0 visibility = user.preference.default_visibility - with set_actor(user): - shelf = None - if match_shelf: - shelf = cls.parse_shelf(match_shelf[0], user) - elif match_list: - shelf = cls.parse_list(match_list[0], user) - if shelf: - if shelf["title"] and shelf["books"]: - collection = Collection.objects.create( - title=shelf["title"], - brief=shelf["description"] - + "\n\nImported from [Goodreads](" - + url - + ")", - owner=user.identity, - ) - for book in shelf["books"]: - collection.append_item(book["book"], note=book["review"]) - total += 1 - collection.save() - msg.success( - user, - f'Imported {total} books from Goodreads as a Collection {shelf["title"]}.', + shelf = None + if match_shelf: + shelf = self.parse_shelf(match_shelf[0]) + elif match_list: + shelf = self.parse_list(match_list[0]) + if shelf: + if shelf["title"] and shelf["books"]: + collection = Collection.objects.create( + title=shelf["title"], + brief=shelf["description"] + + "\n\nImported from [Goodreads](" + + url + + ")", + owner=user.identity, ) - elif match_profile: - uid = match_profile[1] - shelves = { - ShelfType.WISHLIST: f"https://www.goodreads.com/review/list/{uid}?shelf=to-read", - ShelfType.PROGRESS: f"https://www.goodreads.com/review/list/{uid}?shelf=currently-reading", - ShelfType.COMPLETE: f"https://www.goodreads.com/review/list/{uid}?shelf=read", - } - for shelf_type in shelves: - shelf_url = shelves.get(shelf_type) - shelf = cls.parse_shelf(shelf_url, user) - for book in shelf["books"]: - mark = Mark(user.identity, book["book"]) - if ( - ( - mark.shelf_type == shelf_type - and mark.comment_text == book["review"] - ) - or ( - mark.shelf_type == ShelfType.COMPLETE - and shelf_type != ShelfType.COMPLETE - ) - or ( - mark.shelf_type == ShelfType.PROGRESS - and shelf_type == ShelfType.WISHLIST - ) - ): - print( - f'Skip {shelf_type}/{book["book"]} bc it was marked {mark.shelf_type}' - ) - else: - mark.update( - shelf_type, - book["review"], - book["rating"], - visibility=visibility, - created_time=book["last_updated"] or timezone.now(), - ) - total += 1 - msg.success(user, f"Imported {total} records from Goodreads profile.") + for book in shelf["books"]: + collection.append_item(book["book"], note=book["review"]) + total += 1 + collection.save() + self.message = f'Imported {total} books from Goodreads as a Collection {shelf["title"]}.' + elif match_profile: + uid = match_profile[1] + shelves = { + ShelfType.WISHLIST: f"https://www.goodreads.com/review/list/{uid}?shelf=to-read", + ShelfType.PROGRESS: f"https://www.goodreads.com/review/list/{uid}?shelf=currently-reading", + ShelfType.COMPLETE: f"https://www.goodreads.com/review/list/{uid}?shelf=read", + } + for shelf_type in shelves: + shelf_url = shelves.get(shelf_type) + shelf = self.parse_shelf(shelf_url) + for book in shelf["books"]: + mark = Mark(user.identity, book["book"]) + if ( + ( + mark.shelf_type == shelf_type + and mark.comment_text == book["review"] + ) + or ( + mark.shelf_type == ShelfType.COMPLETE + and shelf_type != ShelfType.COMPLETE + ) + or ( + mark.shelf_type == ShelfType.PROGRESS + and shelf_type == ShelfType.WISHLIST + ) + ): + print( + f'Skip {shelf_type}/{book["book"]} bc it was marked {mark.shelf_type}' + ) + else: + mark.update( + shelf_type, + book["review"], + book["rating"], + visibility=visibility, + created_time=book["last_updated"] or timezone.now(), + ) + total += 1 + self.message = f"Imported {total} records from Goodreads profile." + self.metadata["total"] = total + self.save() @classmethod - def get_book(cls, url, user): + def get_book(cls, url): site = SiteManager.get_site_by_url(url) if site: book = site.get_item() @@ -121,7 +130,7 @@ class GoodreadsImporter: return book @classmethod - def parse_shelf(cls, url, user): + def parse_shelf(cls, url): # return {'title': 'abc', books: [{'book': obj, 'rating': 10, 'review': 'txt'}, ...]} title = "" books = [] @@ -194,7 +203,7 @@ class GoodreadsImporter: except Exception: print(f"Error loading/parsing review{url_review}, ignored") try: - book = cls.get_book(url_book, user) + book = cls.get_book(url_book) books.append( { "url": url_book, @@ -216,7 +225,7 @@ class GoodreadsImporter: return {"title": title, "description": "", "books": books} @classmethod - def parse_list(cls, url, user): + def parse_list(cls, url): # return {'title': 'abc', books: [{'book': obj, 'rating': 10, 'review': 'txt'}, ...]} title = "" description = "" @@ -237,7 +246,7 @@ class GoodreadsImporter: for link in links: # type:ignore url_book = "https://www.goodreads.com" + link try: - book = cls.get_book(url_book, user) + book = cls.get_book(url_book) books.append( { "url": url_book, diff --git a/journal/management/commands/journal.py b/journal/management/commands/journal.py index fd8645a0..b9f49dc4 100644 --- a/journal/management/commands/journal.py +++ b/journal/management/commands/journal.py @@ -25,16 +25,6 @@ class Command(BaseCommand): action="store_true", help="purge invalid data (visibility=99)", ) - parser.add_argument( - "--douban-import-redo", - action="store", - help="reimport for user id", - ) - parser.add_argument( - "--douban-import-reset", - action="store", - help="reset for user id", - ) parser.add_argument( "--integrity", action="store_true", @@ -66,14 +56,4 @@ class Command(BaseCommand): self.stdout.write(f"Cleaning up {cls}...") cls.objects.filter(visibility=99).delete() - if options["douban_import_redo"]: - user = User.objects.get(pk=options["douban_import_redo"]) - self.stdout.write(f"Redo import for {user}...") - DoubanImporter.redo(user) - - if options["douban_import_reset"]: - user = User.objects.get(pk=options["douban_import_reset"]) - self.stdout.write(f"Reset import for {user}...") - DoubanImporter.reset(user) - self.stdout.write(self.style.SUCCESS(f"Done.")) diff --git a/journal/migrations/0004_tasks.py b/journal/migrations/0004_tasks.py index bcb2c854..682282b2 100644 --- a/journal/migrations/0004_tasks.py +++ b/journal/migrations/0004_tasks.py @@ -4,7 +4,6 @@ from django.db import migrations class Migration(migrations.Migration): - dependencies = [ ("users", "0006_alter_task_type"), ("journal", "0003_note_progress"), @@ -21,4 +20,24 @@ class Migration(migrations.Migration): }, bases=("users.task",), ), + migrations.CreateModel( + name="DoubanImporter", + fields=[], + options={ + "proxy": True, + "indexes": [], + "constraints": [], + }, + bases=("users.task",), + ), + migrations.CreateModel( + name="GoodreadsImporter", + fields=[], + options={ + "proxy": True, + "indexes": [], + "constraints": [], + }, + bases=("users.task",), + ), ] diff --git a/users/migrations/0006_alter_task_type.py b/users/migrations/0006_alter_task_type.py index fabfc827..8fadd54a 100644 --- a/users/migrations/0006_alter_task_type.py +++ b/users/migrations/0006_alter_task_type.py @@ -14,7 +14,9 @@ class Migration(migrations.Migration): name="type", field=models.CharField( choices=[ + ("journal.doubanimporter", "douban importer"), ("journal.doufenexporter", "doufen exporter"), + ("journal.goodreadsimporter", "goodreads importer"), ("journal.letterboxdimporter", "letterboxd importer"), ], db_index=True, diff --git a/users/models/preference.py b/users/models/preference.py index 21944a71..a0c7a25a 100644 --- a/users/models/preference.py +++ b/users/models/preference.py @@ -21,10 +21,10 @@ class Preference(models.Model): ) export_status = models.JSONField( blank=True, null=True, encoder=DjangoJSONEncoder, default=dict - ) + ) # deprecated import_status = models.JSONField( blank=True, null=True, encoder=DjangoJSONEncoder, default=dict - ) + ) # deprecated # 0: public, 1: follower only, 2: private default_visibility = models.PositiveSmallIntegerField(null=False, default=0) # 0: public, 1: unlisted, 4: local diff --git a/users/templates/users/data.html b/users/templates/users/data.html index 4ccafede..8277cba6 100644 --- a/users/templates/users/data.html +++ b/users/templates/users/data.html @@ -64,7 +64,7 @@

+ {% if import_task.status == "pending" %} onclick="return confirm('{% trans "Another import is in progress, starting a new import may cause issues, sure to import?" %}')" value="{% trans "Import in progress, please wait" %}" {% else %} value="{% trans 'Import' %}" {% endif %} />
+ + {% if goodreads_task %} +
+ {% trans 'Last import started' %}: {{ goodreads_task.created_time }} + {% trans 'Status' %}: {{ goodreads_task.get_state_display }}。 +
+ {{ goodreads_task.message }} + {% endif %} +