From 0af7032282a0190a9b2c67a75a6dcf12e5f025b5 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 30 Dec 2024 11:37:38 -0500 Subject: [PATCH] index faster --- journal/management/commands/journal.py | 74 ++++++++++++++++++++------ journal/models/collection.py | 3 ++ journal/models/index.py | 16 +++--- 3 files changed, 70 insertions(+), 23 deletions(-) diff --git a/journal/management/commands/journal.py b/journal/management/commands/journal.py index 00f5c921..61205a40 100644 --- a/journal/management/commands/journal.py +++ b/journal/management/commands/journal.py @@ -1,14 +1,25 @@ from argparse import RawTextHelpFormatter +from datetime import timedelta from django.core.management.base import BaseCommand from django.core.paginator import Paginator +from django.db.models import Q +from django.utils import timezone from tqdm import tqdm from catalog.models import Item -from journal.models import Content, JournalIndex, Piece, update_journal_for_merged_item +from journal.models import ( + Collection, + Content, + JournalIndex, + Piece, + Review, + ShelfMember, + update_journal_for_merged_item, +) from journal.models.itemlist import ListMember from takahe.models import Post -from users.models import APIdentity +from users.models import APIdentity, User _CONFIRM = "confirm deleting collection? [Y/N] " @@ -20,7 +31,7 @@ idx-init: check and create index if not exists idx-destroy: delete index idx-alt: update index schema idx-delete: delete docs in index -idx-update: reindex docs +idx-reindex: reindex docs idx-search: search docs in index """ @@ -43,7 +54,7 @@ class Command(BaseCommand): "idx-init", "idx-alt", "idx-destroy", - "idx-update", + "idx-reindex", "idx-delete", "idx-search", ], @@ -80,6 +91,11 @@ class Command(BaseCommand): "--yes", action="store_true", ) + parser.add_argument( + "--fast", + action="store_true", + help="skip some inactive users and rare cases to speed up index", + ) def integrity(self): self.stdout.write(f"Checking deleted items with remaining journals...") @@ -105,6 +121,7 @@ class Command(BaseCommand): verbose, fix, batch_size, + fast, *args, **kwargs, ): @@ -161,15 +178,26 @@ class Command(BaseCommand): c = index.delete_all() self.stdout.write(self.style.SUCCESS(f"deleted {c} documents.")) - case "idx-update": - pieces = Piece.objects.all() + case "idx-reindex": + if fast and not owners: + q = Q(social_accounts__type="mastodon.mastodonaccount") | Q( + social_accounts__last_reachable__gt=timezone.now() + - timedelta(days=365) + ) + owners = list( + User.objects.filter(is_active=True) + .filter(q) + .values_list("identity", flat=True) + ) + # index all posts first posts = Post.objects.filter(local=True).exclude( state__in=["deleted", "deleted_fanned_out"] ) if owners: - pieces = pieces.filter(owner_id__in=owners) + self.stdout.write( + self.style.SUCCESS(f"indexing for {len(owners)} users.") + ) posts = posts.filter(author_id__in=owners) - # index all posts first c = 0 pg = Paginator(posts.order_by("id"), self.batch_size) for p in tqdm(pg.page_range): @@ -178,15 +206,29 @@ class Command(BaseCommand): index.replace_docs(docs) self.stdout.write(self.style.SUCCESS(f"indexed {c} docs.")) # index remaining pieces without posts - c = 0 - pg = Paginator(pieces.order_by("id"), self.batch_size) - for p in tqdm(pg.page_range): - pieces = [ - p for p in pg.get_page(p).object_list if p.latest_post is None + for cls in ( + [ + ShelfMember, + Review, + Collection, ] - docs = index.pieces_to_docs(pieces) - c += len(docs) - index.replace_docs(docs) + if fast + else [Piece] + ): + pieces = cls.objects.filter(local=True) + if owners: + pieces = pieces.filter(owner_id__in=owners) + c = 0 + pg = Paginator(pieces.order_by("id"), self.batch_size) + for p in tqdm(pg.page_range): + pieces = [ + p + for p in pg.get_page(p).object_list + if p.latest_post is None + ] + docs = index.pieces_to_docs(pieces) + c += len(docs) + index.replace_docs(docs) self.stdout.write(self.style.SUCCESS(f"indexed {c} docs.")) # posts = posts.exclude(type_data__object__has_key="relatedWith") # docs = index.posts_to_docs(posts) diff --git a/journal/models/collection.py b/journal/models/collection.py index 846ec85d..1f4c4cdf 100644 --- a/journal/models/collection.py +++ b/journal/models/collection.py @@ -211,3 +211,6 @@ class FeaturedCollection(Piece): @cached_property def progress(self): return self.target.get_progress(self.owner) + + def to_indexable_doc(self) -> dict[str, Any]: + return {} diff --git a/journal/models/index.py b/journal/models/index.py index 449d36be..98200097 100644 --- a/journal/models/index.py +++ b/journal/models/index.py @@ -212,9 +212,10 @@ class JournalIndex(Index): if piece.latest_post: # fk is not enforced, so post might be deleted doc["post_id"] = [piece.latest_post_id] - doc["viewer_id"] = list( - piece.latest_post.interactions.values_list("identity_id", flat=True) - ) + # enable this in future when we support search other users + # doc["viewer_id"] = list( + # piece.latest_post.interactions.values_list("identity_id", flat=True) + # ) doc.update(d) return doc @@ -238,11 +239,12 @@ class JournalIndex(Index): "piece_class": ["Post"], "content": [post.content], "created": int(post.created.timestamp()), - "owner_id": post.author_id, - "viewer_id": list( - post.interactions.values_list("identity_id", flat=True) - ), "visibility": Takahe.visibility_t2n(post.visibility), + "owner_id": post.author_id, + # enable this in future when we support search other users + # "viewer_id": list( + # post.interactions.values_list("identity_id", flat=True) + # ), } return doc