index faster

This commit is contained in:
Your Name 2024-12-30 11:37:38 -05:00 committed by Henri Dickson
parent bee21c6d14
commit 0af7032282
3 changed files with 70 additions and 23 deletions

View file

@ -1,14 +1,25 @@
from argparse import RawTextHelpFormatter from argparse import RawTextHelpFormatter
from datetime import timedelta
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
from django.core.paginator import Paginator from django.core.paginator import Paginator
from django.db.models import Q
from django.utils import timezone
from tqdm import tqdm from tqdm import tqdm
from catalog.models import Item from catalog.models import Item
from journal.models import Content, JournalIndex, Piece, update_journal_for_merged_item from journal.models import (
Collection,
Content,
JournalIndex,
Piece,
Review,
ShelfMember,
update_journal_for_merged_item,
)
from journal.models.itemlist import ListMember from journal.models.itemlist import ListMember
from takahe.models import Post from takahe.models import Post
from users.models import APIdentity from users.models import APIdentity, User
_CONFIRM = "confirm deleting collection? [Y/N] " _CONFIRM = "confirm deleting collection? [Y/N] "
@ -20,7 +31,7 @@ idx-init: check and create index if not exists
idx-destroy: delete index idx-destroy: delete index
idx-alt: update index schema idx-alt: update index schema
idx-delete: delete docs in index idx-delete: delete docs in index
idx-update: reindex docs idx-reindex: reindex docs
idx-search: search docs in index idx-search: search docs in index
""" """
@ -43,7 +54,7 @@ class Command(BaseCommand):
"idx-init", "idx-init",
"idx-alt", "idx-alt",
"idx-destroy", "idx-destroy",
"idx-update", "idx-reindex",
"idx-delete", "idx-delete",
"idx-search", "idx-search",
], ],
@ -80,6 +91,11 @@ class Command(BaseCommand):
"--yes", "--yes",
action="store_true", action="store_true",
) )
parser.add_argument(
"--fast",
action="store_true",
help="skip some inactive users and rare cases to speed up index",
)
def integrity(self): def integrity(self):
self.stdout.write(f"Checking deleted items with remaining journals...") self.stdout.write(f"Checking deleted items with remaining journals...")
@ -105,6 +121,7 @@ class Command(BaseCommand):
verbose, verbose,
fix, fix,
batch_size, batch_size,
fast,
*args, *args,
**kwargs, **kwargs,
): ):
@ -161,15 +178,26 @@ class Command(BaseCommand):
c = index.delete_all() c = index.delete_all()
self.stdout.write(self.style.SUCCESS(f"deleted {c} documents.")) self.stdout.write(self.style.SUCCESS(f"deleted {c} documents."))
case "idx-update": case "idx-reindex":
pieces = Piece.objects.all() if fast and not owners:
q = Q(social_accounts__type="mastodon.mastodonaccount") | Q(
social_accounts__last_reachable__gt=timezone.now()
- timedelta(days=365)
)
owners = list(
User.objects.filter(is_active=True)
.filter(q)
.values_list("identity", flat=True)
)
# index all posts first
posts = Post.objects.filter(local=True).exclude( posts = Post.objects.filter(local=True).exclude(
state__in=["deleted", "deleted_fanned_out"] state__in=["deleted", "deleted_fanned_out"]
) )
if owners: if owners:
pieces = pieces.filter(owner_id__in=owners) self.stdout.write(
self.style.SUCCESS(f"indexing for {len(owners)} users.")
)
posts = posts.filter(author_id__in=owners) posts = posts.filter(author_id__in=owners)
# index all posts first
c = 0 c = 0
pg = Paginator(posts.order_by("id"), self.batch_size) pg = Paginator(posts.order_by("id"), self.batch_size)
for p in tqdm(pg.page_range): for p in tqdm(pg.page_range):
@ -178,15 +206,29 @@ class Command(BaseCommand):
index.replace_docs(docs) index.replace_docs(docs)
self.stdout.write(self.style.SUCCESS(f"indexed {c} docs.")) self.stdout.write(self.style.SUCCESS(f"indexed {c} docs."))
# index remaining pieces without posts # index remaining pieces without posts
c = 0 for cls in (
pg = Paginator(pieces.order_by("id"), self.batch_size) [
for p in tqdm(pg.page_range): ShelfMember,
pieces = [ Review,
p for p in pg.get_page(p).object_list if p.latest_post is None Collection,
] ]
docs = index.pieces_to_docs(pieces) if fast
c += len(docs) else [Piece]
index.replace_docs(docs) ):
pieces = cls.objects.filter(local=True)
if owners:
pieces = pieces.filter(owner_id__in=owners)
c = 0
pg = Paginator(pieces.order_by("id"), self.batch_size)
for p in tqdm(pg.page_range):
pieces = [
p
for p in pg.get_page(p).object_list
if p.latest_post is None
]
docs = index.pieces_to_docs(pieces)
c += len(docs)
index.replace_docs(docs)
self.stdout.write(self.style.SUCCESS(f"indexed {c} docs.")) self.stdout.write(self.style.SUCCESS(f"indexed {c} docs."))
# posts = posts.exclude(type_data__object__has_key="relatedWith") # posts = posts.exclude(type_data__object__has_key="relatedWith")
# docs = index.posts_to_docs(posts) # docs = index.posts_to_docs(posts)

View file

@ -211,3 +211,6 @@ class FeaturedCollection(Piece):
@cached_property @cached_property
def progress(self): def progress(self):
return self.target.get_progress(self.owner) return self.target.get_progress(self.owner)
def to_indexable_doc(self) -> dict[str, Any]:
    """Return an empty mapping as this piece's indexable document.

    NOTE(review): presumably an empty document keeps this piece type out of
    the search index -- confirm against how the indexer treats empty docs.
    """
    empty_doc: dict[str, Any] = {}
    return empty_doc

View file

@ -212,9 +212,10 @@ class JournalIndex(Index):
if piece.latest_post: if piece.latest_post:
# fk is not enforced, so post might be deleted # fk is not enforced, so post might be deleted
doc["post_id"] = [piece.latest_post_id] doc["post_id"] = [piece.latest_post_id]
doc["viewer_id"] = list( # enable this in future when we support search other users
piece.latest_post.interactions.values_list("identity_id", flat=True) # doc["viewer_id"] = list(
) # piece.latest_post.interactions.values_list("identity_id", flat=True)
# )
doc.update(d) doc.update(d)
return doc return doc
@ -238,11 +239,12 @@ class JournalIndex(Index):
"piece_class": ["Post"], "piece_class": ["Post"],
"content": [post.content], "content": [post.content],
"created": int(post.created.timestamp()), "created": int(post.created.timestamp()),
"owner_id": post.author_id,
"viewer_id": list(
post.interactions.values_list("identity_id", flat=True)
),
"visibility": Takahe.visibility_t2n(post.visibility), "visibility": Takahe.visibility_t2n(post.visibility),
"owner_id": post.author_id,
# enable this in the future when we support searching other users
# "viewer_id": list(
# post.interactions.values_list("identity_id", flat=True)
# ),
} }
return doc return doc