lib.itmens/catalog/jobs/discover.py
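# Hourly background job that precomputes data for the public "discover" page:
# per-category galleries of popular items, a short trends list, featured
# collections, popular tags and popular posts, all written to the cache.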

import time
from datetime import timedelta

from django.conf import settings
from django.core.cache import cache
from django.db.models import Count, F
from django.utils import timezone
from django.utils.translation import gettext_lazy as _
from loguru import logger

from catalog.models import *
from common.models import BaseJob, JobManager
from journal.models import (
    Collection,
    Comment,
    ShelfMember,
    TagManager,
    q_item_in_category,
)
from takahe.utils import Takahe
from users.models import APIdentity

MAX_ITEMS_PER_PERIOD = 12
MIN_MARKS = settings.MIN_MARKS_FOR_DISCOVER
MAX_DAYS_FOR_PERIOD = 96
MIN_DAYS_FOR_PERIOD = 6
DAYS_FOR_TRENDS = 3
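# Galleries keep at most MAX_ITEMS_PER_PERIOD items per time window, each with
# at least MIN_MARKS recent marks, over windows shrinking from
# MAX_DAYS_FOR_PERIOD down to MIN_DAYS_FOR_PERIOD days; the trends list only
# looks back DAYS_FOR_TRENDS days.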


@JobManager.register
class DiscoverGenerator(BaseJob):
    interval = timedelta(hours=1)
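
    # items in a category marked by at least MIN_MARKS users within the last
    # `days` days, most marked first, excluding ids already in existing_ids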
    def get_popular_marked_item_ids(self, category, days, existing_ids):
        item_ids = [
            m["item_id"]
            for m in ShelfMember.objects.filter(q_item_in_category(category))
            .filter(created_time__gt=timezone.now() - timedelta(days=days))
            .exclude(item_id__in=existing_ids)
            .values("item_id")
            .annotate(num=Count("item_id"))
            .filter(num__gte=MIN_MARKS)
            .order_by("-num")[:MAX_ITEMS_PER_PERIOD]
        ]
        return item_ids
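
    # podcast programs whose episodes drew at least MIN_MARKS comments within
    # the last `days` days, most commented first, excluding existing_ids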
    def get_popular_commented_podcast_ids(self, days, existing_ids):
        return list(
            Comment.objects.filter(q_item_in_category(ItemCategory.Podcast))
            .filter(created_time__gt=timezone.now() - timedelta(days=days))
            .annotate(p=F("item__podcastepisode__program"))
            .filter(p__isnull=False)
            .exclude(p__in=existing_ids)
            .values("p")
            .annotate(num=Count("p"))
            .filter(num__gte=MIN_MARKS)
            .order_by("-num")
            .values_list("p", flat=True)[:MAX_ITEMS_PER_PERIOD]
        )
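
    # if both a TV season and its parent show were selected, drop the show
    # and keep the season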
    def cleanup_shows(self, items):
        seasons = [i for i in items if i.__class__ == TVSeason]
        for season in seasons:
            if season.show in items:
                items.remove(season.show)
        return items
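
    # refresh galleries, trends, featured collections, popular tags and posts,
    # then write them all to the cache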
    def run(self):
        logger.info("Discover data update start.")
        cache_key = "public_gallery"
        gallery_categories = [
            ItemCategory.Book,
            ItemCategory.Movie,
            ItemCategory.TV,
            ItemCategory.Game,
            ItemCategory.Music,
            ItemCategory.Podcast,
        ]
        gallery_list = []
        trends = []
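        # one "popular" gallery per category; trends are derived from recent
        # marks while iterating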
        for category in gallery_categories:
            days = MAX_DAYS_FOR_PERIOD
            item_ids = []
            while days >= MIN_DAYS_FOR_PERIOD:
                ids = self.get_popular_marked_item_ids(category, days, item_ids)
                logger.info(f"Most marked {category} in last {days} days: {len(ids)}")
                item_ids = ids + item_ids
                days //= 2
            if category == ItemCategory.Podcast:
                days = MAX_DAYS_FOR_PERIOD // 4
                extra_ids = self.get_popular_commented_podcast_ids(days, item_ids)
                logger.info(
                    f"Most commented podcast in last {days} days: {len(extra_ids)}"
                )
                item_ids = extra_ids + item_ids
            items = [Item.objects.get(pk=i) for i in item_ids]
            if category == ItemCategory.TV:
                items = self.cleanup_shows(items)
            gallery_list.append(
                {
                    "name": "popular_" + category.value,
                    "category": category,
                    "items": items,
                }
            )
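            # trends: the five most marked items of the last DAYS_FOR_TRENDS
            # days, plus up to three most commented podcast programs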
            item_ids = self.get_popular_marked_item_ids(category, DAYS_FOR_TRENDS, [])[
                :5
            ]
            if category == ItemCategory.Podcast:
                item_ids += self.get_popular_commented_podcast_ids(
                    DAYS_FOR_TRENDS, item_ids
                )[:3]
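            # one trend entry per item, with a single history point counting
            # marks from the last 7 days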
            for i in Item.objects.filter(pk__in=set(item_ids)):
                cnt = ShelfMember.objects.filter(
                    item=i, created_time__gt=timezone.now() - timedelta(days=7)
                ).count()
                trends.append(
                    {
                        "title": i.title,
                        "description": i.brief,
                        "url": i.absolute_url,
                        "image": i.cover_image_url or "",
                        "provider_name": str(i.category.label),
                        "history": [
                            {
                                "day": str(int(time.time() / 38400 - 3) * 38400),
                                "accounts": str(cnt),
                                "uses": str(cnt),
                            }
                        ],
                    }
                )
        trends.sort(key=lambda x: int(x["history"][0]["accounts"]), reverse=True)
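
        # featured collections: the 40 most recently edited public
        # (visibility=0) collections with at least MIN_MARKS interactions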
        collection_ids = (
            Collection.objects.filter(visibility=0)
            .annotate(num=Count("interactions"))
            .filter(num__gte=MIN_MARKS)
            .order_by("-edited_time")
            .values_list("pk", flat=True)[:40]
        )

        tags = TagManager.popular_tags(days=14, limit=20)
        post_ids = Takahe.get_popular_posts(days=14, limit=20).values_list(
            "pk", flat=True
        )
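
        # cache entries never expire; they are simply overwritten on the next run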
        cache.set(cache_key, gallery_list, timeout=None)
        cache.set("trends_links", trends, timeout=None)
        cache.set("featured_collections", collection_ids, timeout=None)
        cache.set("popular_tags", list(tags), timeout=None)
        cache.set("popular_posts", list(post_ids), timeout=None)
        logger.info(
            f"Discover data updated, trends: {len(trends)}, collections: {len(collection_ids)}, tags: {len(tags)}, posts: {len(post_ids)}."
        )