# pyright: reportFunctionMemberAccess=false
import hashlib

import django_rq
from auditlog.context import set_actor
from django.conf import settings
from django.core.cache import cache
from django.utils.translation import gettext_lazy as _
from loguru import logger
from rq.job import Job

from catalog.common.downloaders import RESPONSE_CENSORSHIP, DownloadError
from catalog.common.sites import SiteManager

from ..models import Item, TVSeason
from .typesense import Indexer as TypeSenseIndexer

# from .meilisearch import Indexer as MeiliSearchIndexer


class DbIndexer:
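    """Fallback indexer that queries the local database directly (no search engine required)."""
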
    @classmethod
    def check(cls):
        pass

    @classmethod
    def init(cls):
        pass

    @classmethod
    def search(cls, q, page=1, categories=None, tag=None, sort=None):
        result = lambda: None  # noqa
        result.items = Item.objects.filter(title__contains=q)[:10]
        result.num_pages = 1
        result.count = len(result.items)
        return result

    @classmethod
    def update_model_indexable(cls, model):
        pass

    @classmethod
    def register_list_model(cls, list_model):
        pass

    @classmethod
    def register_piece_model(cls, model):
        pass

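
# Pick the active search backend: Typesense when configured, otherwise the DB fallback above.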
# if settings.SEARCH_BACKEND == "MEILISEARCH":
#
# el
if settings.SEARCH_BACKEND == "TYPESENSE":
    Indexer = TypeSenseIndexer
else:
    Indexer = DbIndexer


def query_index(keywords, categories=None, tag=None, page=1, prepare_external=True):
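    """
    Query the active search index and post-process the results.

    Returns a tuple (items, num_pages, count, duplicated_items). Hits sharing an
    isbn/imdb/work id with an earlier hit, and shows whose season is already listed,
    are moved to duplicated_items. When prepare_external is True, result URLs are
    cached briefly so external site search can skip them.
    """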
    if (
        page < 1
        or page > 99
        or (not tag and isinstance(keywords, str) and len(keywords) < 2)
    ):
        return [], 0, 0, []
    result = Indexer.search(keywords, page=page, categories=categories, tag=tag)
    keys = set()
    items = []
    duplicated_items = []
    urls = []
    for i in result.items:
        if i.is_deleted or i.merged_to_item:  # can only happen if the index is stale
            continue
        if i.class_name == "work":  # TODO: add searchable_item_class global config
            continue
        my_key = (
            [i.isbn]
            if hasattr(i, "isbn")
            else ([i.imdb_code] if hasattr(i, "imdb_code") else [])
        )
        if hasattr(i, "works"):
            my_key += [w[0] for w in i.works.all().values_list("id")]
        if len(my_key):
            sl = len(keys) + len(my_key)
            keys.update(my_key)
            # skip duplicates sharing the same imdb, isbn or work id
            if len(keys) < sl:
                duplicated_items.append(i)
            else:
                items.append(i)
        else:
            items.append(i)
        for res in i.external_resources.all():
            urls.append(res.url)
    # hide a show if one of its seasons is already in the results
    seasons = [i for i in items if i.__class__ == TVSeason]
    for season in seasons:
        if season.show in items:
            duplicated_items.append(season.show)
            items.remove(season.show)

    if prepare_external:
        # remember result urls so external search can skip known sites
        cache_key = f"search_{','.join(categories or [])}_{keywords}"
        urls = list(set(cache.get(cache_key, []) + urls))
        cache.set(cache_key, urls, timeout=300)

    return items, result.num_pages, result.count, duplicated_items


def get_fetch_lock(user, url):
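    """
    Return True if the caller may fetch url now, False if rate-limited.

    Combines a short per-user (or global) lock with a 2-hour per-URL lock,
    both stored in the cache.
    """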
    if user and user.is_authenticated:
        _fetch_lock_key = f"_fetch_lock:{user.id}"
        _fetch_lock_ttl = 1 if settings.DEBUG else 3
    else:
        _fetch_lock_key = "_fetch_lock"
        _fetch_lock_ttl = 1 if settings.DEBUG else 15
    if cache.get(_fetch_lock_key):
        return False
    cache.set(_fetch_lock_key, 1, timeout=_fetch_lock_ttl)
    # do not fetch the same url twice in 2 hours
    _fetch_lock_key = f"_fetch_lock:{url}"
    _fetch_lock_ttl = 1 if settings.DEBUG else 7200
    if cache.get(_fetch_lock_key):
        return False
    cache.set(_fetch_lock_key, 1, timeout=_fetch_lock_ttl)
    return True


def enqueue_fetch(url, is_refetch, user=None):
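    """
    Enqueue a background fetch for url and return the job id.

    The job id is derived from the URL hash, so a fetch already queued or
    running for the same URL is not enqueued again.
    """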
    job_id = "fetch_" + hashlib.md5(url.encode()).hexdigest()
    in_progress = False
    try:
        job = Job.fetch(id=job_id, connection=django_rq.get_connection("fetch"))
        in_progress = job.get_status() in ["queued", "started"]
    except Exception:
        in_progress = False
    if not in_progress:
        django_rq.get_queue("fetch").enqueue(
            _fetch_task, url, is_refetch, user, job_id=job_id
        )
    return job_id


def _fetch_task(url, is_refetch, user):
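    """
    RQ task: fetch url via its site adapter.

    Returns the fetched item's url, "-" if the fetch failed, or None if no
    site handles the url.
    """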
    item_url = "-"
    with set_actor(user if user and user.is_authenticated else None):
        try:
            site = SiteManager.get_site_by_url(url)
            if not site:
                return None
            site.get_resource_ready(ignore_existing_content=is_refetch)
            item = site.get_item()
            if item:
                logger.info(f"fetched {url} {item.url} {item}")
                item_url = item.url
            else:
                logger.error(f"fetch {url} failed")
        except DownloadError as e:
            if e.response_type != RESPONSE_CENSORSHIP:
                logger.error(f"fetch {url} error", extra={"exception": e})
        except Exception as e:
            logger.error(f"parse {url} error", extra={"exception": e})
    return item_url