# lib.itmens/catalog/common/sites.py
"""
Site and SiteManager

Site should inherit from AbstractSite.
A Site should map to a unique set of URL patterns.
A Site may scrape a URL and store the result in a ResourceContent.
A ResourceContent persists as an ExternalResource, which may link to an Item.
"""
import json
import logging
import re
from dataclasses import dataclass, field
from typing import Callable

import django_rq

from .models import ExternalResource, IdealIdTypes, IdType, Item

_logger = logging.getLogger(__name__)


@dataclass
class ResourceContent:
lookup_ids: dict = field(default_factory=dict)
metadata: dict = field(default_factory=dict)
cover_image: bytes | None = None
cover_image_extention: str | None = None

    def dict(self):
        return {"metadata": self.metadata, "lookup_ids": self.lookup_ids}

    def to_json(self) -> str:
        return json.dumps({"metadata": self.metadata, "lookup_ids": self.lookup_ids})


class AbstractSite:
    """
    Abstract class to represent a site
    """

    SITE_NAME = None
    ID_TYPE = None
    WIKI_PROPERTY_ID = "P0undefined0"
    DEFAULT_MODEL = None
    URL_PATTERNS = [r"\w+://undefined/(\d+)"]
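
    # A sketch of a concrete subclass (names below are illustrative, not from
    # this codebase; ID_TYPE must be a registered IdType, unique per site, and
    # URL_PATTERNS must capture the id value as group 1):
    #
    #     @SiteManager.register
    #     class ExampleSite(AbstractSite):
    #         SITE_NAME = "Example"
    #         ID_TYPE = IdType.Example  # hypothetical member
    #         URL_PATTERNS = [r"\w+://example\.org/item/(\d+)"]
    #         WIKI_PROPERTY_ID = "P0"
    #         DEFAULT_MODEL = None  # or a concrete Item model
    #
    #         @classmethod
    #         def id_to_url(cls, id_value):
    #             return "https://example.org/item/" + id_value
    #
    #         def scrape(self):
    #             return ResourceContent(metadata={"title": "..."})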

    @classmethod
    def validate_url(cls, url: str):
        return any(re.match(p, url) for p in cls.URL_PATTERNS)

    @classmethod
    def validate_url_fallback(cls, url: str):
        return False

    @classmethod
    def id_to_url(cls, id_value):
        return "https://undefined/" + id_value

    @classmethod
    def url_to_id(cls, url: str):
        u = next(
            (re.match(p, url) for p in cls.URL_PATTERNS if re.match(p, url)),
            None,
        )
        return u[1] if u else None

    def __str__(self):
        return f"<{self.__class__.__name__}: {self.url}>"

    def __init__(self, url=None, id_value=None):
        # use id if possible, url will be cleaned up by id_to_url()
        self.id_value = id_value or (self.url_to_id(url) if url else None)
        self.url = self.id_to_url(self.id_value) if self.id_value else None
        self.resource = None

    def get_resource(self) -> ExternalResource:
        if not self.resource:
            self.resource = ExternalResource.objects.filter(url=self.url).first()
            if self.resource is None:
                self.resource = ExternalResource.objects.filter(
                    id_type=self.ID_TYPE, id_value=self.id_value
                ).first()
            if self.resource is None:
                self.resource = ExternalResource(
                    id_type=self.ID_TYPE, id_value=self.id_value, url=self.url
                )
        return self.resource

    def scrape(self) -> ResourceContent:
        """Subclasses should implement this and return a ResourceContent object."""
        data = ResourceContent()
        return data

    def scrape_additional_data(self):
        pass

    @classmethod
def get_model_for_resource(cls, resource):
model = resource.get_preferred_model()
return model or cls.DEFAULT_MODEL

    @classmethod
def match_existing_item_for_resource(cls, resource) -> Item | None:
model = cls.get_model_for_resource(resource)
if not model:
return None
        t, v = model.get_best_lookup_id(resource.get_all_lookup_ids())
        matched = None
        if t is not None:
            # first, try matching on both the best lookup id and the title
            matched = model.objects.filter(
                primary_lookup_id_type=t,
                primary_lookup_id_value=v,
                title=resource.metadata["title"],
            ).first()
            if matched is None and resource.id_type not in [
                IdType.DoubanMusic,  # DoubanMusic has a lot of dirty data sharing the same UPC
                # IdType.Goodreads, # previous scraper generated some dirty data
            ]:
                # then fall back to the best lookup id alone
                matched = model.objects.filter(
                    primary_lookup_id_type=t, primary_lookup_id_value=v
                ).first()
        if matched is None:
            # last resort: match on the resource's own id
            matched = model.objects.filter(
                primary_lookup_id_type=resource.id_type,
                primary_lookup_id_value=resource.id_value,
            ).first()
        if matched and matched.merged_to_item:
            matched = matched.merged_to_item
        if (
            matched
            and matched.primary_lookup_id_type not in IdealIdTypes
            and t in IdealIdTypes
        ):
            # upgrade the matched item's primary lookup id to the more ideal type
            matched.primary_lookup_id_type = t
            matched.primary_lookup_id_value = v
            matched.save()
        return matched

    @classmethod
def match_or_create_item_for_resource(cls, resource):
previous_item = resource.item
resource.item = cls.match_existing_item_for_resource(resource) or previous_item
if resource.item is None:
model = cls.get_model_for_resource(resource)
if not model:
return None
t, v = model.get_best_lookup_id(resource.get_all_lookup_ids())
obj = model.copy_metadata(resource.metadata)
obj["primary_lookup_id_type"] = t
obj["primary_lookup_id_value"] = v
resource.item = model.objects.create(**obj)
if previous_item != resource.item:
if previous_item:
previous_item.log_action({"unmatch": [str(resource), ""]})
resource.item.log_action({"!match": ["", str(resource)]})
resource.save(update_fields=["item"])
return resource.item

    def get_item(self):
        p = self.get_resource()
        if not p:
            # raise ValueError(f'resource not available for {self.url}')
            return None
        if not p.ready:
            # raise ValueError(f'resource not ready for {self.url}')
            return None
        return self.match_or_create_item_for_resource(p)

    @property
    def ready(self):
        return bool(self.resource and self.resource.ready)

    def get_resource_ready(
        self,
        auto_save=True,
        auto_create=True,
        auto_link=True,
        preloaded_content=None,
        ignore_existing_content=False,
    ) -> ExternalResource | None:
        """
        Return an ExternalResource in scraped state if possible.

        Parameters
        ----------
        auto_save : bool
            automatically save the ExternalResource and, if auto_create, the Item too
        auto_create : bool
            automatically create an Item if one does not exist yet
        auto_link : bool
            automatically scrape the linked resources (e.g. a TVSeason may have a linked TVShow)
        preloaded_content : ResourceContent or dict
            skip scrape() and use this as the scraped result
        ignore_existing_content : bool
            if the ExternalResource already has content, ignore it and either use preloaded_content or call scrape()
        """
if auto_link:
auto_create = True
if auto_create:
auto_save = True
p = self.get_resource()
resource_content = {}
if not self.resource:
return None
if not p.ready or ignore_existing_content:
if isinstance(preloaded_content, ResourceContent):
resource_content = preloaded_content
elif isinstance(preloaded_content, dict):
resource_content = ResourceContent(**preloaded_content)
else:
resource_content = self.scrape()
if resource_content:
p.update_content(resource_content)
if not p.ready:
_logger.error(f"unable to get resource {self.url} ready")
return None
if auto_create: # and (p.item is None or p.item.is_deleted):
self.get_item()
if auto_save:
p.save()
if p.item:
p.item.merge_data_from_external_resources(ignore_existing_content)
p.item.save()
self.scrape_additional_data()
if auto_link:
for linked_resource in p.required_resources: # type: ignore
linked_url = linked_resource.get("url")
if linked_url:
linked_site = SiteManager.get_site_by_url(linked_url)
if linked_site:
linked_site.get_resource_ready(
auto_link=False,
preloaded_content=linked_resource.get("content"),
)
else:
_logger.error(f"unable to get site for {linked_url}")
if p.related_resources:
django_rq.get_queue("crawl").enqueue(crawl_related_resources_task, p.pk)
if p.item:
p.item.update_linked_items_from_external_resource(p)
p.item.save()
return p


class SiteManager:
    registry = {}

    @staticmethod
    def register(target) -> Callable:
id_type = target.ID_TYPE
if id_type in SiteManager.registry:
raise ValueError(f"Site for {id_type} already exists")
SiteManager.registry[id_type] = target
return target

    @staticmethod
    def get_site_cls_by_id_type(typ: str) -> type[AbstractSite]:
        if typ in SiteManager.registry:
            return SiteManager.registry[typ]
        else:
            raise ValueError(f"Site for {typ} not found")

    @staticmethod
    def get_site_by_url(url: str) -> AbstractSite | None:
        if not url:
            return None
        # try sites whose URL patterns match first, then each site's fallback validator
        cls = next(
            filter(lambda p: p.validate_url(url), SiteManager.registry.values()), None
        )
        if cls is None:
            cls = next(
                filter(
                    lambda p: p.validate_url_fallback(url),
                    SiteManager.registry.values(),
                ),
                None,
            )
        return cls(url) if cls else None

    @staticmethod
    def get_site_by_id(id_type: IdType, id_value: str) -> AbstractSite | None:
        if id_type not in SiteManager.registry:
return None
cls = SiteManager.registry[id_type]
return cls(id_value=id_value)

    @staticmethod
def get_id_by_url(url: str):
site = SiteManager.get_site_by_url(url)
return site.url_to_id(url) if site else None

    @staticmethod
    def get_all_sites():
        return SiteManager.registry.values()


def crawl_related_resources_task(resource_pk):
resource = ExternalResource.objects.filter(pk=resource_pk).first()
if not resource:
_logger.warn(f"crawl resource not found {resource_pk}")
return
links = resource.related_resources
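    # each entry is expected to be a dict with "id_type"/"id_value" and/or
    # "url" keys, matching the lookups below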
for w in links: # type: ignore
try:
item = None
site = None
if w.get("id_value") and w.get("id_type"):
site = SiteManager.get_site_by_id(w["id_type"], w["id_value"])
if not site and w.get("url"):
site = SiteManager.get_site_by_url(w["url"])
if site:
site.get_resource_ready(ignore_existing_content=False, auto_link=True)
item = site.get_item()
if item:
                _logger.info(f"crawled {w} {item}")
            else:
                _logger.warning(f"crawl {w} failed")
        except Exception as e:
            _logger.warning(f"crawl {w} error {e}")