"""
Site and SiteManager

Site should inherit from AbstractSite.
A Site should map to a unique set of URL patterns.
A Site may scrape a URL and store the result in a ResourceContent.
ResourceContent persists as an ExternalResource, which may link to an Item.
"""

import json
import logging
import re
from dataclasses import dataclass, field
from typing import Callable

import django_rq

from .models import ExternalResource, IdType, Item

_logger = logging.getLogger(__name__)


@dataclass
class ResourceContent:
lookup_ids: dict = field(default_factory=dict)
metadata: dict = field(default_factory=dict)
cover_image: bytes | None = None
cover_image_extention: str | None = None
def dict(self):
return {"metadata": self.metadata, "lookup_ids": self.lookup_ids}
def to_json(self) -> str:
return json.dumps({"metadata": self.metadata, "lookup_ids": self.lookup_ids})
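
# A hypothetical ResourceContent, like what a scrape() implementation might return
# (the values here are assumptions, for illustration only):
#
#   ResourceContent(
#       metadata={"title": "Example Title"},
#       lookup_ids={IdType.DoubanMusic: "12345"},
#   )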
class AbstractSite:
"""
Abstract class to represent a site
"""
SITE_NAME = None
ID_TYPE = None
WIKI_PROPERTY_ID = "P0undefined0"
DEFAULT_MODEL = None
URL_PATTERNS = [r"\w+://undefined/(\d+)"]
@classmethod
    def validate_url(cls, url: str):
        return any(re.match(p, url) for p in cls.URL_PATTERNS)
@classmethod
def validate_url_fallback(cls, url: str):
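        # subclasses may override this to claim URLs that URL_PATTERNS cannot match;
        # SiteManager.get_site_by_url() consults it only when no pattern matches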
return False
@classmethod
def id_to_url(cls, id_value):
return "https://undefined/" + id_value
@classmethod
def url_to_id(cls, url: str):
        for p in cls.URL_PATTERNS:
            m = re.match(p, url)
            if m:
                return m[1]
        return None
def __str__(self):
return f"<{self.__class__.__name__}: {self.url}>"
def __init__(self, url=None, id_value=None):
# use id if possible, url will be cleaned up by id_to_url()
self.id_value = id_value or (self.url_to_id(url) if url else None)
self.url = self.id_to_url(self.id_value) if self.id_value else None
self.resource = None
def get_resource(self) -> ExternalResource:
if not self.resource:
self.resource = ExternalResource.objects.filter(url=self.url).first()
if self.resource is None:
self.resource = ExternalResource.objects.filter(
id_type=self.ID_TYPE, id_value=self.id_value
).first()
if self.resource is None:
self.resource = ExternalResource(
id_type=self.ID_TYPE, id_value=self.id_value, url=self.url
)
return self.resource
def scrape(self) -> ResourceContent:
"""subclass should implement this, return ResourceContent object"""
data = ResourceContent()
return data
def scrape_additional_data(self):
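        # optional hook for subclasses; get_resource_ready() calls it after the
        # resource (and item, if any) has been saved; the default is a no-op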
pass
@classmethod
def get_model_for_resource(cls, resource):
model = resource.get_preferred_model()
return model or cls.DEFAULT_MODEL
@classmethod
def match_existing_item_for_resource(cls, resource) -> Item | None:
model = cls.get_model_for_resource(resource)
if not model:
return None
t, v = model.get_best_lookup_id(resource.get_all_lookup_ids())
matched = None
if t is not None:
matched = model.objects.filter(
primary_lookup_id_type=t,
primary_lookup_id_value=v,
title=resource.metadata["title"],
).first()
if matched is None and resource.id_type not in [
IdType.DoubanMusic, # DoubanMusic has many dirty data with same UPC
# IdType.Goodreads, # previous scraper generated some dirty data
]:
matched = model.objects.filter(
primary_lookup_id_type=t, primary_lookup_id_value=v
).first()
return matched
@classmethod
def match_or_create_item_for_resource(cls, resource):
resource.item = cls.match_existing_item_for_resource(resource)
if resource.item is None:
model = cls.get_model_for_resource(resource)
if not model:
return None
t, v = model.get_best_lookup_id(resource.get_all_lookup_ids())
obj = model.copy_metadata(resource.metadata)
obj["primary_lookup_id_type"] = t
obj["primary_lookup_id_value"] = v
resource.item = model.objects.create(**obj)
return resource.item
def get_item(self):
p = self.get_resource()
if not p:
# raise ValueError(f'resource not available for {self.url}')
return None
if not p.ready:
# raise ValueError(f'resource not ready for {self.url}')
return None
return self.match_or_create_item_for_resource(p)
@property
def ready(self):
return bool(self.resource and self.resource.ready)
def get_resource_ready(
self,
auto_save=True,
auto_create=True,
auto_link=True,
preloaded_content=None,
ignore_existing_content=False,
) -> ExternalResource | None:
"""
Returns an ExternalResource in scraped state if possible
Parameters
----------
        auto_save : bool
            automatically saves the ExternalResource and, if auto_create, the Item too
        auto_create : bool
            automatically creates an Item if one does not exist yet
        auto_link : bool
            automatically scrapes linked resources (e.g. a TVSeason may have a linked TVShow)
        preloaded_content : ResourceContent or dict
            skips scrape() and uses this as the scraped result
        ignore_existing_content : bool
            if the ExternalResource already has content, ignores it and either uses preloaded_content or calls scrape()
"""
if auto_link:
auto_create = True
if auto_create:
auto_save = True
p = self.get_resource()
resource_content = {}
if not self.resource:
return None
if not p.ready or ignore_existing_content:
if isinstance(preloaded_content, ResourceContent):
resource_content = preloaded_content
elif isinstance(preloaded_content, dict):
resource_content = ResourceContent(**preloaded_content)
else:
resource_content = self.scrape()
if resource_content:
p.update_content(resource_content)
if not p.ready:
_logger.error(f"unable to get resource {self.url} ready")
return None
if auto_create: # and (p.item is None or p.item.is_deleted):
self.get_item()
if auto_save:
p.save()
if p.item:
p.item.merge_data_from_external_resources(ignore_existing_content)
p.item.save()
self.scrape_additional_data()
if auto_link:
for linked_resource in p.required_resources:
linked_site = SiteManager.get_site_by_url(linked_resource["url"])
if linked_site:
linked_site.get_resource_ready(
auto_link=False,
preloaded_content=linked_resource.get("content"),
)
else:
_logger.error(f'unable to get site for {linked_resource["url"]}')
if p.related_resources:
django_rq.get_queue("crawl").enqueue(crawl_related_resources_task, p.pk)
if p.item:
p.item.update_linked_items_from_external_resource(p)
p.item.save()
return p
class SiteManager:
registry = {}
@staticmethod
def register(target) -> Callable:
id_type = target.ID_TYPE
if id_type in SiteManager.registry:
raise ValueError(f"Site for {id_type} already exists")
SiteManager.registry[id_type] = target
return target
@staticmethod
def get_site_by_id_type(typ: str) -> AbstractSite | None:
return SiteManager.registry[typ]() if typ in SiteManager.registry else None
@staticmethod
def get_site_by_url(url: str) -> AbstractSite | None:
if not url:
return None
cls = next(
filter(lambda p: p.validate_url(url), SiteManager.registry.values()), None
)
if cls is None:
cls = next(
filter(
lambda p: p.validate_url_fallback(url),
SiteManager.registry.values(),
),
None,
)
return cls(url) if cls else None
@staticmethod
def get_site_by_id(id_type: IdType, id_value: str) -> AbstractSite | None:
        if id_type not in SiteManager.registry:
return None
cls = SiteManager.registry[id_type]
return cls(id_value=id_value)
@staticmethod
def get_id_by_url(url: str):
site = SiteManager.get_site_by_url(url)
return site.url_to_id(url) if site else None
@staticmethod
def get_all_sites():
        return SiteManager.registry.values()
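# give ExternalResource a convenience accessor to its Site; assigned here rather
# than defined on the model, presumably to avoid a circular import with .models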
ExternalResource.get_site = lambda resource: SiteManager.get_site_by_id_type(
resource.id_type
) # type: ignore
def crawl_related_resources_task(resource_pk):
resource = ExternalResource.objects.filter(pk=resource_pk).first()
if not resource:
        _logger.warning(f"crawl resource not found {resource_pk}")
return
links = resource.related_resources
for w in links:
try:
item = None
site = None
if w.get("id_value") and w.get("id_type"):
site = SiteManager.get_site_by_id(w["id_type"], w["id_value"])
if not site and w.get("url"):
site = SiteManager.get_site_by_url(w["url"])
if site:
site.get_resource_ready(ignore_existing_content=False, auto_link=True)
item = site.get_item()
if item:
_logger.info(f"crawled {w} {item}")
else:
                _logger.warning(f"crawl {w} failed")
except Exception as e:
            _logger.warning(f"crawl {w} error {e}")