"""
|
2022-12-15 17:29:35 -05:00
|
|
|
Site and SiteManager
|
2022-12-08 18:08:05 +00:00
|
|
|
|
|
|
|
Site should inherite from AbstractSite
|
|
|
|
a Site should map to a unique set of url patterns.
|
|
|
|
a Site may scrape a url and store result in ResourceContent
|
|
|
|
ResourceContent persists as an ExternalResource which may link to an Item
|
|
|
|
"""

import json
import logging
import re
from dataclasses import dataclass, field
from typing import Callable

import django_rq

from .models import ExternalResource, IdType, Item

_logger = logging.getLogger(__name__)


@dataclass
class ResourceContent:
    lookup_ids: dict = field(default_factory=dict)
    metadata: dict = field(default_factory=dict)
    cover_image: bytes | None = None
    cover_image_extention: str | None = None

    def dict(self):
        return {"metadata": self.metadata, "lookup_ids": self.lookup_ids}

    def to_json(self) -> str:
        return json.dumps({"metadata": self.metadata, "lookup_ids": self.lookup_ids})


class AbstractSite:
    """
    Abstract class to represent a site
    """

    SITE_NAME = None
    ID_TYPE = None
    WIKI_PROPERTY_ID = "P0undefined0"
    DEFAULT_MODEL = None
    URL_PATTERNS = [r"\w+://undefined/(\d+)"]

    @classmethod
    def validate_url(cls, url: str):
        u = next(
            iter([re.match(p, url) for p in cls.URL_PATTERNS if re.match(p, url)]),
            None,
        )
        return u is not None

    @classmethod
    def validate_url_fallback(cls, url: str):
        return False

    @classmethod
    def id_to_url(cls, id_value):
        return "https://undefined/" + id_value

    @classmethod
    def url_to_id(cls, url: str):
        u = next(
            iter([re.match(p, url) for p in cls.URL_PATTERNS if re.match(p, url)]),
            None,
        )
        return u[1] if u else None
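
    # e.g. with URL_PATTERNS = [r"\w+://example\.org/item/(\d+)"] (illustrative),
    # url_to_id("https://example.org/item/123") returns "123"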

    def __str__(self):
        return f"<{self.__class__.__name__}: {self.url}>"

    def __init__(self, url=None, id_value=None):
        # use the id if possible; the url will be cleaned up by id_to_url()
        self.id_value = id_value or (self.url_to_id(url) if url else None)
        self.url = self.id_to_url(self.id_value) if self.id_value else None
        self.resource = None

    def get_resource(self) -> ExternalResource:
        if not self.resource:
            self.resource = ExternalResource.objects.filter(url=self.url).first()
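            # fall back to matching by (id_type, id_value) when no resource has this url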
            if self.resource is None:
                self.resource = ExternalResource.objects.filter(
                    id_type=self.ID_TYPE, id_value=self.id_value
                ).first()
            if self.resource is None:
                self.resource = ExternalResource(
                    id_type=self.ID_TYPE, id_value=self.id_value, url=self.url
                )
        return self.resource

    def scrape(self) -> ResourceContent:
        """subclasses should implement this and return a ResourceContent object"""
        data = ResourceContent()
        return data
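
    # a real override (see the sketch at the top of this module) would fetch
    # self.url, parse the response, and populate metadata / lookup_ids / cover_image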

    def scrape_additional_data(self):
        pass

    @classmethod
    def get_model_for_resource(cls, resource):
        model = resource.get_preferred_model()
        return model or cls.DEFAULT_MODEL

    @classmethod
    def match_existing_item_for_resource(cls, resource) -> Item | None:
        model = cls.get_model_for_resource(resource)
        if not model:
            return None
        t, v = model.get_best_lookup_id(resource.get_all_lookup_ids())
        matched = None
        if t is not None:
            matched = model.objects.filter(
                primary_lookup_id_type=t,
                primary_lookup_id_value=v,
                title=resource.metadata["title"],
            ).first()
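            # if no title match, retry by lookup id alone, except for sources
            # known to produce dirty lookup ids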
            if matched is None and resource.id_type not in [
                IdType.DoubanMusic,  # DoubanMusic has a lot of dirty data with the same UPC
                # IdType.Goodreads, # previous scraper generated some dirty data
            ]:
                matched = model.objects.filter(
                    primary_lookup_id_type=t, primary_lookup_id_value=v
                ).first()
        return matched

    @classmethod
    def match_or_create_item_for_resource(cls, resource):
        resource.item = cls.match_existing_item_for_resource(resource)
        if resource.item is None:
            model = cls.get_model_for_resource(resource)
            if not model:
                return None
            t, v = model.get_best_lookup_id(resource.get_all_lookup_ids())
            obj = model.copy_metadata(resource.metadata)
            obj["primary_lookup_id_type"] = t
            obj["primary_lookup_id_value"] = v
            resource.item = model.objects.create(**obj)
        return resource.item

    def get_item(self):
        p = self.get_resource()
        if not p:
            # raise ValueError(f'resource not available for {self.url}')
            return None
        if not p.ready:
            # raise ValueError(f'resource not ready for {self.url}')
            return None
        return self.match_or_create_item_for_resource(p)

    @property
    def ready(self):
        return bool(self.resource and self.resource.ready)

    def get_resource_ready(
        self,
        auto_save=True,
        auto_create=True,
        auto_link=True,
        preloaded_content=None,
        ignore_existing_content=False,
    ) -> ExternalResource | None:
        """
        Returns an ExternalResource in scraped state if possible

        Parameters
        ----------
        auto_save : bool
            automatically saves the ExternalResource and, if auto_create, the Item too
        auto_create : bool
            automatically creates an Item if one does not exist yet
        auto_link : bool
            automatically scrapes linked resources (e.g. a TVSeason may have a linked TVShow)
        preloaded_content : ResourceContent or dict
            skips scrape() and uses this as the scraped result
        ignore_existing_content : bool
            if the ExternalResource already has content, ignore it and either use preloaded_content or call scrape()
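
        Example
        -------
        a typical caller (illustrative sketch)::

            site = SiteManager.get_site_by_url(url)
            resource = site.get_resource_ready() if site else None
            item = resource.item if resource else None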
        """
        if auto_link:
            auto_create = True
        if auto_create:
            auto_save = True
        p = self.get_resource()
        resource_content = {}
        if not self.resource:
            return None
        if not p.ready or ignore_existing_content:
            if isinstance(preloaded_content, ResourceContent):
                resource_content = preloaded_content
            elif isinstance(preloaded_content, dict):
                resource_content = ResourceContent(**preloaded_content)
            else:
                resource_content = self.scrape()
            if resource_content:
                p.update_content(resource_content)
        if not p.ready:
            _logger.error(f"unable to get resource {self.url} ready")
            return None
        if auto_create:  # and (p.item is None or p.item.is_deleted):
            self.get_item()
        if auto_save:
            p.save()
            if p.item:
                p.item.merge_data_from_external_resources(ignore_existing_content)
                p.item.save()
            self.scrape_additional_data()
            if auto_link:
                for linked_resource in p.required_resources:
                    linked_site = SiteManager.get_site_by_url(linked_resource["url"])
                    if linked_site:
                        linked_site.get_resource_ready(
                            auto_link=False,
                            preloaded_content=linked_resource.get("content"),
                        )
                    else:
                        _logger.error(f'unable to get site for {linked_resource["url"]}')
                if p.related_resources:
                    django_rq.get_queue("crawl").enqueue(crawl_related_resources_task, p.pk)
                if p.item:
                    p.item.update_linked_items_from_external_resource(p)
                    p.item.save()
        return p


class SiteManager:
    registry = {}

    @staticmethod
    def register(target) -> Callable:
        id_type = target.ID_TYPE
        if id_type in SiteManager.registry:
            raise ValueError(f"Site for {id_type} already exists")
        SiteManager.registry[id_type] = target
        return target
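
    # used as a class decorator on concrete Site subclasses, as in the
    # @SiteManager.register line of the sketch at the top of this module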

    @staticmethod
    def get_site_cls_by_id_type(typ: str) -> type[AbstractSite]:
        if typ in SiteManager.registry:
            return SiteManager.registry[typ]
        else:
            raise ValueError(f"Site for {typ} not found")

    @staticmethod
    def get_site_by_url(url: str) -> AbstractSite | None:
        if not url:
            return None
        cls = next(
            filter(lambda p: p.validate_url(url), SiteManager.registry.values()), None
        )
        if cls is None:
            cls = next(
                filter(
                    lambda p: p.validate_url_fallback(url),
                    SiteManager.registry.values(),
                ),
                None,
            )
        return cls(url) if cls else None
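
    # e.g. get_site_by_url("https://example.org/item/123") would return an
    # ExampleSite instance if that site's URL_PATTERNS match (illustrative)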

    @staticmethod
    def get_site_by_id(id_type: IdType, id_value: str) -> AbstractSite | None:
        if id_type not in SiteManager.registry:
            return None
        cls = SiteManager.registry[id_type]
        return cls(id_value=id_value)

    @staticmethod
    def get_id_by_url(url: str):
        site = SiteManager.get_site_by_url(url)
        return site.url_to_id(url) if site else None

    @staticmethod
    def get_all_sites():
        return SiteManager.registry.values()


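# give ExternalResource a hook to resolve the Site class registered for its id_type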
ExternalResource.get_site = lambda resource: SiteManager.get_site_cls_by_id_type(
    resource.id_type
)  # type: ignore


def crawl_related_resources_task(resource_pk):
    resource = ExternalResource.objects.filter(pk=resource_pk).first()
    if not resource:
        _logger.warning(f"crawl resource not found {resource_pk}")
        return
    links = resource.related_resources
    for w in links:
        try:
            item = None
            site = None
            if w.get("id_value") and w.get("id_type"):
                site = SiteManager.get_site_by_id(w["id_type"], w["id_value"])
            if not site and w.get("url"):
                site = SiteManager.get_site_by_url(w["url"])
            if site:
                site.get_resource_ready(ignore_existing_content=False, auto_link=True)
                item = site.get_item()
            if item:
                _logger.info(f"crawled {w} {item}")
            else:
                _logger.warning(f"crawl {w} failed")
        except Exception as e:
            _logger.warning(f"crawl {w} error {e}")