import re
from urllib.parse import urljoin

from django.core.management.base import BaseCommand
from loguru import logger
from lxml import html

from catalog.common import *


class Command(BaseCommand):
    help = "Crawl content"

    def add_arguments(self, parser):
        parser.add_argument("start", type=str, help="URL to start with")
        parser.add_argument("--pattern", help="pattern to navigate", action="store")

    def handle(self, *args, **options):
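        """Breadth-first crawl from a start URL, fetching recognized items."""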
        logger.info("Crawl starts.")
        queue = [str(options["start"])]
        pattern = options["pattern"] or ""
        history = []
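        # Collect the item URL regexes of every registered site so item
        # links can be recognized during the crawl.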
        item_patterns = []
        for site in SiteManager.registry.values():
            if site.URL_PATTERNS:
                item_patterns += site.URL_PATTERNS
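        # Breadth-first traversal, capped at 1000 visited URLs.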
        while queue and len(history) < 1000:
            url = queue.pop(0)
            history.append(url)
            logger.info(f"Navigating {url}")
            try:
                content = ProxiedDownloader(url).download().html()
            except Exception:
                content = html.fromstring("<html />")
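            # Extract every link on the page and resolve it against the
            # current URL to get an absolute form.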
            urls = content.xpath("//a/@href")
            for _u in urls:  # type:ignore
                u = urljoin(url, _u)
                if u not in history and u not in queue:
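                    # A link matching a known site's item pattern is fetched
                    # as a catalog resource.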
                    if any(re.match(p, u) for p in item_patterns):
                        site = SiteManager.get_site_by_url(u)
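                        # Use the site's canonical URL for de-duplication
                        # before fetching the resource.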
                        if site:
                            u = site.url
                            if u not in history:
                                history.append(u)
                                logger.info(f"Fetching {u}")
                                site.get_resource_ready()
                        else:
                            logger.warning(f"Unable to parse {u}")
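                    # Any other link is queued for navigation when it
                    # contains the --pattern substring.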
                    elif pattern and pattern in u:
                        queue.append(u)
        logger.info("Crawl finished.")
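
# Usage sketch (assumptions: this file lives under a Django app's
# management/commands/ directory, e.g. catalog/management/commands/crawl.py,
# so the command name is "crawl"; the URL and pattern below are placeholders):
#
#   python manage.py crawl https://example.org/books/ --pattern /books/
#
# This navigates links containing "/books/" and fetches any link matching a
# registered site's item URL pattern, stopping after 1000 visited URLs.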