lib.itmens/catalog/management/commands/crawl.py

import re
from urllib.parse import urljoin

from django.core.management.base import BaseCommand
from loguru import logger
from lxml import html

from catalog.common import *


class Command(BaseCommand):
    help = "Crawl content"

    def add_arguments(self, parser):
        parser.add_argument("start", type=str, help="URL to start with")
        parser.add_argument("--pattern", help="pattern to navigate", action="store")
    def handle(self, *args, **options):
        logger.info("Crawl starts.")
        queue = [str(options["start"])]
        pattern = options["pattern"] or ""
        history = []
        # Collect item-page URL regexes from every registered site.
        item_patterns = []
        for site in SiteManager.registry.values():
            if site.URL_PATTERNS:
                item_patterns += site.URL_PATTERNS
        # Breadth-first crawl from the start URL, capped at 1000 visited pages.
        while queue and len(history) < 1000:
            url = queue.pop(0)
            history.append(url)
            logger.info(f"Navigating {url}")
            try:
                content = ProxiedDownloader(url).download().html()
            except Exception:
                content = html.fromstring("<html />")
            urls = content.xpath("//a/@href")
            for _u in urls:  # type:ignore
                u = urljoin(url, _u)
                if u not in history and u not in queue:
                    if len([p for p in item_patterns if re.match(p, u)]) > 0:
                        # Link looks like an item page: fetch it via its site adapter.
                        site = SiteManager.get_site_by_url(u)
                        if site:
                            u = site.url
                            if u not in history:
                                history.append(u)
                                logger.info(f"Fetching {u}")
                                site.get_resource_ready()
                        else:
                            logger.warning(f"unable to parse {u}")
                    elif pattern and u.find(pattern) >= 0:
                        # Non-item link: only follow it if it contains --pattern.
                        queue.append(u)
        logger.info("Crawl finished.")