lib.itmens/catalog/management/commands/crawl.py

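"""Crawl pages breadth-first from a start URL.

Follows links whose URL contains the --pattern substring, and fetches any
link that matches a registered site's URL_PATTERNS as a catalog resource.
"""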
import re
from urllib.parse import urljoin

from django.core.management.base import BaseCommand
from loguru import logger
from lxml import html

from catalog.common import *


class Command(BaseCommand):
    help = "Crawl content"

    def add_arguments(self, parser):
        parser.add_argument("start", type=str, help="URL to start with")
        parser.add_argument("--pattern", help="pattern to navigate", action="store")

    def handle(self, *args, **options):
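        # Breadth-first crawl: keep following links that contain --pattern,
        # and fetch any link matching a registered site's item URL patterns.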
        logger.info("Crawl starts.")
        queue = [str(options["start"])]
        pattern = options["pattern"] or ""
        history = []
        item_patterns = []
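        # Gather item URL patterns from every registered site so item pages
        # can be recognized among crawled links.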
        for site in SiteManager.registry.values():
            if site.URL_PATTERNS:
                item_patterns += site.URL_PATTERNS
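        # FIFO queue gives breadth-first order; cap the crawl at 1000 visits.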
        while queue and len(history) < 1000:
            url = queue.pop(0)
            history.append(url)
            logger.info(f"Navigating {url}")
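            # Download through a proxy; fall back to an empty document so one
            # failed page never aborts the whole crawl.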
            try:
                content = ProxiedDownloader(url).download().html()
            except Exception:
                content = html.fromstring("<html />")
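            # Examine every link found on the page.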
            urls = content.xpath("//a/@href")
            for _u in urls:  # type:ignore
                u = urljoin(url, _u)
                if u not in history and u not in queue:
                    # A link matching a known item URL pattern is an item page.
                    if any(re.match(p, u) for p in item_patterns):
                        site = SiteManager.get_site_by_url(u)
                        if site:
                            # Use the canonical URL recognized by the site.
                            u = site.url
                            if u not in history:
                                history.append(u)
                                logger.info(f"Fetching {u}")
                                site.get_resource_ready()
                        else:
                            logger.warning(f"unable to parse {u}")
                    # Otherwise keep navigating links that contain --pattern.
                    elif pattern and u.find(pattern) >= 0:
                        queue.append(u)
        logger.info("Crawl finished.")