ignore download error during crawl
parent 4277908023
commit bd991412b4
1 changed file with 5 additions and 1 deletion
@@ -3,6 +3,7 @@ from urllib.parse import urljoin
 from django.core.management.base import BaseCommand
 from loguru import logger
+from lxml import html
 
 from catalog.common import *
 
 
@@ -27,7 +28,10 @@ class Command(BaseCommand):
             url = queue.pop(0)
             history.append(url)
             logger.info(f"Navigating {url}")
-            content = ProxiedDownloader(url).download().html()
+            try:
+                content = ProxiedDownloader(url).download().html()
+            except Exception:
+                content = html.fromstring("<html />")
             urls = content.xpath("//a/@href")
             for _u in urls:  # type:ignore
                 u = urljoin(url, _u)
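For context, the effect of the change is that a failed download no longer aborts the crawl: the exception is swallowed and an empty lxml document is substituted, so the subsequent xpath and urljoin steps simply find no links and the loop moves on to the next queued URL. Below is a minimal, standalone sketch of that fallback pattern; the fetch() helper is a hypothetical stand-in for ProxiedDownloader(url).download().html(), which lives in catalog.common and is not shown in this diff.

# Standalone illustration of the fallback used in this commit.
# fetch() is a placeholder that simulates a failing download.
from urllib.parse import urljoin

from lxml import html


def fetch(url: str):
    """Stand-in for ProxiedDownloader(url).download().html(); assumed to raise on failure."""
    raise RuntimeError("download failed")


url = "https://example.org/start"
try:
    content = fetch(url)
except Exception:
    # Ignore the download error and continue with an empty document,
    # so one bad URL cannot stop the whole crawl.
    content = html.fromstring("<html />")

urls = [urljoin(url, u) for u in content.xpath("//a/@href")]
print(urls)  # [] — no links found, the crawl loop would just continue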