ignore download error during crawl

This commit is contained in:
Your Name 2025-02-22 18:52:40 -05:00 committed by Henri Dickson
parent 4277908023
commit bd991412b4

View file

@@ -3,6 +3,7 @@ from urllib.parse import urljoin
from django.core.management.base import BaseCommand
from loguru import logger
from lxml import html
from catalog.common import *
@@ -27,7 +28,10 @@ class Command(BaseCommand):
url = queue.pop(0)
history.append(url)
logger.info(f"Navigating {url}")
try:
    content = ProxiedDownloader(url).download().html()
except Exception:
    content = html.fromstring("<html />")
urls = content.xpath("//a/@href")
for _u in urls:  # type:ignore
u = urljoin(url, _u)