ignore download error during crawl
parent 4277908023
commit bd991412b4
1 changed file with 5 additions and 1 deletion
@@ -3,6 +3,7 @@ from urllib.parse import urljoin
 from django.core.management.base import BaseCommand
 from loguru import logger
+from lxml import html
 
 from catalog.common import *
 
 
@@ -27,7 +28,10 @@ class Command(BaseCommand):
             url = queue.pop(0)
             history.append(url)
             logger.info(f"Navigating {url}")
-            content = ProxiedDownloader(url).download().html()
+            try:
+                content = ProxiedDownloader(url).download().html()
+            except Exception:
+                content = html.fromstring("<html />")
             urls = content.xpath("//a/@href")
             for _u in urls:  # type:ignore
                 u = urljoin(url, _u)
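For context, the effect of the change is that a failed download no longer aborts the crawl: the exception is swallowed and an empty lxml document is substituted, so the subsequent xpath and urljoin steps simply find no links and the loop moves on to the next queued URL. Below is a minimal, standalone sketch of that fallback pattern; the fetch() helper is a hypothetical stand-in for ProxiedDownloader(url).download().html(), which lives in catalog.common and is not shown in this diff.

# Standalone illustration of the fallback used in this commit.
# fetch() is a placeholder that simulates a failing download.
from urllib.parse import urljoin

from lxml import html


def fetch(url: str):
    """Stand-in for ProxiedDownloader(url).download().html(); assumed to raise on failure."""
    raise RuntimeError("download failed")


url = "https://example.org/start"
try:
    content = fetch(url)
except Exception:
    # Ignore the download error and continue with an empty document,
    # so one bad URL cannot stop the whole crawl.
    content = html.fromstring("<html />")

urls = [urljoin(url, u) for u in content.xpath("//a/@href")]
print(urls)  # [] — no links found, the crawl loop would just continue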