improve downloader compatibility

parent b4c69768b4
commit c6ecbd38c1

3 changed files with 50 additions and 9 deletions

.gitignore (vendored): 3 changed lines
@@ -31,9 +31,10 @@ log
 # conf folders
 /conf
+/neodb
 /playground
 
 # typesense folder
 /typesense-data
 
 # test coverage
-.coverage
+/.coverage

@@ -11,7 +11,7 @@ import re
 import time
 import logging
 from lxml import html
-
+from urllib.parse import quote
 
 _logger = logging.getLogger(__name__)
 
@@ -138,24 +138,24 @@ class BasicDownloader:
 class ProxiedDownloader(BasicDownloader):
     def get_proxied_urls(self):
         urls = []
-        if settings.PROXYCRAWL_KEY is not None:
-            urls.append(
-                f"https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={self.url}"
-            )
         if settings.SCRAPESTACK_KEY is not None:
             # urls.append(f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={self.url}')
             urls.append(
-                f"http://api.scrapestack.com/scrape?keep_headers=1&access_key={settings.SCRAPESTACK_KEY}&url={self.url}"
+                f"http://api.scrapestack.com/scrape?keep_headers=1&access_key={settings.SCRAPESTACK_KEY}&url={quote(self.url)}"
             )
+        if settings.PROXYCRAWL_KEY is not None:
+            urls.append(
+                f"https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={quote(self.url)}"
+            )
         if settings.SCRAPERAPI_KEY is not None:
             urls.append(
-                f"http://api.scraperapi.com/?api_key={settings.SCRAPERAPI_KEY}&url={self.url}"
+                f"http://api.scraperapi.com/?api_key={settings.SCRAPERAPI_KEY}&url={quote(self.url)}"
             )
         return urls
 
     def get_special_proxied_url(self):
         return (
-            f"{settings.LOCAL_PROXY}?url={self.url}"
+            f"{settings.LOCAL_PROXY}?url={quote(self.url)}"
            if settings.LOCAL_PROXY is not None
            else None
        )
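
Note on the hunk above (not part of the diff): the target URL is embedded as the url= query parameter of each proxy endpoint, so any ?, & or = it contains would otherwise be parsed as parameters of the proxy request itself; urllib.parse.quote percent-encodes those characters. A minimal standalone sketch, using a hypothetical proxy endpoint and target URL:

    from urllib.parse import quote

    target = "https://example.org/search?q=dune&page=2"  # hypothetical target URL

    # Without quoting, "&page=2" is read as a parameter of the proxy request:
    #   http://proxy.example/scrape?url=https://example.org/search?q=dune&page=2
    print(f"http://proxy.example/scrape?url={target}")

    # With quoting, the whole target survives as a single url= value:
    #   http://proxy.example/scrape?url=https%3A//example.org/search%3Fq%3Ddune%26page%3D2
    print(f"http://proxy.example/scrape?url={quote(target)}")
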
catalog/management/commands/crawl.py (new file, 40 lines)

@@ -0,0 +1,40 @@
+from django.core.management.base import BaseCommand
+from catalog.common import *
+import re
+from urllib.parse import urljoin
+
+
+class Command(BaseCommand):
+    help = "Crawl content"
+
+    def add_arguments(self, parser):
+        parser.add_argument("start", type=str, help="URL to start with")
+        parser.add_argument("--pattern", help="pattern to navigate", action="store")
+
+    def handle(self, *args, **options):
+        queue = [str(options["start"])]
+        pattern = options["pattern"] or ""
+        history = []
+        item_patterns = []
+        for site in SiteManager.registry.values():
+            if site.URL_PATTERNS:
+                item_patterns += site.URL_PATTERNS
+        while queue and len(history) < 1000:
+            url = queue.pop(0)
+            history.append(url)
+            self.stdout.write(f"Navigating {url}")
+            content = ProxiedDownloader(url).download().html()
+            urls = content.xpath("//a/@href")
+            for _u in urls:
+                u = urljoin(url, _u)
+                if u not in history and u not in queue:
+                    if len([p for p in item_patterns if re.match(p, u)]) > 0:
+                        site = SiteManager.get_site_by_url(u)
+                        u = site.url
+                        if u not in history:
+                            history.append(u)
+                            self.stdout.write(f"Fetching {u}")
+                            site.get_resource_ready()
+                    elif pattern and u.find(pattern) >= 0:
+                        queue.append(u)
+        self.stdout.write(self.style.SUCCESS(f"Done."))
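
For reference (not part of the commit): as a Django management command, the crawler would be run through manage.py with the positional start URL and the optional --pattern filter declared in add_arguments above, or invoked programmatically; the URL and pattern values below are hypothetical.

    # Shell (assumed): python manage.py crawl https://example.org/books/ --pattern /books/
    # Programmatically, via Django's standard call_command helper:
    from django.core.management import call_command

    call_command("crawl", "https://example.org/books/", pattern="/books/")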