improve downloader compatibility

Your Name 2023-02-12 21:03:53 -05:00 committed by Henri Dickson
parent b4c69768b4
commit c6ecbd38c1
3 changed files with 50 additions and 9 deletions

.gitignore

@@ -31,9 +31,10 @@ log
 # conf folders
 /conf
 /neodb
+/playground

 # typesense folder
 /typesense-data

 # test coverage
-.coverage
+/.coverage


@@ -11,7 +11,7 @@ import re
 import time
 import logging
 from lxml import html
+from urllib.parse import quote

 _logger = logging.getLogger(__name__)
@@ -138,24 +138,24 @@ class BasicDownloader:
 class ProxiedDownloader(BasicDownloader):
     def get_proxied_urls(self):
         urls = []
-        if settings.PROXYCRAWL_KEY is not None:
-            urls.append(
-                f"https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={self.url}"
-            )
         if settings.SCRAPESTACK_KEY is not None:
             # urls.append(f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={self.url}')
             urls.append(
-                f"http://api.scrapestack.com/scrape?keep_headers=1&access_key={settings.SCRAPESTACK_KEY}&url={self.url}"
+                f"http://api.scrapestack.com/scrape?keep_headers=1&access_key={settings.SCRAPESTACK_KEY}&url={quote(self.url)}"
             )
+        if settings.PROXYCRAWL_KEY is not None:
+            urls.append(
+                f"https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={quote(self.url)}"
+            )
         if settings.SCRAPERAPI_KEY is not None:
             urls.append(
-                f"http://api.scraperapi.com/?api_key={settings.SCRAPERAPI_KEY}&url={self.url}"
+                f"http://api.scraperapi.com/?api_key={settings.SCRAPERAPI_KEY}&url={quote(self.url)}"
             )
         return urls

     def get_special_proxied_url(self):
         return (
-            f"{settings.LOCAL_PROXY}?url={self.url}"
+            f"{settings.LOCAL_PROXY}?url={quote(self.url)}"
            if settings.LOCAL_PROXY is not None
            else None
        )
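
Why quote() matters here: the target URL is passed to each proxy service as a query-string parameter, so any "?", "=" or "&" inside it would otherwise be parsed as part of the proxy API's own query string and the target URL would be truncated. A minimal sketch of the effect (not from the commit; the target URL and the KEY placeholder are made up):

    from urllib.parse import quote

    target = "https://example.com/search?q=books&page=2"

    # Unescaped: the proxy would read "page=2" as its own parameter,
    # truncating the target URL after "q=books".
    print(f"http://api.scrapestack.com/scrape?access_key=KEY&url={target}")

    # Escaped: ":", "?", "=" and "&" become %3A, %3F, %3D and %26, so the
    # whole target survives as the value of a single "url" parameter.
    print(f"http://api.scrapestack.com/scrape?access_key=KEY&url={quote(target)}")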


@@ -0,0 +1,40 @@
+from django.core.management.base import BaseCommand
+from catalog.common import *
+import re
+from urllib.parse import urljoin
+
+
+class Command(BaseCommand):
+    help = "Crawl content"
+
+    def add_arguments(self, parser):
+        parser.add_argument("start", type=str, help="URL to start with")
+        parser.add_argument("--pattern", help="pattern to navigate", action="store")
+
+    def handle(self, *args, **options):
+        queue = [str(options["start"])]
+        pattern = options["pattern"] or ""
+        history = []
+        item_patterns = []
+        for site in SiteManager.registry.values():
+            if site.URL_PATTERNS:
+                item_patterns += site.URL_PATTERNS
+        while queue and len(history) < 1000:
+            url = queue.pop(0)
+            history.append(url)
+            self.stdout.write(f"Navigating {url}")
+            content = ProxiedDownloader(url).download().html()
+            urls = content.xpath("//a/@href")
+            for _u in urls:
+                u = urljoin(url, _u)
+                if u not in history and u not in queue:
+                    if len([p for p in item_patterns if re.match(p, u)]) > 0:
+                        site = SiteManager.get_site_by_url(u)
+                        u = site.url
+                        if u not in history:
+                            history.append(u)
+                            self.stdout.write(f"Fetching {u}")
+                            site.get_resource_ready()
+                    elif pattern and u.find(pattern) >= 0:
+                        queue.append(u)
+        self.stdout.write(self.style.SUCCESS(f"Done."))