diff --git a/.gitignore b/.gitignore
index 906ad8d6..a0403515 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,9 +31,10 @@ log
 # conf folders
 /conf
 /neodb
+/playground
 
 # typesense folder
 /typesense-data
 
 # test coverage
-.coverage
+/.coverage
diff --git a/catalog/common/downloaders.py b/catalog/common/downloaders.py
index dc01f265..389ddbac 100644
--- a/catalog/common/downloaders.py
+++ b/catalog/common/downloaders.py
@@ -11,7 +11,7 @@ import re
 import time
 import logging
 from lxml import html
-
+from urllib.parse import quote
 
 _logger = logging.getLogger(__name__)
 
@@ -138,24 +138,24 @@ class BasicDownloader:
 class ProxiedDownloader(BasicDownloader):
     def get_proxied_urls(self):
         urls = []
-        if settings.PROXYCRAWL_KEY is not None:
-            urls.append(
-                f"https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={self.url}"
-            )
         if settings.SCRAPESTACK_KEY is not None:
             # urls.append(f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={self.url}')
             urls.append(
-                f"http://api.scrapestack.com/scrape?keep_headers=1&access_key={settings.SCRAPESTACK_KEY}&url={self.url}"
+                f"http://api.scrapestack.com/scrape?keep_headers=1&access_key={settings.SCRAPESTACK_KEY}&url={quote(self.url)}"
+            )
+        if settings.PROXYCRAWL_KEY is not None:
+            urls.append(
+                f"https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={quote(self.url)}"
             )
         if settings.SCRAPERAPI_KEY is not None:
             urls.append(
-                f"http://api.scraperapi.com/?api_key={settings.SCRAPERAPI_KEY}&url={self.url}"
+                f"http://api.scraperapi.com/?api_key={settings.SCRAPERAPI_KEY}&url={quote(self.url)}"
             )
         return urls
 
     def get_special_proxied_url(self):
         return (
-            f"{settings.LOCAL_PROXY}?url={self.url}"
+            f"{settings.LOCAL_PROXY}?url={quote(self.url)}"
             if settings.LOCAL_PROXY is not None
             else None
         )
diff --git a/catalog/management/commands/crawl.py b/catalog/management/commands/crawl.py
new file mode 100644
index 00000000..59453f93
--- /dev/null
+++ b/catalog/management/commands/crawl.py
@@ -0,0 +1,40 @@
+from django.core.management.base import BaseCommand
+from catalog.common import *
+import re
+from urllib.parse import urljoin
+
+
+class Command(BaseCommand):
+    help = "Crawl content"
+
+    def add_arguments(self, parser):
+        parser.add_argument("start", type=str, help="URL to start with")
+        parser.add_argument("--pattern", help="pattern to navigate", action="store")
+
+    def handle(self, *args, **options):
+        queue = [str(options["start"])]
+        pattern = options["pattern"] or ""
+        history = []
+        item_patterns = []
+        for site in SiteManager.registry.values():
+            if site.URL_PATTERNS:
+                item_patterns += site.URL_PATTERNS
+        while queue and len(history) < 1000:
+            url = queue.pop(0)
+            history.append(url)
+            self.stdout.write(f"Navigating {url}")
+            content = ProxiedDownloader(url).download().html()
+            urls = content.xpath("//a/@href")
+            for _u in urls:
+                u = urljoin(url, _u)
+                if u not in history and u not in queue:
+                    if len([p for p in item_patterns if re.match(p, u)]) > 0:
+                        site = SiteManager.get_site_by_url(u)
+                        u = site.url
+                        if u not in history:
+                            history.append(u)
+                            self.stdout.write(f"Fetching {u}")
+                            site.get_resource_ready()
+                    elif pattern and u.find(pattern) >= 0:
+                        queue.append(u)
+        self.stdout.write(self.style.SUCCESS(f"Done."))