improve downloader compatibility
parent b4c69768b4
commit c6ecbd38c1
3 changed files with 50 additions and 9 deletions
.gitignore (vendored): 3 changes
@@ -31,9 +31,10 @@ log
 # conf folders
 /conf
 /neodb
+/playground
 
 # typesense folder
 /typesense-data
 
 # test coverage
-.coverage
+/.coverage
@@ -11,7 +11,7 @@ import re
 import time
 import logging
 from lxml import html
+from urllib.parse import quote
 
 _logger = logging.getLogger(__name__)
 
@@ -138,24 +138,24 @@ class BasicDownloader:
 class ProxiedDownloader(BasicDownloader):
     def get_proxied_urls(self):
         urls = []
-        if settings.PROXYCRAWL_KEY is not None:
-            urls.append(
-                f"https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={self.url}"
-            )
         if settings.SCRAPESTACK_KEY is not None:
             # urls.append(f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={self.url}')
             urls.append(
-                f"http://api.scrapestack.com/scrape?keep_headers=1&access_key={settings.SCRAPESTACK_KEY}&url={self.url}"
+                f"http://api.scrapestack.com/scrape?keep_headers=1&access_key={settings.SCRAPESTACK_KEY}&url={quote(self.url)}"
+            )
+        if settings.PROXYCRAWL_KEY is not None:
+            urls.append(
+                f"https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={quote(self.url)}"
             )
         if settings.SCRAPERAPI_KEY is not None:
             urls.append(
-                f"http://api.scraperapi.com/?api_key={settings.SCRAPERAPI_KEY}&url={self.url}"
+                f"http://api.scraperapi.com/?api_key={settings.SCRAPERAPI_KEY}&url={quote(self.url)}"
            )
         return urls
 
     def get_special_proxied_url(self):
         return (
-            f"{settings.LOCAL_PROXY}?url={self.url}"
+            f"{settings.LOCAL_PROXY}?url={quote(self.url)}"
             if settings.LOCAL_PROXY is not None
             else None
         )
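The substance of the downloader change is that the target URL is now percent-encoded with urllib.parse.quote before it is appended to each proxy endpoint. A minimal standalone sketch of why this helps, using a made-up proxy endpoint and target URL rather than anything from this commit:

from urllib.parse import quote

# Hypothetical target whose own query string contains "?" and "&".
target = "https://example.org/item?id=1&lang=zh"

# Raw interpolation: the proxy service would parse "lang=zh" as one of its
# own parameters and receive only part of the target in its url parameter.
print(f"http://proxy.example/scrape?key=KEY&url={target}")

# quote() escapes ":", "?", "&" and "=" (keeping "/"), so the whole target
# survives as a single url parameter value.
print(f"http://proxy.example/scrape?key=KEY&url={quote(target)}")
# http://proxy.example/scrape?key=KEY&url=https%3A//example.org/item%3Fid%3D1%26lang%3Dzh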
catalog/management/commands/crawl.py (new file): 40 changes
@@ -0,0 +1,40 @@
+from django.core.management.base import BaseCommand
+from catalog.common import *
+import re
+from urllib.parse import urljoin
+
+
+class Command(BaseCommand):
+    help = "Crawl content"
+
+    def add_arguments(self, parser):
+        parser.add_argument("start", type=str, help="URL to start with")
+        parser.add_argument("--pattern", help="pattern to navigate", action="store")
+
+    def handle(self, *args, **options):
+        queue = [str(options["start"])]
+        pattern = options["pattern"] or ""
+        history = []
+        item_patterns = []
+        for site in SiteManager.registry.values():
+            if site.URL_PATTERNS:
+                item_patterns += site.URL_PATTERNS
+        while queue and len(history) < 1000:
+            url = queue.pop(0)
+            history.append(url)
+            self.stdout.write(f"Navigating {url}")
+            content = ProxiedDownloader(url).download().html()
+            urls = content.xpath("//a/@href")
+            for _u in urls:
+                u = urljoin(url, _u)
+                if u not in history and u not in queue:
+                    if len([p for p in item_patterns if re.match(p, u)]) > 0:
+                        site = SiteManager.get_site_by_url(u)
+                        u = site.url
+                        if u not in history:
+                            history.append(u)
+                            self.stdout.write(f"Fetching {u}")
+                            site.get_resource_ready()
+                    elif pattern and u.find(pattern) >= 0:
+                        queue.append(u)
+        self.stdout.write(self.style.SUCCESS(f"Done."))
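Django management commands are discovered by file name, so the new command would be invoked as "crawl" through manage.py. A sketch of a typical run, with a placeholder start URL and pattern value that are not taken from the commit:

python manage.py crawl "https://example.org/books/" --pattern "/book/"

handle() then walks links breadth-first from the start URL: links matching any registered site's URL_PATTERNS are resolved through SiteManager.get_site_by_url() and fetched with get_resource_ready(), links containing the --pattern substring are queued for further navigation, and the loop stops once the history reaches 1000 entries.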