From a8c4cae5788bccf9a413cda409a6b01e42c86f5c Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 15 Jun 2022 21:04:44 -0400 Subject: [PATCH] resync failed urls with local proxy: check proxy first --- sync/management/commands/resync.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sync/management/commands/resync.py b/sync/management/commands/resync.py index 0804c300..f11c4b74 100644 --- a/sync/management/commands/resync.py +++ b/sync/management/commands/resync.py @@ -5,12 +5,26 @@ from sync.models import SyncTask from users.models import User from django.core.exceptions import ObjectDoesNotExist, PermissionDenied from tqdm import tqdm +from django.conf import settings +import requests class Command(BaseCommand): help = 'Re-scrape failed urls (via local proxy)' def handle(self, *args, **options): + self.stdout.write(f'Checking local proxy...') + url = f'{settings.LOCAL_PROXY}?url=https://www.douban.com/doumail/' + try: + r = requests.get(url, timeout=settings.SCRAPING_TIMEOUT) + except Exception as e: + self.stdout.write(self.style.ERROR(e)) + return + content = r.content.decode('utf-8') + if content.find('我的豆邮') == -1: + self.stdout.write(self.style.ERROR(f'Proxy check failed.')) + return + self.stdout.write(f'Loading failed urls...') tasks = SyncTask.objects.filter(failed_urls__isnull=False) urls = [] @@ -25,6 +39,7 @@ class Command(BaseCommand): f_i = open("/tmp/resync_ignore.txt", "a") f_s = open("/tmp/resync_success.txt", "a") user = User.objects.get(id=1) + for url in tqdm(urls): url = get_normalized_url(url) scraper = get_scraper_by_url(url)