diff --git a/common/scrapers/douban.py b/common/scrapers/douban.py
index 668b66c0..600b16b8 100644
--- a/common/scrapers/douban.py
+++ b/common/scrapers/douban.py
@@ -122,7 +122,7 @@ class DoubanScrapperMixin:
             error = error + '\nDirect: '
             get(url)
             check_content()
-        if last_error == 'network' and settings.PROXYCRAWL_KEY is not None:
+        if last_error == 'network' and settings.LOCAL_PROXY is None and settings.PROXYCRAWL_KEY is not None:
             error = error + '\nProxyCrawl: '
             get(f'https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={url}')
             check_content()
diff --git a/sync/management/commands/resync.py b/sync/management/commands/resync.py
index f11c4b74..1fa655e9 100644
--- a/sync/management/commands/resync.py
+++ b/sync/management/commands/resync.py
@@ -7,13 +7,32 @@ from django.core.exceptions import ObjectDoesNotExist, PermissionDenied
 from tqdm import tqdm
 from django.conf import settings
 import requests
+import os
 
 
 class Command(BaseCommand):
     help = 'Re-scrape failed urls (via local proxy)'
 
     def handle(self, *args, **options):
-        self.stdout.write(f'Checking local proxy...')
+        tasks = SyncTask.objects.filter(failed_urls__isnull=False)
+        urls = []
+        for task in tqdm(tasks):
+            for url in task.failed_urls:
+                if url not in urls:
+                    url = get_normalized_url(str(url))
+                    scraper = get_scraper_by_url(url)
+                    if scraper is not None:
+                        try:
+                            url = scraper.get_effective_url(url)
+                            entity = scraper.data_class.objects.get(source_url=url)
+                        except ObjectDoesNotExist:
+                            urls.append(url)
+        f = open("/tmp/resync_todo.txt", "w")
+        f.write("\n".join(urls))
+        f.close()
+        return
+
+        self.stdout.write(f'Checking local proxy...{settings.LOCAL_PROXY}')
         url = f'{settings.LOCAL_PROXY}?url=https://www.douban.com/doumail/'
         try:
             r = requests.get(url, timeout=settings.SCRAPING_TIMEOUT)
@@ -25,14 +44,11 @@ class Command(BaseCommand):
             self.stdout.write(self.style.ERROR(f'Proxy check failed.'))
             return
 
-        self.stdout.write(f'Loading failed urls...')
-        tasks = SyncTask.objects.filter(failed_urls__isnull=False)
-        urls = []
-        for task in tasks:
-            for url in task.failed_urls:
-                if url not in urls:
-                    urls.append(url)
-        with open("/tmp/resync_failed.txt") as file:
+        self.stdout.write(f'Loading urls...')
+        with open("/tmp/resync_todo.txt") as file:
+            todos = file.readlines()
+            todos = [line.strip() for line in todos]
+        with open("/tmp/resync_success.txt") as file:
             skips = file.readlines()
             skips = [line.strip() for line in skips]
         f_f = open("/tmp/resync_failed.txt", "a")
@@ -40,8 +56,7 @@ class Command(BaseCommand):
         f_s = open("/tmp/resync_success.txt", "a")
 
         user = User.objects.get(id=1)
-        for url in tqdm(urls):
-            url = get_normalized_url(url)
+        for url in tqdm(todos):
             scraper = get_scraper_by_url(url)
             url = scraper.get_effective_url(url)
             if url in skips:
@@ -59,7 +74,9 @@ class Command(BaseCommand):
                 scraper.scrape(url)
                 form = scraper.save(request_user=user)
                 f_s.write(url + '\n')
-                # self.stdout.write(self.style.SUCCESS(f'Saved.'))
+                f_s.flush()
+                os.fsync(f_s.fileno())
+                self.stdout.write(self.style.SUCCESS(f'Saved {url}'))
             except Exception as e:
                 f_f.write(url + '\n')
-                # self.stdout.write(self.style.ERROR(f'Error.'))
+                self.stdout.write(self.style.ERROR(f'Error {url}'))