resync failed urls

Their Name 2022-06-17 02:52:44 +00:00
parent c0f90cc0d8
commit 6d614d806b
2 changed files with 31 additions and 14 deletions


@@ -122,7 +122,7 @@ class DoubanScrapperMixin:
             error = error + '\nDirect: '
             get(url)
             check_content()
-        if last_error == 'network' and settings.PROXYCRAWL_KEY is not None:
+        if last_error == 'network' and settings.LOCAL_PROXY is None and settings.PROXYCRAWL_KEY is not None:
             error = error + '\nProxyCrawl: '
             get(f'https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={url}')
             check_content()
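The tightened guard makes ProxyCrawl a true last resort: it is now skipped whenever a LOCAL_PROXY is configured, instead of firing after any direct-fetch network error. A minimal sketch of the new condition as a standalone predicate (hypothetical helper name; `get` and `check_content` in the hunk above are closures of the surrounding method, and `get` sets `last_error` on failure):

    # Sketch only; mirrors the guard added in this commit, not the project's API.
    def should_fall_back_to_proxycrawl(last_error, settings):
        # Only reach for ProxyCrawl when the direct fetch failed with a
        # network error, no local proxy exists to retry through, and an
        # API token is actually configured.
        return (last_error == 'network'
                and settings.LOCAL_PROXY is None
                and settings.PROXYCRAWL_KEY is not None)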


@@ -7,13 +7,32 @@ from django.core.exceptions import ObjectDoesNotExist, PermissionDenied
 from tqdm import tqdm
 from django.conf import settings
 import requests
+import os
 
 
 class Command(BaseCommand):
     help = 'Re-scrape failed urls (via local proxy)'
 
     def handle(self, *args, **options):
-        self.stdout.write(f'Checking local proxy...')
+        tasks = SyncTask.objects.filter(failed_urls__isnull=False)
+        urls = []
+        for task in tqdm(tasks):
+            for url in task.failed_urls:
+                if url not in urls:
+                    url = get_normalized_url(str(url))
+                    scraper = get_scraper_by_url(url)
+                    if scraper is not None:
+                        try:
+                            url = scraper.get_effective_url(url)
+                            entity = scraper.data_class.objects.get(source_url=url)
+                        except ObjectDoesNotExist:
+                            urls.append(url)
+        f = open("/tmp/resync_todo.txt", "w")
+        f.write("\n".join(urls))
+        f.close()
+        return
+
+        self.stdout.write(f'Checking local proxy...{settings.LOCAL_PROXY}')
         url = f'{settings.LOCAL_PROXY}?url=https://www.douban.com/doumail/'
         try:
             r = requests.get(url, timeout=settings.SCRAPING_TIMEOUT)
@@ -25,14 +44,11 @@ class Command(BaseCommand):
             self.stdout.write(self.style.ERROR(f'Proxy check failed.'))
             return
-        self.stdout.write(f'Loading failed urls...')
-        tasks = SyncTask.objects.filter(failed_urls__isnull=False)
-        urls = []
-        for task in tasks:
-            for url in task.failed_urls:
-                if url not in urls:
-                    urls.append(url)
-        with open("/tmp/resync_failed.txt") as file:
+        self.stdout.write(f'Loading urls...')
+        with open("/tmp/resync_todo.txt") as file:
             todos = file.readlines()
         todos = [line.strip() for line in todos]
         with open("/tmp/resync_success.txt") as file:
             skips = file.readlines()
         skips = [line.strip() for line in skips]
         f_f = open("/tmp/resync_failed.txt", "a")
@@ -40,8 +56,7 @@ class Command(BaseCommand):
         f_s = open("/tmp/resync_success.txt", "a")
         user = User.objects.get(id=1)
-        for url in tqdm(urls):
-            url = get_normalized_url(url)
+        for url in tqdm(todos):
             scraper = get_scraper_by_url(url)
             url = scraper.get_effective_url(url)
             if url in skips:
@@ -59,7 +74,9 @@ class Command(BaseCommand):
                 scraper.scrape(url)
                 form = scraper.save(request_user=user)
                 f_s.write(url + '\n')
-                # self.stdout.write(self.style.SUCCESS(f'Saved.'))
+                f_s.flush()
+                os.fsync(f_s.fileno())
+                self.stdout.write(self.style.SUCCESS(f'Saved {url}'))
             except Exception as e:
                 f_f.write(url + '\n')
-                # self.stdout.write(self.style.ERROR(f'Error.'))
+                self.stdout.write(self.style.ERROR(f'Error {url}'))
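Net effect: the command becomes a two-pass, resumable replay driven by scratch files in /tmp. The first invocation writes every failed URL whose entity is still missing to /tmp/resync_todo.txt and returns early; the replay code below the early return (reachable once that return is removed) reads the todo list, skips anything already recorded in /tmp/resync_success.txt, and appends each outcome as it goes, so an interrupted run can resume where it left off. The flush() plus os.fsync() pair ensures the success log survives a crash mid-run. A hypothetical driver illustrating the intended workflow (the command's registered name is not shown in this diff; 'resync' is assumed):

    # Hypothetical usage sketch; 'resync' stands in for the unnamed command file.
    from django.core.management import call_command
    import pathlib

    # The replay pass reads this skip-list unconditionally, so it must exist.
    pathlib.Path('/tmp/resync_success.txt').touch()

    call_command('resync')  # pass 1: writes /tmp/resync_todo.txt, then returns early
    # ...edit the command to drop the early `return`, then:
    call_command('resync')  # pass 2: re-scrapes todos, appending each URL to
                            # /tmp/resync_success.txt or /tmp/resync_failed.txt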