resync failed urls

parent c0f90cc0d8
commit 6d614d806b

2 changed files with 31 additions and 14 deletions
@@ -122,7 +122,7 @@ class DoubanScrapperMixin:
             error = error + '\nDirect: '
             get(url)
             check_content()
-        if last_error == 'network' and settings.PROXYCRAWL_KEY is not None:
+        if last_error == 'network' and settings.LOCAL_PROXY is None and settings.PROXYCRAWL_KEY is not None:
             error = error + '\nProxyCrawl: '
             get(f'https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={url}')
             check_content()
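Note: the only functional change in this hunk is the added `settings.LOCAL_PROXY is None` guard: once a local proxy is configured, the ProxyCrawl fallback is skipped entirely. A minimal runnable sketch of the resulting decision, where `Settings` and `should_use_proxycrawl` are illustrative stand-ins rather than the project's API:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class Settings:
        LOCAL_PROXY: Optional[str] = None     # e.g. 'http://127.0.0.1:8080'
        PROXYCRAWL_KEY: Optional[str] = None  # ProxyCrawl API token

    def should_use_proxycrawl(last_error: Optional[str], s: Settings) -> bool:
        # ProxyCrawl is tried only after a network error, and only when no
        # local proxy is configured but an API key is.
        return (last_error == 'network'
                and s.LOCAL_PROXY is None
                and s.PROXYCRAWL_KEY is not None)

    assert should_use_proxycrawl('network', Settings(PROXYCRAWL_KEY='k'))
    assert not should_use_proxycrawl(
        'network', Settings(LOCAL_PROXY='http://127.0.0.1:8080', PROXYCRAWL_KEY='k'))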
@@ -7,13 +7,32 @@ from django.core.exceptions import ObjectDoesNotExist, PermissionDenied
 from tqdm import tqdm
 from django.conf import settings
 import requests
+import os
 
 
 class Command(BaseCommand):
     help = 'Re-scrape failed urls (via local proxy)'
 
     def handle(self, *args, **options):
-        self.stdout.write(f'Checking local proxy...')
+        tasks = SyncTask.objects.filter(failed_urls__isnull=False)
+        urls = []
+        for task in tqdm(tasks):
+            for url in task.failed_urls:
+                if url not in urls:
+                    url = get_normalized_url(str(url))
+                    scraper = get_scraper_by_url(url)
+                    if scraper is not None:
+                        try:
+                            url = scraper.get_effective_url(url)
+                            entity = scraper.data_class.objects.get(source_url=url)
+                        except ObjectDoesNotExist:
+                            urls.append(url)
+        f = open("/tmp/resync_todo.txt", "w")
+        f.write("\n".join(urls))
+        f.close()
+        return
+
+        self.stdout.write(f'Checking local proxy...{settings.LOCAL_PROXY}')
         url = f'{settings.LOCAL_PROXY}?url=https://www.douban.com/doumail/'
         try:
             r = requests.get(url, timeout=settings.SCRAPING_TIMEOUT)
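Note: `handle` now writes the deduplicated worklist to /tmp/resync_todo.txt and returns immediately, leaving the proxy check and re-scrape logic below it unreachable; the command is evidently meant to be run in two phases, toggling that `return` by hand. An illustrative sketch of the collect phase in isolation, with `entity_exists` standing in for the scraper/`ObjectDoesNotExist` lookup (hypothetical names, not the project's API):

    from typing import Callable, Iterable

    def collect_todo(failed_urls: Iterable[str],
                     entity_exists: Callable[[str], bool],
                     path: str = "/tmp/resync_todo.txt") -> list[str]:
        todo: list[str] = []
        for url in failed_urls:
            # Keep each URL once, and only if it was never saved as an entity.
            if url not in todo and not entity_exists(url):
                todo.append(url)
        with open(path, "w") as f:
            f.write("\n".join(todo))
        return todo

    # Example: 'b' already has an entity, so only 'a' survives.
    print(collect_todo(["a", "a", "b"], entity_exists=lambda u: u == "b"))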
@@ -25,14 +44,11 @@ class Command(BaseCommand):
             self.stdout.write(self.style.ERROR(f'Proxy check failed.'))
             return
 
-        self.stdout.write(f'Loading failed urls...')
-        tasks = SyncTask.objects.filter(failed_urls__isnull=False)
-        urls = []
-        for task in tasks:
-            for url in task.failed_urls:
-                if url not in urls:
-                    urls.append(url)
-        with open("/tmp/resync_failed.txt") as file:
+        self.stdout.write(f'Loading urls...')
+        with open("/tmp/resync_todo.txt") as file:
+            todos = file.readlines()
+            todos = [line.strip() for line in todos]
+        with open("/tmp/resync_success.txt") as file:
             skips = file.readlines()
             skips = [line.strip() for line in skips]
         f_f = open("/tmp/resync_failed.txt", "a")
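Note: the re-scrape phase now reads its worklist from /tmp/resync_todo.txt instead of re-querying SyncTask, and skips anything already recorded in /tmp/resync_success.txt. Both files must exist up front, or the bare `open` raises FileNotFoundError; a small tolerant reader (a sketch, not part of the commit) would look like:

    import os

    def read_url_list(path: str) -> list[str]:
        # Stripped, non-empty lines of `path`, or [] if the file does not
        # exist yet (e.g. on the very first run).
        if not os.path.exists(path):
            return []
        with open(path) as file:
            return [line.strip() for line in file if line.strip()]

    todos = read_url_list("/tmp/resync_todo.txt")
    skips = set(read_url_list("/tmp/resync_success.txt"))  # set gives O(1) lookups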
@@ -40,8 +56,7 @@ class Command(BaseCommand):
         f_s = open("/tmp/resync_success.txt", "a")
         user = User.objects.get(id=1)
 
-        for url in tqdm(urls):
-            url = get_normalized_url(url)
+        for url in tqdm(todos):
             scraper = get_scraper_by_url(url)
             url = scraper.get_effective_url(url)
             if url in skips:
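Note: the `get_normalized_url` call is dropped here because the collect phase above already normalized every URL before writing it to the todo file, and it only wrote URLs for which `get_scraper_by_url` returned a scraper. If the todo file were ever edited by hand, a guard in this spirit would avoid an AttributeError (illustrative sketch; `get_scraper_by_url` is stubbed so it runs standalone):

    def get_scraper_by_url(url):
        return None  # stand-in: pretend no scraper matches this URL

    for url in ["https://example.com/unknown"]:
        scraper = get_scraper_by_url(url)
        if scraper is None:
            # No scraper matches; skip rather than crash on
            # scraper.get_effective_url(url).
            continue
        url = scraper.get_effective_url(url)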
@@ -59,7 +74,9 @@ class Command(BaseCommand):
                 scraper.scrape(url)
                 form = scraper.save(request_user=user)
                 f_s.write(url + '\n')
-                # self.stdout.write(self.style.SUCCESS(f'Saved.'))
+                f_s.flush()
+                os.fsync(f_s.fileno())
+                self.stdout.write(self.style.SUCCESS(f'Saved {url}'))
             except Exception as e:
                 f_f.write(url + '\n')
-                # self.stdout.write(self.style.ERROR(f'Error.'))
+                self.stdout.write(self.style.ERROR(f'Error {url}'))
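Note: the added `flush()`/`fsync()` pair forces each success line through Python's userspace buffer and the OS page cache to disk, so a run that crashes or is killed can resume: on restart, everything in /tmp/resync_success.txt lands in `skips` and is not scraped again. The durable-append pattern in isolation (a sketch, not the project's code):

    import os

    def append_durably(path: str, line: str) -> None:
        # flush() drains Python's buffer; fsync() asks the OS to write its
        # page cache through to stable storage before we report success.
        with open(path, "a") as f:
            f.write(line + "\n")
            f.flush()
            os.fsync(f.fileno())

    append_durably("/tmp/resync_success.txt", "https://example.com/item/1")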