resync failed urls with local proxy
parent 41ed0314ad
commit 792fb5abc8
2 changed files with 54 additions and 1 deletion
@@ -109,7 +109,10 @@ class DoubanScrapperMixin:
         def latest():
             nonlocal r, error, content
-            if settings.SCRAPESTACK_KEY is not None:
+            if settings.LOCAL_PROXY is not None:
+                error = error + '\nLocal: '
+                get(f'{settings.LOCAL_PROXY}?url={url}')
+            elif settings.SCRAPESTACK_KEY is not None:
                 error = error + '\nScrapeStack: '
                 get(f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={url}')
             elif settings.SCRAPERAPI_KEY is not None:
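The new branch assumes a LOCAL_PROXY setting exists alongside SCRAPESTACK_KEY and SCRAPERAPI_KEY; the diff only requires it to be None or a URL string that accepts the target address as a ?url= query parameter. A minimal settings sketch under that assumption (the environment variable name and example address are illustrative, not from this repository):

# settings.py sketch (not part of this commit); assumes the proxy address
# is supplied via an environment variable.
import os

# e.g. 'http://127.0.0.1:8990/fetch' -- the scraper appends '?url=<target>'
LOCAL_PROXY = os.environ.get('LOCAL_PROXY')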
50  sync/management/commands/resync.py  Normal file
@@ -0,0 +1,50 @@
from django.core.management.base import BaseCommand
from common.scraper import get_scraper_by_url, get_normalized_url
import pprint
from sync.models import SyncTask
from users.models import User
from django.core.exceptions import ObjectDoesNotExist, PermissionDenied
from tqdm import tqdm


class Command(BaseCommand):
    help = 'Re-scrape failed urls (via local proxy)'

    def handle(self, *args, **options):
        self.stdout.write(f'Loading failed urls...')
        # collect the distinct failed urls recorded across all sync tasks
        tasks = SyncTask.objects.filter(failed_urls__isnull=False)
        urls = []
        for task in tasks:
            for url in task.failed_urls:
                if url not in urls:
                    urls.append(url)
        # urls already logged as failed in a previous run act as a skip list;
        # the file is expected to exist before the command runs
        with open("/tmp/resync_failed.txt") as file:
            skips = file.readlines()
            skips = [line.strip() for line in skips]
        f_f = open("/tmp/resync_failed.txt", "a")
        f_i = open("/tmp/resync_ignore.txt", "a")
        f_s = open("/tmp/resync_success.txt", "a")
        user = User.objects.get(id=1)
        for url in tqdm(urls):
            url = get_normalized_url(url)
            scraper = get_scraper_by_url(url)
            # only resolve the effective url when a scraper exists, so the
            # "no scraper" branch below can still be reached
            if scraper is not None:
                url = scraper.get_effective_url(url)
            if url in skips:
                self.stdout.write(f'Skip {url}')
            elif scraper is None:
                self.stdout.write(self.style.ERROR(f'Unable to find scraper for {url}'))
                f_i.write(url + '\n')
            else:
                try:
                    # entity already exists for this url; record it as ignored
                    entity = scraper.data_class.objects.get(source_url=url)
                    f_i.write(url + '\n')
                except ObjectDoesNotExist:
                    try:
                        # self.stdout.write(f'Fetching {url} via {scraper.__name__}')
                        scraper.scrape(url)
                        form = scraper.save(request_user=user)
                        f_s.write(url + '\n')
                        # self.stdout.write(self.style.SUCCESS(f'Saved.'))
                    except Exception as e:
                        f_f.write(url + '\n')
                        # self.stdout.write(self.style.ERROR(f'Error.'))
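Once this file is in place, Django discovers it as a management command named after the module. A usage sketch via the standard management API, equivalent to running python manage.py resync from the project root:

# run the resync command programmatically; same as `python manage.py resync`
from django.core.management import call_command

call_command('resync')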