resync failed urls with local proxy

2022-06-15 20:35:52 -04:00 · 2022-06-15 20:35:52 -04:00 · 792fb5abc8
commit 792fb5abc8
parent 41ed0314ad
2 changed files with 54 additions and 1 deletions
--- a/common/scrapers/douban.py
+++ b/common/scrapers/douban.py
@ -109,7 +109,10 @@ class DoubanScrapperMixin:

        def latest():
            nonlocal r, error, content
-            if settings.SCRAPESTACK_KEY is not None:
+            if settings.LOCAL_PROXY is not None:
+                error = error + '\nLocal: '
+                get(f'{settings.LOCAL_PROXY}?url={url}')
+            elif settings.SCRAPESTACK_KEY is not None:
                error = error + '\nScrapeStack: '
                get(f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={url}')
            elif settings.SCRAPERAPI_KEY is not None:
--- a/sync/management/commands/resync.py
+++ b/sync/management/commands/resync.py
@ -0,0 +1,50 @@
+from django.core.management.base import BaseCommand
+from common.scraper import get_scraper_by_url, get_normalized_url
+import pprint
+from sync.models import SyncTask
+from users.models import User
+from django.core.exceptions import ObjectDoesNotExist, PermissionDenied
+from tqdm import tqdm
+
+
+class Command(BaseCommand):
+    """Re-scrape URLs that SyncTask rows recorded as failed; log outcomes under /tmp."""
+    help = 'Re-scrape failed urls (via local proxy)'
+
+    def handle(self, *args, **options):
+        """Retry each distinct failed URL, skipping ones listed in the failed log."""
+        self.stdout.write('Loading failed urls...')
+        tasks = SyncTask.objects.filter(failed_urls__isnull=False)
+        # dict.fromkeys de-duplicates in first-seen order without O(n^2) scans.
+        urls = list(dict.fromkeys(u for task in tasks for u in task.failed_urls))
+        try:
+            with open("/tmp/resync_failed.txt") as file:
+                skips = {line.strip() for line in file}
+        except FileNotFoundError:
+            skips = set()  # first run: no failure log exists yet
+        user = User.objects.get(id=1)
+        with open("/tmp/resync_failed.txt", "a") as f_f, \
+                open("/tmp/resync_ignore.txt", "a") as f_i, \
+                open("/tmp/resync_success.txt", "a") as f_s:
+            for url in tqdm(urls):
+                url = get_normalized_url(url)
+                scraper = get_scraper_by_url(url)
+                # Check for a missing scraper before calling any method on it.
+                if scraper is None:
+                    self.stdout.write(self.style.ERROR(f'Unable to find scraper for {url}'))
+                    f_i.write(url + '\n')
+                    continue
+                url = scraper.get_effective_url(url)
+                if url in skips:
+                    self.stdout.write(f'Skip {url}')
+                    continue
+                try:
+                    scraper.data_class.objects.get(source_url=url)
+                    f_i.write(url + '\n')  # already stored; nothing to fetch
+                except ObjectDoesNotExist:
+                    try:
+                        scraper.scrape(url)
+                        scraper.save(request_user=user)
+                        f_s.write(url + '\n')
+                    except Exception:
+                        f_f.write(url + '\n')  # best effort: record and move on