From 64a8d953ec4eb95ce2f5f6aefcc3c604bf70a104 Mon Sep 17 00:00:00 2001 From: doubaniux Date: Fri, 15 May 2020 15:06:57 +0800 Subject: [PATCH] fix invalid url scraping error --- .gitignore | 5 ++++- books/views.py | 25 ++++--------------------- common/scraper.py | 9 ++++++++- 3 files changed, 16 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 875d99a8..da0d2ef1 100644 --- a/.gitignore +++ b/.gitignore @@ -19,4 +19,7 @@ migrations/ # deployed media and static files media/ -static/ \ No newline at end of file +static/ + +# log file +log \ No newline at end of file diff --git a/books/views.py b/books/views.py index 67af10f0..012c25da 100644 --- a/books/views.py +++ b/books/views.py @@ -468,13 +468,9 @@ def click_to_scrape(request): try: scraped_book, raw_cover = scrape_douban_book(url) except TimeoutError: - return render( - request, - 'common/error.html', - { - 'msg': _("爬取数据失败😫"), - } - ) + return render(request, 'common/error.html', {'msg': _("爬取数据失败😫,请重试")}) + except ValueError: + return render(request, 'common/error.html', {'msg': _("链接非法,爬取失败")}) scraped_cover = {'cover': SimpleUploadedFile('temp.jpg', raw_cover)} form = BookForm(scraped_book, scraped_cover) if form.is_valid(): @@ -486,20 +482,7 @@ def click_to_scrape(request): msg = _("ISBN与现有图书重复") else: msg = _("爬取数据失败😫") - return render( - request, - 'common/error.html', - { - 'msg': msg, - } - ) - return render( - request, - 'common/error.html', - { - 'msg': _("爬取数据失败😫"), - } - ) + return render(request, 'common/error.html', {'msg': msg}) else: return HttpResponseBadRequest() else: diff --git a/common/scraper.py b/common/scraper.py index 82f6df19..171a3860 100644 --- a/common/scraper.py +++ b/common/scraper.py @@ -6,6 +6,7 @@ from boofilsic.settings import LUMINATI_USERNAME, LUMINATI_PASSWORD, DEBUG RE_NUMBERS = re.compile(r"\d+\d*") RE_WHITESPACES = re.compile(r"\s+") +RE_DOUBAN_BOOK_URL = re.compile(r"https://book.douban.com/subject/\d+/") DEFAULT_REQUEST_HEADERS = { 'Host': 'book.douban.com', @@ -28,6 +29,9 @@ PORT = 22225 def scrape_douban_book(url): + if RE_DOUBAN_BOOK_URL.match(url) is None: + raise ValueError("not valid douban book url") + session_id = random.random() proxy_url = ('http://%s-country-cn-session-%s:%s@zproxy.lum-superproxy.io:%d' % (LUMINATI_USERNAME, session_id, LUMINATI_PASSWORD, PORT)) @@ -42,7 +46,10 @@ def scrape_douban_book(url): content = html.fromstring(r.content.decode('utf-8')) - title = content.xpath("/html/body/div[3]/h1/span/text()")[0].strip() + try: + title = content.xpath("/html/body/div[3]/h1/span/text()")[0].strip() + except IndexError: + raise ValueError("given url contains no book info") subtitle_elem = content.xpath("//div[@id='info']//span[text()='副标题:']/following::text()") subtitle = subtitle_elem[0].strip() if subtitle_elem else None