fix invalid url scraping error

parent 625cdbcbab
commit 64a8d953ec
3 changed files with 16 additions and 23 deletions
.gitignore (vendored, 5 changes)

@@ -19,4 +19,7 @@ migrations/
 
 # deployed media and static files
 media/
-static/
\ No newline at end of file
+static/
+
+# log file
+log
@@ -468,13 +468,9 @@ def click_to_scrape(request):
             try:
                 scraped_book, raw_cover = scrape_douban_book(url)
             except TimeoutError:
-                return render(
-                    request,
-                    'common/error.html',
-                    {
-                        'msg': _("爬取数据失败😫"),
-                    }
-                )
+                return render(request, 'common/error.html', {'msg': _("爬取数据失败😫,请重试")})
+            except ValueError:
+                return render(request, 'common/error.html', {'msg': _("链接非法,爬取失败")})
             scraped_cover = {'cover': SimpleUploadedFile('temp.jpg', raw_cover)}
             form = BookForm(scraped_book, scraped_cover)
             if form.is_valid():
@@ -486,20 +482,7 @@ def click_to_scrape(request):
                     msg = _("ISBN与现有图书重复")
                 else:
                     msg = _("爬取数据失败😫")
-                return render(
-                    request,
-                    'common/error.html',
-                    {
-                        'msg': msg,
-                    }
-                )
-            return render(
-                request,
-                'common/error.html',
-                {
-                    'msg': _("爬取数据失败😫"),
-                }
-            )
+                return render(request, 'common/error.html', {'msg': msg})
         else:
             return HttpResponseBadRequest()
     else:
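After this change, scrape_douban_book raises ValueError for a link that is not a douban book URL, and the view maps each failure mode to its own message. A minimal sketch of how the new contract could be exercised offline; the module path and the use of pytest are assumptions, not shown in the diff:

```python
import pytest  # assumption: pytest as the test runner

from common.scraper import scrape_douban_book  # assumed module path

def test_non_book_url_raises_value_error():
    # the URL guard runs before any network I/O, so this needs no mocking;
    # a movie subject URL does not match RE_DOUBAN_BOOK_URL
    with pytest.raises(ValueError):
        scrape_douban_book("https://movie.douban.com/subject/1234567/")
```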
@@ -6,6 +6,7 @@ from boofilsic.settings import LUMINATI_USERNAME, LUMINATI_PASSWORD, DEBUG
 
 RE_NUMBERS = re.compile(r"\d+\d*")
 RE_WHITESPACES = re.compile(r"\s+")
+RE_DOUBAN_BOOK_URL = re.compile(r"https://book.douban.com/subject/\d+/")
 
 DEFAULT_REQUEST_HEADERS = {
     'Host': 'book.douban.com',
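As a quick standalone check of what the new pattern accepts (the regex is copied from the diff; the sample URLs are illustrative):

```python
import re

RE_DOUBAN_BOOK_URL = re.compile(r"https://book.douban.com/subject/\d+/")

print(bool(RE_DOUBAN_BOOK_URL.match("https://book.douban.com/subject/1000000/")))   # True
print(bool(RE_DOUBAN_BOOK_URL.match("https://movie.douban.com/subject/1000000/")))  # False
print(bool(RE_DOUBAN_BOOK_URL.match("http://book.douban.com/subject/1000000/")))    # False
```

Since re.match only anchors at the start of the string, a valid prefix with extra trailing segments would still pass; the page-level check added further down catches those cases.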

@@ -28,6 +29,9 @@ PORT = 22225
 
 
 def scrape_douban_book(url):
+    if RE_DOUBAN_BOOK_URL.match(url) is None:
+        raise ValueError("not valid douban book url")
+
     session_id = random.random()
     proxy_url = ('http://%s-country-cn-session-%s:%s@zproxy.lum-superproxy.io:%d' %
                  (LUMINATI_USERNAME, session_id, LUMINATI_PASSWORD, PORT))
@@ -42,7 +46,10 @@ def scrape_douban_book(url):
 
     content = html.fromstring(r.content.decode('utf-8'))
 
-    title = content.xpath("/html/body/div[3]/h1/span/text()")[0].strip()
+    try:
+        title = content.xpath("/html/body/div[3]/h1/span/text()")[0].strip()
+    except IndexError:
+        raise ValueError("given url contains no book info")
 
     subtitle_elem = content.xpath("//div[@id='info']//span[text()='副标题:']/following::text()")
     subtitle = subtitle_elem[0].strip() if subtitle_elem else None
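xpath() returns a list of matches, so on a page without the expected book markup the old code died with an opaque IndexError on [0]; re-raising it as ValueError lets the view reuse its bad-link handling. The pattern in isolation (a standalone sketch; first_text_or_raise is a hypothetical helper, not part of the diff):

```python
from lxml import html

def first_text_or_raise(content, xpath_expr):
    # an empty result list means the page lacks the expected structure
    try:
        return content.xpath(xpath_expr)[0].strip()
    except IndexError:
        raise ValueError("given url contains no book info")

doc = html.fromstring("<html><body><p>not a book page</p></body></html>")
try:
    first_text_or_raise(doc, "/html/body/div[3]/h1/span/text()")
except ValueError as e:
    print(e)  # given url contains no book info
```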