From 62d89bb5bc53f157dddfa1083a9c549de2b27575 Mon Sep 17 00:00:00 2001 From: doubaniux Date: Tue, 12 May 2020 14:05:12 +0800 Subject: [PATCH] add click-to-scrape feature | close #9 --- .gitignore | 4 +- books/forms.py | 4 +- books/static/js/scrape.js | 6 +- books/templates/books/scrape.html | 17 ++- books/urls.py | 1 + books/views.py | 37 +++++++ common/forms.py | 18 ++- common/mastodon/auth.py | 1 - common/scraper.py | 163 ++++++++++++++++++++++++++++ common/static/js/create_update.js | 6 +- common/templates/widgets/image.html | 2 + users/views.py | 10 +- 12 files changed, 249 insertions(+), 20 deletions(-) create mode 100644 common/scraper.py create mode 100644 common/templates/widgets/image.html diff --git a/.gitignore b/.gitignore index cb3970e1..875d99a8 100644 --- a/.gitignore +++ b/.gitignore @@ -18,5 +18,5 @@ migrations/ *.sqlite3 # deployed media and static files -.media/ -.static/ \ No newline at end of file +media/ +static/ \ No newline at end of file diff --git a/books/forms.py b/books/forms.py index 031234b7..9ae560d3 100644 --- a/books/forms.py +++ b/books/forms.py @@ -59,11 +59,13 @@ class BookForm(forms.ModelForm): 'brief': _("简介"), 'other_info': _("其他信息"), } + from common.forms import ImageInput widgets = { 'author': forms.TextInput(attrs={'placeholder': _("多个作者使用英文逗号分隔")}), 'translator': forms.TextInput(attrs={'placeholder': _("多个译者使用英文逗号分隔")}), 'other_info': KeyValueInput(), - 'cover': forms.FileInput(), + # 'cover': forms.FileInput(), + 'cover': ImageInput(), } def clean_isbn(self): diff --git a/books/static/js/scrape.js b/books/static/js/scrape.js index c12fab27..f5174069 100644 --- a/books/static/js/scrape.js +++ b/books/static/js/scrape.js @@ -5,9 +5,9 @@ $(document).ready( function() { }); // assume there is only one input[file] on page - $("input[type='file']").each(function() { - $(this).after(''); - }); + // $("input[type='file']").each(function() { + // $(this).after(''); + // }); // preview uploaded pic $("input[type='file']").change(function() { diff --git a/books/templates/books/scrape.html b/books/templates/books/scrape.html index f576dca2..2f5af257 100644 --- a/books/templates/books/scrape.html +++ b/books/templates/books/scrape.html @@ -48,8 +48,9 @@
- - +
+ {% trans '根据豆瓣内容填写下方表单' %} +
@@ -81,11 +82,15 @@ ISBN: 9787020104345
- -
- {% trans '根据豆瓣内容填写下方表格!' %} +
+ {% trans '或者复制详情页链接' %}
- {% trans '剽取!' %} +
+ {% csrf_token %} + + +
+
diff --git a/books/urls.py b/books/urls.py index d768014f..7ca4d24e 100644 --- a/books/urls.py +++ b/books/urls.py @@ -17,4 +17,5 @@ urlpatterns = [ path('review//', retrieve_review, name='retrieve_review'), path('/review/list/', retrieve_review_list, name='retrieve_review_list'), path('scrape/', scrape, name='scrape'), + path('click_to_scrape/', click_to_scrape, name='click_to_scrape'), ] diff --git a/books/views.py b/books/views.py index 6ab9cc90..7fdb56cd 100644 --- a/books/views.py +++ b/books/views.py @@ -6,6 +6,7 @@ from django.core.exceptions import ObjectDoesNotExist, PermissionDenied from django.db import IntegrityError, transaction from django.utils import timezone from django.core.paginator import Paginator +from django.core.files.uploadedfile import SimpleUploadedFile from common.mastodon import mastodon_request_included from common.mastodon.api import check_visibility, post_toot, TootVisibilityEnum from common.mastodon.utils import rating_to_emoji @@ -454,5 +455,41 @@ def scrape(request): 'form': form, } ) + else: + return HttpResponseBadRequest() + + +@login_required +def click_to_scrape(request): + if request.method == "POST": + url = request.POST.get("url") + if url: + from common.scraper import scrape_douban_book + try: + scraped_book, raw_cover = scrape_douban_book(url) + except TimeoutError: + return render( + request, + 'common/error.html', + { + 'msg': _("爬取数据失败😫"), + } + ) + scraped_cover = {'cover': SimpleUploadedFile('temp.jpg', raw_cover)} + form = BookForm(scraped_book, scraped_cover) + if form.is_valid(): + form.instance.last_editor = request.user + form.save() + return redirect(reverse('books:retrieve', args=[form.instance.id])) + else: + return render( + request, + 'common/error.html', + { + 'msg': _("爬取数据失败😫"), + } + ) + else: + return HttpResponseBadRequest() else: return HttpResponseBadRequest() \ No newline at end of file diff --git a/common/forms.py b/common/forms.py index f3f1fd89..a13145cd 100644 --- a/common/forms.py +++ b/common/forms.py @@ -56,4 +56,20 @@ class RatingValidator: raise ValidationError( _('%(value)s is not an integer in range 1-10'), params={'value': value}, - ) \ No newline at end of file + ) + + +class ImageInput(forms.FileInput): + template_name = 'widgets/image.html' + def format_value(self, value): + """ + Return the file object if it has a defined url attribute. + """ + if self.is_initial(value): + return value + + def is_initial(self, value): + """ + Return whether value is considered to be initial value. + """ + return bool(value and getattr(value, 'url', False)) \ No newline at end of file diff --git a/common/mastodon/auth.py b/common/mastodon/auth.py index 676145b7..bb4e1c32 100644 --- a/common/mastodon/auth.py +++ b/common/mastodon/auth.py @@ -6,7 +6,6 @@ from .api import * def obtain_token(request, code): """ Returns token if success else None. """ - # TODO change http! payload = { 'client_id': CLIENT_ID, 'client_secret': CLIENT_SECRET, diff --git a/common/scraper.py b/common/scraper.py new file mode 100644 index 00000000..61a277d4 --- /dev/null +++ b/common/scraper.py @@ -0,0 +1,163 @@ +import requests +import random +from lxml import html +import re + + +RE_NUMBERS = re.compile(r"\d+\d*") +RE_WHITESPACES = re.compile(r"\s+") + +DEFAULT_REQUEST_HEADERS = { + 'Host': 'book.douban.com', + 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:70.0) Gecko/20100101 Firefox/70.0', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', + # well, since brotli lib is so bothering, remove `br` + 'Accept-Encoding': 'gzip, deflate', + 'Connection': 'keep-alive', + 'DNT': '1', + 'Upgrade-Insecure-Requests': '1', + 'Cache-Control': 'max-age=0', +} + +# in seconds +TIMEOUT = 10 + +# luminati account credentials +USERNAME = '***REMOVED***' +PASSWORD = '***REMOVED***' +PORT = 22225 + + +def scrape_douban_book(url): + session_id = random.random() + proxy_url = ('http://%s-country-cn-session-%s:%s@zproxy.lum-superproxy.io:%d' % + (USERNAME, session_id, PASSWORD, PORT)) + proxies = { + 'http': proxy_url, + 'https': proxy_url, + } + # r = requests.get(url, proxies=proxies, headers=DEFAULT_REQUEST_HEADERS, timeout=TIMEOUT) + r = requests.get(url, headers=DEFAULT_REQUEST_HEADERS, timeout=TIMEOUT) + + content = html.fromstring(r.content.decode('utf-8')) + + title = content.xpath("/html/body/div[3]/h1/span/text()")[0].strip() + + subtitle_elem = content.xpath("//div[@id='info']//span[text()='副标题:']/following::text()") + subtitle = subtitle_elem[0].strip() if subtitle_elem else None + + orig_title_elem = content.xpath("//div[@id='info']//span[text()='原作名:']/following::text()") + orig_title = orig_title_elem[0].strip() if orig_title_elem else None + + language_elem = content.xpath("//div[@id='info']//span[text()='语言:']/following::text()") + language = language_elem[0].strip() if language_elem else None + + pub_house_elem = content.xpath("//div[@id='info']//span[text()='出版社:']/following::text()") + pub_house = pub_house_elem[0].strip() if pub_house_elem else None + + pub_date_elem = content.xpath("//div[@id='info']//span[text()='出版年:']/following::text()") + pub_date = pub_date_elem[0].strip() if pub_date_elem else None + year_month_day = RE_NUMBERS.findall(pub_date) + if len(year_month_day) in (2, 3): + pub_year = int(year_month_day[0]) + pub_month = int(year_month_day[1]) + elif len(year_month_day) == 1: + pub_year = int(year_month_day[0]) + pub_month = None + else: + pub_year = None + pub_month = None + if pub_year and pub_month and pub_year < pub_month: + pub_year, pub_month = pub_month, pub_year + pub_year = None if pub_year is not None and not pub_year in range(0, 3000) else pub_year + pub_month = None if pub_month is not None and not pub_month in range(1, 12) else pub_month + + binding_elem = content.xpath("//div[@id='info']//span[text()='装帧:']/following::text()") + binding = binding_elem[0].strip() if binding_elem else None + + price_elem = content.xpath("//div[@id='info']//span[text()='定价:']/following::text()") + price = price_elem[0].strip() if price_elem else None + + pages_elem = content.xpath("//div[@id='info']//span[text()='页数:']/following::text()") + pages = pages_elem[0].strip() if pages_elem else None + pages = int(RE_NUMBERS.findall(pages)[0]) if RE_NUMBERS.findall(pages) else None + + isbn_elem = content.xpath("//div[@id='info']//span[text()='ISBN:']/following::text()") + isbn = isbn_elem[0].strip() if isbn_elem else None + + brief_elem = content.xpath("//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro']/p/text()") + brief = '\n'.join(p.strip() for p in brief_elem) if brief_elem else None + + img_url_elem = content.xpath("//*[@id='mainpic']/a/img/@src") + img_url = img_url_elem[0].strip() if img_url_elem else None + raw_img = None + if img_url: + img_response = requests.get( + img_url, + headers={ + 'accept': 'image/webp,image/apng,image/*,*/*;q=0.8', + 'accept-encoding': 'gzip, deflate', + 'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,fr-FR;q=0.6,fr;q=0.5,zh-TW;q=0.4', + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36 Edg/81.0.416.72', + 'cache-control': 'no-cache', + 'dnt': '1' , + }, + # proxies=proxies, + timeout=TIMEOUT, + ) + if img_response.status_code == 200: + raw_img = img_response.content + + # there are two html formats for authors and translators + authors_elem = content.xpath("""//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/ + preceding-sibling::a[preceding-sibling::span[text()='作者:']]/text()""") + if not authors_elem: + authors_elem = content.xpath("""//div[@id='info']//span[text()=' 作者']/following-sibling::a/text()""") + if authors_elem: + authors = [] + for author in authors_elem: + authors.append(RE_WHITESPACES.sub(' ', author.strip())) + else: + authors = None + + translators_elem = content.xpath("""//div[@id='info']//span[text()='译者:']/following-sibling::br[1]/ + preceding-sibling::a[preceding-sibling::span[text()='译者:']]/text()""") + if not translators_elem: + translators_elem = content.xpath("""//div[@id='info']//span[text()=' 译者']/following-sibling::a/text()""") + if translators_elem: + translators = [] + for translator in translators_elem: + translators.append(RE_WHITESPACES.sub(' ', translator.strip())) + else: + translators = None + + other = {} + cncode_elem = content.xpath("//div[@id='info']//span[text()='统一书号:']/following::text()") + if cncode_elem: + other['统一书号'] = cncode_elem[0].strip() + series_elem = content.xpath("//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()") + if series_elem: + other['丛书'] = series_elem[0].strip() + imprint_elem = content.xpath("//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()") + if imprint_elem: + other['出品方'] = imprint_elem[0].strip() + + data = { + 'title' : title, + 'subtitle' : subtitle, + 'orig_title' : orig_title, + 'author' : authors, + 'translator' : translators, + 'language' : language, + 'pub_house' : pub_house, + 'pub_year' : pub_year, + 'pub_month' : pub_month, + 'binding' : binding, + 'price' : price, + 'pages' : pages, + 'isbn' : isbn, + 'brief' : brief, + 'other_info' : other + } + return data, raw_img diff --git a/common/static/js/create_update.js b/common/static/js/create_update.js index 32a35c08..e83a624a 100644 --- a/common/static/js/create_update.js +++ b/common/static/js/create_update.js @@ -1,8 +1,8 @@ $(document).ready( function() { // assume there is only one input[file] on page - $("input[type='file']").each(function() { - $(this).after(''); - }) + // $("input[type='file']").each(function() { + // $(this).after(''); + // }) // mark required $("input[required]").each(function() { diff --git a/common/templates/widgets/image.html b/common/templates/widgets/image.html new file mode 100644 index 00000000..f469416d --- /dev/null +++ b/common/templates/widgets/image.html @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/users/views.py b/users/views.py index adfa12bc..db4aa14d 100644 --- a/users/views.py +++ b/users/views.py @@ -40,8 +40,13 @@ def OAuth2_login(request): request.session['new_user_token'] = token return redirect(reverse('users:register')) else: - # TODO better fail result page - return HttpResponse(content="Authentication failed.") + return render( + request, + 'common/error.html', + { + 'msg': _("认证失败😫") + } + ) else: return HttpResponseBadRequest() @@ -49,7 +54,6 @@ def OAuth2_login(request): # the 'login' page that user can see def login(request): if request.method == 'GET': - # TODO NOTE replace http with https!!!! auth_url = f"https://{MASTODON_DOMAIN_NAME}{API_OAUTH_AUTHORIZE}?" +\ f"client_id={CLIENT_ID}&scope=read+write&" +\ f"redirect_uri=https://{request.get_host()}{reverse('users:OAuth2_login')}" +\