add click-to-scrape feature | close #9

This commit is contained in:
doubaniux 2020-05-12 14:05:12 +08:00
parent 0d22ba4144
commit 62d89bb5bc
12 changed files with 249 additions and 20 deletions

4
.gitignore vendored
View file

@ -18,5 +18,5 @@ migrations/
*.sqlite3 *.sqlite3
# deployed media and static files # deployed media and static files
.media/ media/
.static/ static/

View file

@ -59,11 +59,13 @@ class BookForm(forms.ModelForm):
'brief': _("简介"), 'brief': _("简介"),
'other_info': _("其他信息"), 'other_info': _("其他信息"),
} }
from common.forms import ImageInput
widgets = { widgets = {
'author': forms.TextInput(attrs={'placeholder': _("多个作者使用英文逗号分隔")}), 'author': forms.TextInput(attrs={'placeholder': _("多个作者使用英文逗号分隔")}),
'translator': forms.TextInput(attrs={'placeholder': _("多个译者使用英文逗号分隔")}), 'translator': forms.TextInput(attrs={'placeholder': _("多个译者使用英文逗号分隔")}),
'other_info': KeyValueInput(), 'other_info': KeyValueInput(),
'cover': forms.FileInput(), # 'cover': forms.FileInput(),
'cover': ImageInput(),
} }
def clean_isbn(self): def clean_isbn(self):

View file

@ -5,9 +5,9 @@ $(document).ready( function() {
}); });
// assume there is only one input[file] on page // assume there is only one input[file] on page
$("input[type='file']").each(function() { // $("input[type='file']").each(function() {
$(this).after('<img src="#" alt="" id="previewImage" style="margin:10px 0; max-width:500px;"/>'); // $(this).after('<img src="#" alt="" id="previewImage" style="margin:10px 0; max-width:500px;"/>');
}); // });
// preview uploaded pic // preview uploaded pic
$("input[type='file']").change(function() { $("input[type='file']").change(function() {

View file

@ -48,8 +48,9 @@
<section id="content" class="container"> <section id="content" class="container">
<div class="row"> <div class="row">
<div id="main"> <div id="main">
<div class="set">
{% trans '根据豆瓣内容填写下方表单' %}
</div>
<iframe id='test' sandbox="allow-same-origin allow-scripts allow-popups allow-forms" src="https://search.douban.com/book/subject_search{% if q %}?search_text={{ q }}{% endif %}" frameborder="0"></iframe> <iframe id='test' sandbox="allow-same-origin allow-scripts allow-popups allow-forms" src="https://search.douban.com/book/subject_search{% if q %}?search_text={{ q }}{% endif %}" frameborder="0"></iframe>
<div class="dividing-line"></div> <div class="dividing-line"></div>
<div id="parser"> <div id="parser">
@ -81,11 +82,15 @@ ISBN: 9787020104345
<div id="aside"> <div id="aside">
<div class="aside-card"> <div class="aside-card">
<div class="add-nav"> <div class="add-nav">
<div class="set">
<div> {% trans '或者复制详情页链接' %}
{% trans '根据豆瓣内容填写下方表格!' %}
</div> </div>
<a href="{% url 'books:scrape' %}" class="button add-button submit">{% trans '剽取!' %}</a> <form action="{% url 'books:click_to_scrape' %}" method="post">
{% csrf_token %}
<input type="text" name="url" required placeholder="https://book.douban.com/subject/1000000/">
<input type="submit" class="button add-button" value="{% trans '一键取剽取!' %}">
</form>
</div> </div>
</div> </div>

View file

@ -17,4 +17,5 @@ urlpatterns = [
path('review/<int:id>/', retrieve_review, name='retrieve_review'), path('review/<int:id>/', retrieve_review, name='retrieve_review'),
path('<int:book_id>/review/list/', retrieve_review_list, name='retrieve_review_list'), path('<int:book_id>/review/list/', retrieve_review_list, name='retrieve_review_list'),
path('scrape/', scrape, name='scrape'), path('scrape/', scrape, name='scrape'),
path('click_to_scrape/', click_to_scrape, name='click_to_scrape'),
] ]

View file

@ -6,6 +6,7 @@ from django.core.exceptions import ObjectDoesNotExist, PermissionDenied
from django.db import IntegrityError, transaction from django.db import IntegrityError, transaction
from django.utils import timezone from django.utils import timezone
from django.core.paginator import Paginator from django.core.paginator import Paginator
from django.core.files.uploadedfile import SimpleUploadedFile
from common.mastodon import mastodon_request_included from common.mastodon import mastodon_request_included
from common.mastodon.api import check_visibility, post_toot, TootVisibilityEnum from common.mastodon.api import check_visibility, post_toot, TootVisibilityEnum
from common.mastodon.utils import rating_to_emoji from common.mastodon.utils import rating_to_emoji
@ -454,5 +455,41 @@ def scrape(request):
'form': form, 'form': form,
} }
) )
else:
return HttpResponseBadRequest()
@login_required
def click_to_scrape(request):
    """Scrape a douban book page from a user-supplied URL and save it as a Book.

    POST only; expects a ``url`` form field pointing at a douban book subject
    page. On success redirects to the new book's detail page; on scrape or
    form-validation failure renders the common error page; anything else is a
    400.
    """
    # Guard clauses instead of the nested if/else pyramid.
    if request.method != "POST":
        return HttpResponseBadRequest()
    url = request.POST.get("url")
    if not url:
        return HttpResponseBadRequest()
    # Local import keeps the heavy scraper module off the request hot path
    # for every other view in this file.
    from common.scraper import scrape_douban_book
    try:
        # NOTE(review): requests raises requests.exceptions.Timeout, which is
        # NOT a subclass of the builtin TimeoutError — confirm this except
        # clause actually catches scraper timeouts.
        scraped_book, raw_cover = scrape_douban_book(url)
    except TimeoutError:
        return render(
            request,
            'common/error.html',
            {
                'msg': _("爬取数据失败😫"),
            }
        )
    # The scraper returns raw_cover=None when the cover download failed;
    # SimpleUploadedFile requires bytes, so omit the file in that case and
    # let the form's own validation decide whether a cover is mandatory.
    if raw_cover is not None:
        scraped_cover = {'cover': SimpleUploadedFile('temp.jpg', raw_cover)}
    else:
        scraped_cover = {}
    form = BookForm(scraped_book, scraped_cover)
    if not form.is_valid():
        return render(
            request,
            'common/error.html',
            {
                'msg': _("爬取数据失败😫"),
            }
        )
    form.instance.last_editor = request.user
    form.save()
    return redirect(reverse('books:retrieve', args=[form.instance.id]))
else: else:
return HttpResponseBadRequest() return HttpResponseBadRequest()

View file

@ -56,4 +56,20 @@ class RatingValidator:
raise ValidationError( raise ValidationError(
_('%(value)s is not an integer in range 1-10'), _('%(value)s is not an integer in range 1-10'),
params={'value': value}, params={'value': value},
) )
class ImageInput(forms.FileInput):
    """File input widget rendered with a live image preview.

    Uses the ``widgets/image.html`` template, which shows the currently
    stored image (when one exists) next to the file picker.
    """
    template_name = 'widgets/image.html'

    def format_value(self, value):
        """Return the file object if it has a defined url attribute."""
        return value if self.is_initial(value) else None

    def is_initial(self, value):
        """Return whether value is considered to be initial value."""
        return bool(value and getattr(value, 'url', False))

View file

@ -6,7 +6,6 @@ from .api import *
def obtain_token(request, code): def obtain_token(request, code):
""" Returns token if success else None. """ """ Returns token if success else None. """
# TODO change http!
payload = { payload = {
'client_id': CLIENT_ID, 'client_id': CLIENT_ID,
'client_secret': CLIENT_SECRET, 'client_secret': CLIENT_SECRET,

163
common/scraper.py Normal file
View file

@ -0,0 +1,163 @@
import requests
import random
from lxml import html
import re
# Matches runs of digits (used to pull year/month out of 出版年, page counts, etc.).
RE_NUMBERS = re.compile(r"\d+\d*")
# Collapses any whitespace run to a single space (name normalization).
RE_WHITESPACES = re.compile(r"\s+")

# Browser-like headers so douban serves the normal desktop page.
DEFAULT_REQUEST_HEADERS = {
    'Host': 'book.douban.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:70.0) Gecko/20100101 Firefox/70.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    # well, since brotli lib is so bothering, remove `br`
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'DNT': '1',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0',
}

# in seconds
TIMEOUT = 10

# luminati account credentials
# NOTE(review): these values were scrubbed from history; load real credentials
# from the environment or Django settings instead of committing them.
USERNAME = '***REMOVED***'
PASSWORD = '***REMOVED***'
PORT = 22225
def _query_info_text(content, label):
    """Return the stripped text node following <span>{label}</span> in the
    #info block, or None when the field is absent from the page."""
    elem = content.xpath(
        "//div[@id='info']//span[text()='%s']/following::text()" % label)
    return elem[0].strip() if elem else None


def _query_people(content, colon_label, plain_label):
    """Return a list of normalized names for author/translator fields, or None.

    Douban renders these fields in two html formats; try the colon-labelled
    layout first, then the plain-span layout.
    """
    elems = content.xpath(
        "//div[@id='info']//span[text()='%(l)s']/following-sibling::br[1]/"
        "preceding-sibling::a[preceding-sibling::span[text()='%(l)s']]/text()"
        % {'l': colon_label})
    if not elems:
        elems = content.xpath(
            "//div[@id='info']//span[text()='%s']/following-sibling::a/text()"
            % plain_label)
    if not elems:
        return None
    return [RE_WHITESPACES.sub(' ', e.strip()) for e in elems]


def scrape_douban_book(url):
    """Scrape a douban book subject page.

    Returns ``(data, raw_img)`` where ``data`` is a dict keyed to match the
    BookForm fields and ``raw_img`` is the raw bytes of the cover image, or
    None when no cover could be downloaded. Network errors/timeouts from
    ``requests`` propagate to the caller.
    """
    session_id = random.random()
    # NOTE(review): the proxy is currently disabled (the request below does not
    # pass `proxies`); kept here so it can be re-enabled by swapping the lines.
    proxy_url = ('http://%s-country-cn-session-%s:%s@zproxy.lum-superproxy.io:%d' %
                 (USERNAME, session_id, PASSWORD, PORT))
    proxies = {
        'http': proxy_url,
        'https': proxy_url,
    }
    # r = requests.get(url, proxies=proxies, headers=DEFAULT_REQUEST_HEADERS, timeout=TIMEOUT)
    r = requests.get(url, headers=DEFAULT_REQUEST_HEADERS, timeout=TIMEOUT)
    content = html.fromstring(r.content.decode('utf-8'))

    title = content.xpath("/html/body/div[3]/h1/span/text()")[0].strip()
    subtitle = _query_info_text(content, '副标题:')
    orig_title = _query_info_text(content, '原作名:')
    language = _query_info_text(content, '语言:')
    pub_house = _query_info_text(content, '出版社:')

    pub_date = _query_info_text(content, '出版年:')
    # Guard: the 出版年 field may be missing entirely, in which case pub_date
    # is None and findall() would raise TypeError.
    year_month_day = RE_NUMBERS.findall(pub_date) if pub_date else []
    if len(year_month_day) in (2, 3):
        pub_year = int(year_month_day[0])
        pub_month = int(year_month_day[1])
    elif len(year_month_day) == 1:
        pub_year = int(year_month_day[0])
        pub_month = None
    else:
        pub_year = None
        pub_month = None
    # Some pages print "月-年" order; swap when the "year" is implausibly small.
    if pub_year and pub_month and pub_year < pub_month:
        pub_year, pub_month = pub_month, pub_year
    # Sanity-check ranges. Months use range(1, 13) so December (12) is valid.
    pub_year = None if pub_year is not None and pub_year not in range(0, 3000) else pub_year
    pub_month = None if pub_month is not None and pub_month not in range(1, 13) else pub_month

    binding = _query_info_text(content, '装帧:')
    price = _query_info_text(content, '定价:')

    # Pages text like "共 300 页"; field may be missing (None) — guard before
    # regexing, and only run findall once.
    pages = _query_info_text(content, '页数:')
    if pages is not None:
        page_numbers = RE_NUMBERS.findall(pages)
        pages = int(page_numbers[0]) if page_numbers else None

    isbn = _query_info_text(content, 'ISBN:')

    brief_elem = content.xpath(
        "//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro']/p/text()")
    brief = '\n'.join(p.strip() for p in brief_elem) if brief_elem else None

    img_url_elem = content.xpath("//*[@id='mainpic']/a/img/@src")
    img_url = img_url_elem[0].strip() if img_url_elem else None
    raw_img = None
    if img_url:
        img_response = requests.get(
            img_url,
            headers={
                'accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
                'accept-encoding': 'gzip, deflate',
                'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,fr-FR;q=0.6,fr;q=0.5,zh-TW;q=0.4',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36 Edg/81.0.416.72',
                'cache-control': 'no-cache',
                'dnt': '1' ,
            },
            # proxies=proxies,
            timeout=TIMEOUT,
        )
        if img_response.status_code == 200:
            raw_img = img_response.content

    # there are two html formats for authors and translators
    authors = _query_people(content, '作者:', ' 作者')
    translators = _query_people(content, '译者:', ' 译者')

    # Optional extras collected into the free-form other_info dict.
    other = {}
    cncode = _query_info_text(content, '统一书号:')
    if cncode:
        other['统一书号'] = cncode
    series_elem = content.xpath(
        "//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()")
    if series_elem:
        other['丛书'] = series_elem[0].strip()
    imprint_elem = content.xpath(
        "//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()")
    if imprint_elem:
        other['出品方'] = imprint_elem[0].strip()

    data = {
        'title' : title,
        'subtitle' : subtitle,
        'orig_title' : orig_title,
        'author' : authors,
        'translator' : translators,
        'language' : language,
        'pub_house' : pub_house,
        'pub_year' : pub_year,
        'pub_month' : pub_month,
        'binding' : binding,
        'price' : price,
        'pages' : pages,
        'isbn' : isbn,
        'brief' : brief,
        'other_info' : other
    }
    return data, raw_img

View file

@ -1,8 +1,8 @@
$(document).ready( function() { $(document).ready( function() {
// assume there is only one input[file] on page // assume there is only one input[file] on page
$("input[type='file']").each(function() { // $("input[type='file']").each(function() {
$(this).after('<img src="#" alt="" id="previewImage" style="margin:10px 0; max-width:500px;"/>'); // $(this).after('<img src="#" alt="" id="previewImage" style="margin:10px 0; max-width:500px;"/>');
}) // })
// mark required // mark required
$("input[required]").each(function() { $("input[required]").each(function() {

View file

@ -0,0 +1,2 @@
<input type="{{ widget.type }}" name="{{ widget.name }}"{% include "django/forms/widgets/attrs.html" %}>
<img src="{{ widget.value|default_if_none:''|stringformat:'s' }}" alt="" id="previewImage" style="margin:10px 0; max-width:500px;">

View file

@ -40,8 +40,13 @@ def OAuth2_login(request):
request.session['new_user_token'] = token request.session['new_user_token'] = token
return redirect(reverse('users:register')) return redirect(reverse('users:register'))
else: else:
# TODO better fail result page return render(
return HttpResponse(content="Authentication failed.") request,
'common/error.html',
{
'msg': _("认证失败😫")
}
)
else: else:
return HttpResponseBadRequest() return HttpResponseBadRequest()
@ -49,7 +54,6 @@ def OAuth2_login(request):
# the 'login' page that user can see # the 'login' page that user can see
def login(request): def login(request):
if request.method == 'GET': if request.method == 'GET':
# TODO NOTE replace http with https!!!!
auth_url = f"https://{MASTODON_DOMAIN_NAME}{API_OAUTH_AUTHORIZE}?" +\ auth_url = f"https://{MASTODON_DOMAIN_NAME}{API_OAUTH_AUTHORIZE}?" +\
f"client_id={CLIENT_ID}&scope=read+write&" +\ f"client_id={CLIENT_ID}&scope=read+write&" +\
f"redirect_uri=https://{request.get_host()}{reverse('users:OAuth2_login')}" +\ f"redirect_uri=https://{request.get_host()}{reverse('users:OAuth2_login')}" +\