import douban review

This commit is contained in:
Your Name 2022-05-03 20:46:06 -04:00
parent 2f7dd18bbb
commit 9e05e01c6c
12 changed files with 281 additions and 1 deletions

View file

@ -22,6 +22,7 @@
<script src="{% static 'js/rating-star-readonly.js' %}"></script>
<link rel="stylesheet" href="{% static 'lib/css/rating-star.css' %}">
<link rel="stylesheet" href="{% static 'css/boofilsic.min.css' %}">
<link rel="stylesheet" href="{% static 'lib/css/neo.css' %}">
</head>
<body>

202
common/importers/douban.py Normal file
View file

@ -0,0 +1,202 @@
import openpyxl
import requests
import re
from lxml import html
from markdownify import markdownify as md
from datetime import datetime
from common.scraper import get_scraper_by_url
import logging
import pytz
from django.conf import settings
from django.core.exceptions import ObjectDoesNotExist
from user_messages import api as msg
import django_rq
from common.utils import GenerateDateUUIDMediaFilePath
import os
from books.models import BookReview, Book
from movies.models import MovieReview, Movie
from music.models import AlbumReview, Album
from games.models import GameReview, Game
from common.scraper import DoubanAlbumScraper, DoubanBookScraper, DoubanGameScraper, DoubanMovieScraper
from PIL import Image
from io import BytesIO
import filetype
logger = logging.getLogger(__name__)
def fetch_remote_image(url):
    """Download a remote image into local media storage.

    The image is fetched through Scrapestack or ScraperAPI when the matching
    key is configured in settings, otherwise directly. It is decoded with
    Pillow to verify it, then saved under ``settings.MEDIA_ROOT`` with an
    extension derived from the response's ``Content-Type`` header.

    This is a best-effort helper: on any failure (network error, corrupted
    image, missing or unknown content type, ...) the original ``url`` is
    returned unchanged so the caller can keep the remote reference.

    Args:
        url: Address of the remote image.

    Returns:
        The local media URL of the saved copy, or ``url`` itself on failure.
    """
    try:
        print(f'fetching remote image {url}')
        timeout = settings.SCRAPING_TIMEOUT
        # Hand the target address to the proxy through ``params`` so that
        # requests percent-encodes it; splicing it into the query string
        # verbatim breaks on any URL that itself contains '&', '?' or '#'.
        if settings.SCRAPESTACK_KEY is not None:
            img_response = requests.get(
                'http://api.scrapestack.com/scrape',
                params={'access_key': settings.SCRAPESTACK_KEY, 'url': url},
                timeout=timeout,
            )
        elif settings.SCRAPERAPI_KEY is not None:
            img_response = requests.get(
                'http://api.scraperapi.com',
                params={'api_key': settings.SCRAPERAPI_KEY, 'url': url},
                timeout=timeout,
            )
        else:
            img_response = requests.get(url, timeout=timeout)
        img = Image.open(BytesIO(img_response.content))
        img.load()  # corrupted image will trigger exception
        # A missing header or an unknown MIME type raises here and is handled
        # by the generic fallback below.
        content_type = img_response.headers.get('Content-Type')
        mime = content_type.partition(';')[0].strip()
        extension = filetype.get_type(mime=mime).extension
        relative_path = GenerateDateUUIDMediaFilePath(
            None, "x." + extension, settings.MARKDOWNX_MEDIA_PATH)
        file = settings.MEDIA_ROOT + relative_path
        local_url = settings.MEDIA_URL + relative_path
        os.makedirs(os.path.dirname(file), exist_ok=True)
        img.save(file)
        print(f'remote image saved as {local_url}')
        return local_url
    except Exception:
        # Deliberate catch-all: keeping the remote URL is better than
        # failing the whole review import over one image.
        print(f'unable to fetch remote image {url}')
        return url
class DoubanImporter:
    """Import Douban reviews from an uploaded ``.xlsx`` backup file.

    The upload is validated and stored by :meth:`import_from_file`, which
    then enqueues :meth:`import_from_file_task` on the ``doufen`` RQ queue.
    Progress is mirrored into ``user.preference.import_status`` so it can be
    shown to the user.

    ``douban_pending`` status codes written by this class:
        2 - file stored, job queued
        1 - import running
        0 - import finished
    """

    # Progress counters. Ints are immutable, so class-level defaults are safe.
    total = 0
    skipped = 0
    imported = 0
    # Review URLs that could not be imported. The real list is created per
    # instance in __init__; a class-level list would be shared (and keep
    # growing) across every importer in the same process.
    failed = None
    user = None
    visibility = 0
    file = None

    def __init__(self, user, visibility):
        """Create an importer for ``user``; new reviews get ``visibility``."""
        self.user = user
        self.visibility = visibility
        self.failed = []

    def update_user_import_status(self, status):
        """Persist ``status`` and the current counters to the user's preference."""
        import_status = self.user.preference.import_status
        import_status['douban_pending'] = status
        import_status['douban_file'] = self.file
        import_status['douban_visibility'] = self.visibility
        import_status['douban_total'] = self.total
        import_status['douban_skipped'] = self.skipped
        import_status['douban_imported'] = self.imported
        import_status['douban_failed'] = self.failed
        self.user.preference.save(update_fields=['import_status'])

    def import_from_file(self, uploaded_file):
        """Validate and store an uploaded workbook, then queue the import.

        Args:
            uploaded_file: The uploaded file object; it must be readable by
                openpyxl and provide ``chunks()``.

        Returns:
            True when the file was stored and the job enqueued, False when
            anything went wrong (for example the file is not a workbook).
        """
        try:
            # Opening the workbook is only a sanity check of the upload.
            wb = openpyxl.open(uploaded_file, read_only=True, data_only=True, keep_links=False)
            wb.close()
            file = settings.MEDIA_ROOT + GenerateDateUUIDMediaFilePath(None, "x.xlsx", settings.SYNC_FILE_PATH_ROOT)
            os.makedirs(os.path.dirname(file), exist_ok=True)
            with open(file, 'wb') as destination:
                for chunk in uploaded_file.chunks():
                    destination.write(chunk)
            self.file = file
            self.update_user_import_status(2)
            django_rq.get_queue('doufen').enqueue(self.import_from_file_task)
        except Exception:
            # The caller only needs a yes/no answer, but keep a trace so
            # storage or queue problems are not silently lost.
            logger.exception(f'{self.user} unable to accept douban import file')
            return False
        return True

    def import_from_file_task(self):
        """Background job: import every known review sheet of the stored file."""
        msg.info(self.user, '开始导入豆瓣评论')
        self.update_user_import_status(1)
        sheets = (
            ('书评', DoubanBookScraper, Book, BookReview),
            ('影评', DoubanMovieScraper, Movie, MovieReview),
            ('乐评', DoubanAlbumScraper, Album, AlbumReview),
            ('游戏评论&攻略', DoubanGameScraper, Game, GameReview),
        )
        try:
            with open(self.file, 'rb') as f:
                wb = openpyxl.load_workbook(f, read_only=True, data_only=True, keep_links=False)
                try:
                    for sheet_name, scraper, entity_class, review_class in sheets:
                        # Indexing a missing sheet raises KeyError; pass
                        # None instead so import_sheet can skip it.
                        worksheet = wb[sheet_name] if sheet_name in wb.sheetnames else None
                        self.import_sheet(worksheet, scraper, entity_class, review_class)
                finally:
                    wb.close()
        finally:
            # Always leave the "running" state, even if the job crashed;
            # otherwise the status would stay pending forever.
            self.update_user_import_status(0)
        msg.success(self.user, f'豆瓣评论导入完成,共处理{self.total}篇,已存在{self.skipped}篇,新增{self.imported}篇。')
        if self.failed:
            # A row with an empty URL cell is recorded as None; stringify so
            # join() cannot raise.
            failed_urls = " , ".join(str(u) for u in self.failed)
            msg.error(self.user, f'豆瓣评论导入时未能处理以下网址:\n{failed_urls}')

    @staticmethod
    def _parse_review_time(value):
        """Turn a time cell into an aware datetime, or None.

        Accepts ``"%Y-%m-%d %H:%M:%S"`` strings and datetime objects. Naive
        values are interpreted as Asia/Shanghai time. Empty or unparseable
        values yield None instead of aborting the import.
        """
        if not value:
            return None
        if isinstance(value, datetime):
            parsed = value
        else:
            try:
                parsed = datetime.strptime(str(value).strip(), "%Y-%m-%d %H:%M:%S")
            except ValueError:
                print(f'unable to parse review time {value!r}')
                return None
        if parsed.tzinfo is not None:
            return parsed
        # pytz zones must be attached with localize(); replace(tzinfo=...)
        # would silently use the zone's LMT offset (+08:06) instead of +08:00.
        return pytz.timezone('Asia/Shanghai').localize(parsed)

    def import_sheet(self, worksheet, scraper, entity_class, review_class):
        """Import every data row of ``worksheet`` (the first row is a header).

        Columns read: 0 = title, 2 = review URL, 3 = time, 6 = content.
        Counters are updated and persisted after each processed row.
        """
        prefix = f'{self.user} {review_class.__name__} |'
        if worksheet is None:
            print(f'{prefix} empty sheet')
            return
        for row in worksheet.iter_rows(min_row=2, values_only=True):
            cells = list(row)
            # Index 6 is read below, so at least seven columns are required.
            if len(cells) < 7:
                continue
            title = cells[0] or ""
            review_url = cells[2]
            time = self._parse_review_time(cells[3])
            content = cells[6] or ""
            self.total += 1
            r = self.import_review(title, review_url, content, time, scraper, entity_class, review_class)
            if r == 1:
                self.imported += 1
            elif r == 2:
                self.skipped += 1
            else:
                self.failed.append(review_url)
            self.update_user_import_status(1)

    def import_review(self, title, review_url, content, time, scraper, entity_class, review_class):
        """Import one review.

        The review page is fetched to find the URL of the reviewed item. The
        item is looked up by ``source_url`` and scraped if it is not stored
        yet. The review's HTML content is converted to Markdown and its
        images are copied locally.

        Returns:
            1 when a review was created, 2 when the user already has a review
            for the item, None on failure.
        """
        prefix = f'{self.user} {review_class.__name__} |'
        url = None
        print(f'{prefix} fetching {review_url}')
        try:
            if settings.SCRAPESTACK_KEY is not None:
                # Let requests percent-encode the target URL for the proxy.
                r = requests.get(
                    'http://api.scrapestack.com/scrape',
                    params={'access_key': settings.SCRAPESTACK_KEY, 'url': review_url},
                    timeout=settings.SCRAPING_TIMEOUT,
                )
            else:
                r = requests.get(review_url, timeout=settings.SCRAPING_TIMEOUT)
            if r.status_code != 200:
                print(f'{prefix} fetching error {review_url} {r.status_code}')
                return
            h = html.fromstring(r.content.decode('utf-8'))
            # Keep the last header link that points at a subject page.
            for u in h.xpath("//header[@class='main-hd']/a/@href"):
                if '.douban.com/subject/' in u:
                    url = u
            if not url:
                print(f'{prefix} fetching error {review_url} unable to locate url')
                return
        except Exception:
            print(f'{prefix} fetching exception {review_url}')
            return
        try:
            entity = entity_class.objects.get(source_url=url)
            print(f'{prefix} matched {url}')
        except ObjectDoesNotExist:
            try:
                print(f'{prefix} scraping {url}')
                scraper.scrape(url)
                form = scraper.save(request_user=self.user)
                entity = form.instance
            except Exception as e:
                print(f"{prefix} scrape failed: {url} {e}")
                logger.error(f"{prefix} scrape failed: {url}", exc_info=e)
                return
        # The foreign-key field is named after the lower-cased entity class.
        entity_field = entity_class.__name__.lower()
        if review_class.objects.filter(**{'owner': self.user, entity_field: entity}).exists():
            return 2
        # Rewrite the bold spans and image captions as plain tags before the
        # Markdown conversion.
        content = re.sub(r'<span style="font-weight: bold;">([^<]+)</span>', r'<b>\1</b>', content)
        content = re.sub(r'<div class="image-caption">([^<]+)</div>', r'<br><i>\1</i><br>', content)
        content = md(content)
        # Replace every Markdown image target with a locally stored copy.
        content = re.sub(r'(?<=!\[\]\()([^)]+)(?=\))', lambda x: fetch_remote_image(x[1]), content)
        params = {
            'owner': self.user,
            'created_time': time,
            'edited_time': time,
            'title': title,
            'content': content,
            'visibility': self.visibility,
            entity_field: entity,
        }
        review_class.objects.create(**params)
        return 1

View file

@ -1,3 +1,15 @@
.markdownx-preview h1 {
font-size: 2.5em;
}
.markdownx-preview h2 {
font-size: 2.0em;
}
.markdownx-preview h3 {
font-size: 1.6em;
}
.collection-item-position-edit {
float: right;
}

View file

@ -24,6 +24,7 @@
<script src="{% static 'js/rating-star-readonly.js' %}"></script>
<link rel="stylesheet" href="{% static 'lib/css/rating-star.css' %}">
<link rel="stylesheet" href="{% static 'css/boofilsic.min.css' %}">
<link rel="stylesheet" href="{% static 'lib/css/neo.css' %}">
</head>
<body>

View file

@ -24,6 +24,7 @@
<script src="{% static 'js/rating-star-readonly.js' %}"></script>
<link rel="stylesheet" href="{% static 'lib/css/rating-star.css' %}">
<link rel="stylesheet" href="{% static 'css/boofilsic.min.css' %}">
<link rel="stylesheet" href="{% static 'lib/css/neo.css' %}">
</head>
<body>

View file

@ -24,6 +24,7 @@
<script src="{% static 'js/rating-star-readonly.js' %}"></script>
<link rel="stylesheet" href="{% static 'lib/css/rating-star.css' %}">
<link rel="stylesheet" href="{% static 'css/boofilsic.min.css' %}">
<link rel="stylesheet" href="{% static 'lib/css/neo.css' %}">
</head>
<body>

View file

@ -24,6 +24,7 @@
<script src="{% static 'js/rating-star-readonly.js' %}"></script>
<link rel="stylesheet" href="{% static 'lib/css/rating-star.css' %}">
<link rel="stylesheet" href="{% static 'css/boofilsic.min.css' %}">
<link rel="stylesheet" href="{% static 'lib/css/neo.css' %}">
</head>
<body>

View file

@ -19,3 +19,4 @@ tqdm
opencc
dnspython
typesense
markdownify

View file

@ -114,6 +114,7 @@ class Preference(models.Model):
default=list,
)
export_status = models.JSONField(blank=True, null=True, encoder=DjangoJSONEncoder, default=dict)
import_status = models.JSONField(blank=True, null=True, encoder=DjangoJSONEncoder, default=dict)
mastodon_publish_public = models.BooleanField(null=False, default=False)
mastodon_append_tag = models.CharField(max_length=2048, default='')

View file

@ -145,6 +145,51 @@
</div>
</div>
<div class="main-section-wrapper">
<div class="tools-section-wrapper">
<div class="import-panel">
<h5 class="import-panel__label">{% trans '导入豆瓣评论数据' %}</h5>
<div class="import-panel__body">
<form action="{% url 'users:import_douban' %}" method="POST" enctype="multipart/form-data" >
{% csrf_token %}
<div class="import-panel__checkbox">
<p><a href="https://doufen.org" target="_blank">豆伴(豆坟)</a>备份导出的.xlsx文件:
<input type="file" name="file" id="excel" required accept=".xlsx">
</p>
<p>可见性:
<label for="id_visibility_0"><input type="radio" name="visibility" value="0" required="" id="id_visibility_0" checked>
公开</label>
<label for="id_visibility_1"><input type="radio" name="visibility" value="1" required="" id="id_visibility_1">
仅关注者</label>
<label for="id_visibility_2"><input type="radio" name="visibility" value="2" required="" id="id_visibility_2">
仅自己</label>
</p>
{% if import_status.douban_pending %}
<input type="submit" class="import-panel__button" value="{% trans '备份文件已上传' %}" disabled />
{% else %}
<input type="submit" class="import-panel__button" value="{% trans '导入' %}"/>
{% endif %}
{% if import_status.douban_pending == 2 %}
正在等待
{% elif import_status.douban_total %}
{% if import_status.douban_pending == 1 %}
正在导入,目前已
{% else %}
上次共计
{% endif %}
处理{{ import_status.douban_total }}篇,其中已存在{{ import_status.douban_skipped }}篇,新增{{ import_status.douban_imported }}篇
{% endif %}
</div>
<div>
NeoDB中已经存在的标记、短评和评论不会被覆盖;匿名用户看不到的评论目前无法导入。
</div>
</form>
</div>
</div>
</div>
</div>
<div class="main-section-wrapper">
<div class="tools-section-wrapper">
<div class="import-panel">

View file

@ -9,6 +9,7 @@ urlpatterns = [
path('reconnect/', reconnect, name='reconnect'),
path('data/', data, name='data'),
path('data/import_goodreads', import_goodreads, name='import_goodreads'),
path('data/import_douban', import_douban, name='import_douban'),
path('data/export_reviews', export_reviews, name='export_reviews'),
path('data/export_marks', export_marks, name='export_marks'),
path('data/sync_mastodon', sync_mastodon, name='sync_mastodon'),

View file

@ -39,6 +39,7 @@ from games.models import GameMark, GameReview
from music.models import AlbumMark, SongMark, AlbumReview, SongReview
from collection.models import Collection
from common.importers.goodreads import GoodreadsImporter
from common.importers.douban import DoubanImporter
# Views
@ -1060,6 +1061,7 @@ def preferences(request):
def data(request):
    """Render the data page with the user's latest sync task and import/export status."""
    user = request.user
    context = {
        # Most recent sync task, if the user has any.
        'latest_task': user.user_synctasks.order_by("-id").first(),
        'import_status': user.preference.import_status,
        'export_status': user.preference.export_status
    }
    return render(request, 'users/data.html', context)
@ -1152,7 +1154,18 @@ def import_goodreads(request):
if request.method == 'POST':
raw_url = request.POST.get('url')
if GoodreadsImporter.import_from_url(raw_url, request.user):
messages.add_message(request, messages.INFO, _('开始后台导入。'))
messages.add_message(request, messages.INFO, _('链接已保存,等待后台导入。'))
else:
messages.add_message(request, messages.ERROR, _('无法识别链接。'))
return redirect(reverse("users:data"))
@login_required
def import_douban(request):
    """Accept an uploaded Douban backup file and queue it for background import.

    On POST the uploaded ``file`` is handed to ``DoubanImporter`` together
    with the chosen ``visibility``. A success or error message is added for
    the user. Every request ends with a redirect to the data page.
    """
    if request.method == 'POST':
        # Use .get(): indexing request.FILES raises when the form was
        # submitted without a file, which would surface as a server error.
        uploaded_file = request.FILES.get('file')
        importer = DoubanImporter(request.user, request.POST.get('visibility'))
        if uploaded_file is not None and importer.import_from_file(uploaded_file):
            messages.add_message(request, messages.INFO, _('文件上传成功,等待后台导入。'))
        else:
            messages.add_message(request, messages.ERROR, _('无法识别文件。'))
    return redirect(reverse("users:data"))