diff --git a/books/templates/books/review_detail.html b/books/templates/books/review_detail.html
index d3eba529..da3fbaa3 100644
--- a/books/templates/books/review_detail.html
+++ b/books/templates/books/review_detail.html
@@ -22,6 +22,7 @@
+
diff --git a/common/importers/douban.py b/common/importers/douban.py
new file mode 100644
index 00000000..3f01b2c7
--- /dev/null
+++ b/common/importers/douban.py
@@ -0,0 +1,202 @@
+import openpyxl
+import requests
+import re
+from lxml import html
+from markdownify import markdownify as md
+from datetime import datetime
+from common.scraper import get_scraper_by_url
+import logging
+import pytz
+from django.conf import settings
+from django.core.exceptions import ObjectDoesNotExist
+from user_messages import api as msg
+import django_rq
+from common.utils import GenerateDateUUIDMediaFilePath
+import os
+from books.models import BookReview, Book
+from movies.models import MovieReview, Movie
+from music.models import AlbumReview, Album
+from games.models import GameReview, Game
+from common.scraper import DoubanAlbumScraper, DoubanBookScraper, DoubanGameScraper, DoubanMovieScraper
+from PIL import Image
+from io import BytesIO
+import filetype
+
+
+logger = logging.getLogger(__name__)
+
+
+def fetch_remote_image(url):
+    try:
+        print(f'fetching remote image {url}')
+        raw_img = None
+        ext = None
+        if settings.SCRAPESTACK_KEY is not None:
+            dl_url = f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={url}'
+        elif settings.SCRAPERAPI_KEY is not None:
+            dl_url = f'http://api.scraperapi.com?api_key={settings.SCRAPERAPI_KEY}&url={url}'
+        else:
+            dl_url = url
+        img_response = requests.get(dl_url, timeout=settings.SCRAPING_TIMEOUT)
+        raw_img = img_response.content
+        img = Image.open(BytesIO(raw_img))
+        img.load()  # corrupted image will trigger exception
+        content_type = img_response.headers.get('Content-Type')
+        ext = filetype.get_type(mime=content_type.partition(';')[0].strip()).extension
+        f = GenerateDateUUIDMediaFilePath(None, "x." + ext, settings.MARKDOWNX_MEDIA_PATH)
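+        # save the image under MEDIA_ROOT and hand back its MEDIA_URL path,
+        # so callers can swap the remote URL for a locally hosted copy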
+        file = settings.MEDIA_ROOT + f
+        local_url = settings.MEDIA_URL + f
+        os.makedirs(os.path.dirname(file), exist_ok=True)
+        img.save(file)
+        print(f'remote image saved as {local_url}')
+        return local_url
+    except Exception:
+        print(f'unable to fetch remote image {url}')
+        return url
+
+
+class DoubanImporter:
+    total = 0
+    skipped = 0
+    imported = 0
+    failed = []
+    user = None
+    visibility = 0
+    file = None
+
+    def __init__(self, user, visibility):
+        self.user = user
+        self.visibility = visibility
+
+    def update_user_import_status(self, status):
+        self.user.preference.import_status['douban_pending'] = status
+        self.user.preference.import_status['douban_file'] = self.file
+        self.user.preference.import_status['douban_visibility'] = self.visibility
+        self.user.preference.import_status['douban_total'] = self.total
+        self.user.preference.import_status['douban_skipped'] = self.skipped
+        self.user.preference.import_status['douban_imported'] = self.imported
+        self.user.preference.import_status['douban_failed'] = self.failed
+        self.user.preference.save(update_fields=['import_status'])
+
+    def import_from_file(self, uploaded_file):
+        try:
+            wb = openpyxl.open(uploaded_file, read_only=True, data_only=True, keep_links=False)
+            wb.close()
+            file = settings.MEDIA_ROOT + GenerateDateUUIDMediaFilePath(None, "x.xlsx", settings.SYNC_FILE_PATH_ROOT)
+            os.makedirs(os.path.dirname(file), exist_ok=True)
+            with open(file, 'wb') as destination:
+                for chunk in uploaded_file.chunks():
+                    destination.write(chunk)
+            self.file = file
+            self.update_user_import_status(2)
+            django_rq.get_queue('doufen').enqueue(self.import_from_file_task)
+        except Exception:
+            return False
+        # self.import_from_file_task(file, user, visibility)
+        return True
+
+    def import_from_file_task(self):
+        msg.info(self.user, f'开始导入豆瓣评论')
+        self.update_user_import_status(1)
+        f = open(self.file, 'rb')
+        wb = openpyxl.load_workbook(f, read_only=True, data_only=True, keep_links=False)
+        self.import_sheet(wb['书评'], DoubanBookScraper, Book, BookReview)
+        self.import_sheet(wb['影评'], DoubanMovieScraper, Movie, MovieReview)
+        self.import_sheet(wb['乐评'], DoubanAlbumScraper, Album, AlbumReview)
+        self.import_sheet(wb['游戏评论&攻略'], DoubanGameScraper, Game, GameReview)
+        self.update_user_import_status(0)
+        msg.success(self.user, f'豆瓣评论导入完成,共处理{self.total}篇,已存在{self.skipped}篇,新增{self.imported}篇。')
+        if len(self.failed):
+            msg.error(self.user, f'豆瓣评论导入时未能处理以下网址:\n{" , ".join(self.failed)}')
+
+    def import_sheet(self, worksheet, scraper, entity_class, review_class):
+        prefix = f'{self.user} {review_class.__name__} |'
+        if worksheet is None:  # or worksheet.max_row < 2:
+            print(f'{prefix} empty sheet')
+            return
+        for row in worksheet.iter_rows(min_row=2, values_only=True):
+            cells = [cell for cell in row]
+            if len(cells) < 7:  # content is read from the 7th column below
+                continue
+            title = cells[0]
+            review_url = cells[2]
+            time = cells[3]
+            content = cells[6]
+            self.total += 1
+            if time:
+                time = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
+                tz = pytz.timezone('Asia/Shanghai')
+                time = time.replace(tzinfo=tz)
+            else:
+                time = None
+            if not content:
+                content = ""
+            if not title:
+                title = ""
+            r = self.import_review(title, review_url, content, time, scraper, entity_class, review_class)
+            if r == 1:
+                self.imported += 1
+            elif r == 2:
+                self.skipped += 1
+            else:
+                self.failed.append(review_url)
+            self.update_user_import_status(1)
+
+    def import_review(self, title, review_url, content, time, scraper, entity_class, review_class):
+        # return 1: done / 2: skipped / None: failed
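+        # overall flow: resolve the review page to its douban subject URL,
+        # match or scrape the subject into the local database, skip if this
+        # user already reviewed it, then convert the HTML body and save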
+        prefix = f'{self.user} {review_class.__name__} |'
+        url = None
+        print(f'{prefix} fetching {review_url}')
+        try:
+            if settings.SCRAPESTACK_KEY is not None:
+                _review_url = f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={review_url}'
+            else:
+                _review_url = review_url
+            r = requests.get(_review_url, timeout=settings.SCRAPING_TIMEOUT)
+            if r.status_code != 200:
+                print(f'{prefix} fetching error {review_url} {r.status_code}')
+                return
+            h = html.fromstring(r.content.decode('utf-8'))
+            for u in h.xpath("//header[@class='main-hd']/a/@href"):
+                if '.douban.com/subject/' in u:
+                    url = u
+            if not url:
+                print(f'{prefix} fetching error {review_url} unable to locate url')
+                return
+        except Exception:
+            print(f'{prefix} fetching exception {review_url}')
+            return
+        try:
+            entity = entity_class.objects.get(source_url=url)
+            print(f'{prefix} matched {url}')
+        except ObjectDoesNotExist:
+            try:
+                print(f'{prefix} scraping {url}')
+                scraper.scrape(url)
+                form = scraper.save(request_user=self.user)
+                entity = form.instance
+            except Exception as e:
+                print(f"{prefix} scrape failed: {url} {e}")
+                logger.error(f"{prefix} scrape failed: {url}", exc_info=e)
+                return
+        params = {
+            'owner': self.user,
+            entity_class.__name__.lower(): entity
+        }
+        if review_class.objects.filter(**params).exists():
+            return 2
+        # normalize douban's bold spans and image captions before markdown conversion
+        content = re.sub(r'<span style="font-weight: bold;">([^<]+)</span>', r'<b>\1</b>', content)
+        content = re.sub(r'<div class="image-caption">([^<]+)</div>', r'<br><i>\1</i><br>', content)
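+        # markdownify the remaining HTML, then mirror every inline image
+        # (the URL captured between "![](" and ")") to local media storage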
+        content = md(content)
+        content = re.sub(r'(?<=!\[\]\()([^)]+)(?=\))', lambda x: fetch_remote_image(x[1]), content)
+        params = {
+            'owner': self.user,
+            'created_time': time,
+            'edited_time': time,
+            'title': title,
+            'content': content,
+            'visibility': self.visibility,
+            entity_class.__name__.lower(): entity,
+        }
+        review_class.objects.create(**params)
+        return 1
diff --git a/common/static/lib/css/neo.css b/common/static/lib/css/neo.css
index b12fa96c..c5a197e1 100644
--- a/common/static/lib/css/neo.css
+++ b/common/static/lib/css/neo.css
@@ -1,3 +1,15 @@
+.markdownx-preview h1 {
+    font-size: 2.5em;
+}
+
+.markdownx-preview h2 {
+    font-size: 2.0em;
+}
+
+.markdownx-preview h3 {
+    font-size: 1.6em;
+}
+
 .collection-item-position-edit {
     float: right;
 }
diff --git a/games/templates/games/review_detail.html b/games/templates/games/review_detail.html
index 986fd881..fa36fd8f 100644
--- a/games/templates/games/review_detail.html
+++ b/games/templates/games/review_detail.html
@@ -24,6 +24,7 @@
+
diff --git a/movies/templates/movies/review_detail.html b/movies/templates/movies/review_detail.html
index 401c2ae8..bced5529 100644
--- a/movies/templates/movies/review_detail.html
+++ b/movies/templates/movies/review_detail.html
@@ -24,6 +24,7 @@
+
diff --git a/music/templates/music/album_review_detail.html b/music/templates/music/album_review_detail.html
index bb665184..4ff09227 100644
--- a/music/templates/music/album_review_detail.html
+++ b/music/templates/music/album_review_detail.html
@@ -24,6 +24,7 @@
+
diff --git a/music/templates/music/song_review_detail.html b/music/templates/music/song_review_detail.html
index 6096ae93..82a723d4 100644
--- a/music/templates/music/song_review_detail.html
+++ b/music/templates/music/song_review_detail.html
@@ -24,6 +24,7 @@
+
diff --git a/requirements.txt b/requirements.txt
index 6d0e8a10..148df4b7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,3 +19,4 @@ tqdm
 opencc
 dnspython
 typesense
+markdownify
diff --git a/users/models.py b/users/models.py
index c118724d..174f97b3 100644
--- a/users/models.py
+++ b/users/models.py
@@ -114,6 +114,7 @@ class Preference(models.Model):
         default=list,
     )
     export_status = models.JSONField(blank=True, null=True, encoder=DjangoJSONEncoder, default=dict)
+    import_status = models.JSONField(blank=True, null=True, encoder=DjangoJSONEncoder, default=dict)
     mastodon_publish_public = models.BooleanField(null=False, default=False)
     mastodon_append_tag = models.CharField(max_length=2048, default='')
diff --git a/users/templates/users/data.html b/users/templates/users/data.html
index 2250c690..fbb13044 100644
--- a/users/templates/users/data.html
+++ b/users/templates/users/data.html
@@ -145,6 +145,51 @@
+
+
+
+    {% trans '导入豆瓣评论数据' %}
+
+
+
+        {% csrf_token %}
+
+        豆伴(豆坟)备份导出的.xlsx文件:
+
+
+        可见性:
+
+
+
+
+        {% if import_status.douban_pending %}
+
+        {% else %}
+
+        {% endif %}
+
+        {% if import_status.douban_pending == 2 %}
+            正在等待
+        {% elif import_status.douban_total %}
+            {% if import_status.douban_pending == 1 %}
+                正在导入,目前已
+            {% else %}
+                上次共计
+            {% endif %}
+            处理{{ import_status.douban_total }}篇,其中已存在{{ import_status.douban_skipped }}篇,新增{{ import_status.douban_imported }}篇
+        {% endif %}
+
+        NeoDB中已经存在的标记、短评和评论不会被覆盖;匿名用户看不到的评论目前无法导入。
+
+
+
+
+
+
diff --git a/users/urls.py b/users/urls.py
index 5e0f1ea0..7e7279e2 100644
--- a/users/urls.py
+++ b/users/urls.py
@@ -9,6 +9,7 @@ urlpatterns = [
     path('reconnect/', reconnect, name='reconnect'),
     path('data/', data, name='data'),
     path('data/import_goodreads', import_goodreads, name='import_goodreads'),
+    path('data/import_douban', import_douban, name='import_douban'),
     path('data/export_reviews', export_reviews, name='export_reviews'),
     path('data/export_marks', export_marks, name='export_marks'),
     path('data/sync_mastodon', sync_mastodon, name='sync_mastodon'),
diff --git a/users/views.py b/users/views.py
index 9a85251b..a4776f3a 100644
--- a/users/views.py
+++ b/users/views.py
@@ -39,6 +39,7 @@ from games.models import GameMark, GameReview
 from music.models import AlbumMark, SongMark, AlbumReview, SongReview
 from collection.models import Collection
 from common.importers.goodreads import GoodreadsImporter
+from common.importers.douban import DoubanImporter
 
 
 # Views
@@ -1060,6 +1061,7 @@ def preferences(request):
 def data(request):
     return render(request, 'users/data.html', {
         'latest_task': request.user.user_synctasks.order_by("-id").first(),
+        'import_status': request.user.preference.import_status,
         'export_status': request.user.preference.export_status
     })
@@ -1152,7 +1154,18 @@ def import_goodreads(request):
     if request.method == 'POST':
         raw_url = request.POST.get('url')
         if GoodreadsImporter.import_from_url(raw_url, request.user):
-            messages.add_message(request, messages.INFO, _('开始后台导入。'))
+            messages.add_message(request, messages.INFO, _('链接已保存,等待后台导入。'))
         else:
             messages.add_message(request, messages.ERROR, _('无法识别链接。'))
     return redirect(reverse("users:data"))
+
+
+@login_required
+def import_douban(request):
+    if request.method == 'POST':
+        importer = DoubanImporter(request.user, request.POST.get('visibility'))
+        if importer.import_from_file(request.FILES['file']):
+            messages.add_message(request, messages.INFO, _('文件上传成功,等待后台导入。'))
+        else:
+            messages.add_message(request, messages.ERROR, _('无法识别文件。'))
+    return redirect(reverse("users:data"))