lib.itmens/journal/importers/douban.py


import os
import re
from datetime import datetime

import openpyxl
import pytz
from django.conf import settings
from loguru import logger
from markdownify import markdownify as md

from catalog.common import *
from catalog.common.downloaders import *
from catalog.models import *
from catalog.sites import DoubanBook, DoubanDrama, DoubanGame, DoubanMovie, DoubanMusic
from catalog.sites.douban import DoubanDownloader
from common.utils import GenerateDateUUIDMediaFilePath
from journal.models import *
from users.models import Task

_tz_sh = pytz.timezone("Asia/Shanghai")
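

# download a remote image and save a local copy under MEDIA_ROOT;
# return the local URL, or the original URL if fetching fails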
def _fetch_remote_image(url):
    try:
        logger.info(f"fetching remote image {url}")
        imgdl = ProxiedImageDownloader(url)
        raw_img = imgdl.download().content
        ext = imgdl.extention  # "extention" [sic] matches the downloader's attribute spelling
        f = GenerateDateUUIDMediaFilePath(f"x.{ext}", settings.MARKDOWNX_MEDIA_PATH)
        file = settings.MEDIA_ROOT + "/" + f
        local_url = settings.MEDIA_URL + f
        os.makedirs(os.path.dirname(file), exist_ok=True)
        with open(file, "wb") as binary_file:
            binary_file.write(raw_img)
        # logger.info(f'remote image saved as {local_url}')
        return local_url
    except Exception as e:
        logger.error("unable to fetch image", extra={"url": url, "exception": e})
        return url


class DoubanImporter(Task):
    class Meta:
        app_label = "journal"  # workaround bug in TypedModel

    TaskQueue = "import"
    DefaultMetadata = {
        "total": 0,
        "processed": 0,
        "skipped": 0,
        "imported": 0,
        "failed": 0,
        "mode": 0,
        "visibility": 0,
        "failed_urls": [],
        "file": None,
    }
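
    # "mode" and "visibility" come from the upload form: import_mark only respects
    # existing marks when mode == 0, and import_review only skips items that
    # already have a review when mode == 1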
    mark_sheet_config = {
        # books: want to read / reading / have read
        "想读": [ShelfType.WISHLIST],
        "在读": [ShelfType.PROGRESS],
        "读过": [ShelfType.COMPLETE],
        # movies & tv: want to watch / watching / watched
        "想看": [ShelfType.WISHLIST],
        "在看": [ShelfType.PROGRESS],
        "看过": [ShelfType.COMPLETE],
        # music: want to listen / listening / listened
        "想听": [ShelfType.WISHLIST],
        "在听": [ShelfType.PROGRESS],
        "听过": [ShelfType.COMPLETE],
        # games: want to play / playing / played
        "想玩": [ShelfType.WISHLIST],
        "在玩": [ShelfType.PROGRESS],
        "玩过": [ShelfType.COMPLETE],
        # stage plays: want to see / have seen
        "想看的舞台剧": [ShelfType.WISHLIST],
        "看过的舞台剧": [ShelfType.COMPLETE],
    }
    # book / movie / music / performance / game reviews
    review_sheet_config = {
        "书评": [Edition],
        "影评": [Movie],
        "乐评": [Album],
        "剧评": [Performance],
        "游戏评论&攻略": [Game],
    }

@classmethod
def validate_file(cls, uploaded_file):
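        """Return True if the workbook contains at least one known sheet name."""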
try:
wb = openpyxl.open(
uploaded_file, read_only=True, data_only=True, keep_links=False
)
sheets = cls.mark_sheet_config.keys() | cls.review_sheet_config.keys()
for name in sheets:
if name in wb:
return True
except Exception as e:
logger.error(
f"unable to validate excel file {uploaded_file}", extra={"exception": e}
)
return False

    # populated per run by load_sheets()
    mark_data = {}
    review_data = {}
    entity_lookup = {}

    def load_sheets(self):
        """Load data into mark_data / review_data / entity_lookup"""
        with open(self.metadata["file"], "rb") as f:
            wb = openpyxl.load_workbook(
                f, read_only=True, data_only=True, keep_links=False
            )
            for data, config in [
                (self.mark_data, self.mark_sheet_config),
                (self.review_data, self.review_sheet_config),
            ]:
                for name in config:
                    data[name] = []
                    if name in wb:
                        logger.info(f"{self.user} parsing {name}")
                        for row in wb[name].iter_rows(min_row=2, values_only=True):
                            cells = [cell for cell in row]
                            if len(cells) > 6 and cells[0]:
                                data[name].append(cells)
            wb.close()
        for sheet in self.mark_data.values():
            for cells in sheet:
                # entity_lookup["title|rating"] = [(url, time), ...]
                k = f"{cells[0]}|{cells[5]}"
                v = (cells[3], cells[4])
                if k in self.entity_lookup:
                    self.entity_lookup[k].append(v)
                else:
                    self.entity_lookup[k] = [v]
        self.metadata["total"] = sum(len(a) for a in self.mark_data.values())
        self.metadata["total"] += sum(len(a) for a in self.review_data.values())
        self.save()

    def guess_entity_url(self, title, rating, timestamp):
        k = f"{title}|{rating}"
        if k not in self.entity_lookup:
            return None
        v = self.entity_lookup[k]
        if len(v) > 1:
            # multiple items share this title and rating; pick the mark whose
            # time is closest to the review's timestamp
            v.sort(
                key=lambda c: abs(
                    timestamp
                    - (
                        datetime.strptime(c[1], "%Y-%m-%d %H:%M:%S")
                        if isinstance(c[1], str)
                        else c[1]
                    ).replace(tzinfo=_tz_sh)
                )
            )
        return v[0][0]
        # for sheet in self.mark_data.values():
        #     for cells in sheet:
        #         if cells[0] == title and cells[5] == rating:
        #             return cells[3]

    def run(self):
        logger.info(f"{self.user} import start")
        self.load_sheets()
        logger.info(f"{self.user} sheet loaded, {self.metadata['total']} lines total")
        for name, param in self.mark_sheet_config.items():
            self.import_mark_sheet(self.mark_data[name], param[0], name)
        for name, param in self.review_sheet_config.items():
            self.import_review_sheet(self.review_data[name], name)
        # "Douban marks and reviews imported: {total} processed,
        # {skipped} already present, {imported} added."
        self.message = f"豆瓣标记和评论导入完成,共处理{self.metadata['total']}篇,已存在{self.metadata['skipped']}篇,新增{self.metadata['imported']}篇。"
        if len(self.metadata["failed_urls"]) > 0:
            # "{n} URLs could not be processed during import."
            self.message += f"导入时未能处理{len(self.metadata['failed_urls'])}个网址。"
        self.save()

    def import_mark_sheet(self, worksheet, shelf_type, sheet_name):
        prefix = f"{self.user} {sheet_name}|"
        if worksheet is None:  # or worksheet.max_row < 2:
            logger.warning(f"{prefix} empty sheet")
            return
        for cells in worksheet:
            if len(cells) < 6:
                continue
            # title = cells[0] or ""
            url = cells[3]
            time = cells[4]
            rating = cells[5]
            try:
                rating_grade = int(rating) * 2 if rating else None
            except Exception:
                rating_grade = None
            tags = cells[6] if len(cells) >= 7 else ""
            try:
                tags = tags.split(",") if tags else []
            except Exception:
                tags = []
            comment = cells[7] if len(cells) >= 8 else None
            self.metadata["processed"] += 1
            try:
                if isinstance(time, str):
                    time = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
                time = time.replace(tzinfo=_tz_sh)
            except Exception:
                time = None
            r = self.import_mark(url, shelf_type, comment, rating_grade, tags, time)
            if r == 1:
                self.metadata["imported"] += 1
            elif r == 2:
                self.metadata["skipped"] += 1
            self.save()

    def import_mark(self, url, shelf_type, comment, rating_grade, tags, time):
        """
        Import one mark: return 1: done / 2: skipped / None: failed
        """
        item = self.get_item_by_url(url)
        if not item:
            logger.warning(f"{self.user} | match/fetch {url} failed")
            return
        mark = Mark(self.user.identity, item)
        if self.metadata["mode"] == 0 and (
            mark.shelf_type == shelf_type
            or mark.shelf_type == ShelfType.COMPLETE
            or (
                mark.shelf_type in [ShelfType.PROGRESS, ShelfType.DROPPED]
                and shelf_type == ShelfType.WISHLIST
            )
        ):
            # mode 0: the existing mark is already at the same or a later stage, skip
            print("-", end="", flush=True)
            return 2
        mark.update(
            shelf_type,
            comment,
            rating_grade,
            tags,
            self.metadata["visibility"],
            created_time=time,
        )
        print("+", end="", flush=True)
        return 1

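    # review sheet columns (0-based): 0 review title, 1 entity title, 2 review url,
    # 3 time, 4 rating, 6 content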
    def import_review_sheet(self, worksheet, sheet_name):
        prefix = f"{self.user} {sheet_name}|"
        if worksheet is None:  # or worksheet.max_row < 2:
            logger.warning(f"{prefix} empty sheet")
            return
        for cells in worksheet:
            if len(cells) < 6:
                continue
            title = cells[0]
            # strip the 《》 brackets Douban puts around entity titles
            entity_title = (
                re.sub("^《", "", re.sub("》$", "", cells[1])) if cells[1] else ""
            )
            review_url = cells[2]
            time = cells[3]
            rating = cells[4]
            content = cells[6]
            self.metadata["processed"] += 1
            if time:
                if isinstance(time, str):
                    time = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
                time = time.replace(tzinfo=_tz_sh)
            else:
                time = None
            if not content:
                content = ""
            if not title:
                title = ""
            r = self.import_review(
                entity_title, rating, title, review_url, content, time
            )
            if r == 1:
                self.metadata["imported"] += 1
            elif r == 2:
                self.metadata["skipped"] += 1
            else:
                self.metadata["failed_urls"].append(review_url)
            self.save()

    def get_item_by_url(self, url):
        item = None
        if not url:
            logger.warning("URL empty")
            return None
        try:
            site = SiteManager.get_site_by_url(url)
            if not site:
                raise ValueError(f"URL unrecognized {url}")
            item = site.get_item()
            if not item:
                logger.info(f"fetching {url}")
                site.get_resource_ready()
                item = site.get_item()
            else:
                # logger.info(f"matched {url}")
                print(".", end="", flush=True)
        except DownloadError as e:
            if e.response_type == RESPONSE_CENSORSHIP:
                # avoid flooding the error log, since there are many of these
                logger.warning(f"fetching error: {url}", extra={"exception": e})
            else:
                logger.error(f"fetching error: {url}", extra={"exception": e})
        except Exception as e:
            logger.error(f"fetching error: {url}", extra={"exception": e})
        if item is None:
            self.metadata["failed_urls"].append(str(url))
        return item

    def is_douban_item_url(self, url):
        for cls in [
            DoubanBook,
            DoubanDrama,
            DoubanMovie,
            DoubanMusic,
            DoubanGame,
        ]:
            if cls.url_to_id(url):
                return True
        return False

    def import_review(self, entity_title, rating, title, review_url, content, time):
        """
        Import one review: return 1: done / 2: skipped / None: failed
        """
        prefix = f"{self.user} |"
        url = self.guess_entity_url(entity_title, rating, time)
        if url is None:
            logger.info(f"{prefix} fetching review {review_url}")
            try:
                h = DoubanDownloader(review_url).download().html()
                urls = h.xpath("//header[@class='main-hd']/a/@href")
                for u in urls:  # type:ignore
                    if self.is_douban_item_url(u):
                        url = u
                if not url:
                    logger.warning(
                        f"{prefix} fetching error {review_url} unable to locate entity url"
                    )
                    return
            except Exception:
                logger.error(f"{prefix} fetching review exception {review_url}")
                return
        item = self.get_item_by_url(url)
        if not item:
            logger.warning(f"{prefix} match/fetch {url} failed")
            return
        if (
            self.metadata["mode"] == 1
            and Review.objects.filter(owner=self.user.identity, item=item).exists()
        ):
            return 2
        # normalize Douban's HTML before converting to markdown:
        # bold spans -> <b>, image captions -> italics, line breaks after images
        content = re.sub(
            r'<span style="font-weight: bold;">([^<]+)</span>', r"<b>\1</b>", content
        )
        content = re.sub(r"(<img [^>]+>)", r"\1<br>", content)
        content = re.sub(
            r'<div class="image-caption">([^<]+)</div>', r"<br><i>\1</i><br>", content
        )
        content = md(content)
        # mirror each remote image referenced in the markdown to local storage
        content = re.sub(
            r"(?<=!\[\]\()([^)]+)(?=\))", lambda x: _fetch_remote_image(x[1]), content
        )
        params = {
            "created_time": time,
            "edited_time": time,
            "title": title,
            "body": content,
            "visibility": self.metadata["visibility"],
        }
        try:
            Review.objects.update_or_create(
                owner=self.user.identity, item=item, defaults=params
            )
        except Exception:
            logger.warning(f"{prefix} multiple reviews for {review_url}, updating latest")
            r = (
                Review.objects.filter(owner=self.user.identity, item=item)
                .order_by("-created_time")
                .first()
            )
            if r:
                Review.objects.filter(pk=r.pk).update(**params)
        return 1