improve review import by guessing entity urls

Build a lookup table from the mark sheets while loading the export file, keyed by entity title and rating, and use it to resolve each review's subject URL locally; scrape the review page only when no match is found. Also track processed and skipped counts separately from the total.

This commit is contained in:
Your Name 2022-05-05 10:12:36 -04:00
parent 3175fab439
commit 45f1bdcd18
2 changed files with 106 additions and 42 deletions

View file

@@ -13,17 +13,19 @@ from user_messages import api as msg
import django_rq
from common.utils import GenerateDateUUIDMediaFilePath
import os
from books.models import BookReview, Book
from movies.models import MovieReview, Movie
from music.models import AlbumReview, Album
from games.models import GameReview, Game
from books.models import BookReview, Book, BookMark, BookTag
from movies.models import MovieReview, Movie, MovieMark, MovieTag
from music.models import AlbumReview, Album, AlbumMark, AlbumTag
from games.models import GameReview, Game, GameMark, GameTag
from common.scraper import DoubanAlbumScraper, DoubanBookScraper, DoubanGameScraper, DoubanMovieScraper
from PIL import Image
from io import BytesIO
import filetype
from common.models import MarkStatusEnum
logger = logging.getLogger(__name__)
tz_sh = pytz.timezone('Asia/Shanghai')
def fetch_remote_image(url):
@@ -48,7 +50,7 @@ def fetch_remote_image(url):
local_url = settings.MEDIA_URL + f
os.makedirs(os.path.dirname(file), exist_ok=True)
img.save(file)
print(f'remote image saved as {local_url}')
# print(f'remote image saved as {local_url}')
return local_url
except Exception:
print(f'unable to fetch remote image {url}')
@@ -57,6 +59,7 @@ def fetch_remote_image(url):
class DoubanImporter:
total = 0
processed = 0
skipped = 0
imported = 0
failed = []
@@ -73,6 +76,7 @@ class DoubanImporter:
self.user.preference.import_status['douban_file'] = self.file
self.user.preference.import_status['douban_visibility'] = self.visibility
self.user.preference.import_status['douban_total'] = self.total
self.user.preference.import_status['douban_processed'] = self.processed
self.user.preference.import_status['douban_skipped'] = self.skipped
self.user.preference.import_status['douban_imported'] = self.imported
self.user.preference.import_status['douban_failed'] = self.failed
@@ -96,46 +100,104 @@ class DoubanImporter:
# self.import_from_file_task(file, user, visibility)
return True
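# Douban export sheet name -> [mark status, scraper, entity model, mark model, tag model]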
mark_sheet_config = {
'想读': [MarkStatusEnum.WISH, DoubanBookScraper, Book, BookMark, BookTag],
'在读': [MarkStatusEnum.DO, DoubanBookScraper, Book, BookMark, BookTag],
'读过': [MarkStatusEnum.COLLECT, DoubanBookScraper, Book, BookMark, BookTag],
'想看': [MarkStatusEnum.WISH, DoubanMovieScraper, Movie, MovieMark, MovieTag],
'在看': [MarkStatusEnum.DO, DoubanMovieScraper, Movie, MovieMark, MovieTag],
'看过': [MarkStatusEnum.COLLECT, DoubanMovieScraper, Movie, MovieMark, MovieTag],
'想听': [MarkStatusEnum.WISH, DoubanAlbumScraper, Album, AlbumMark, AlbumTag],
'在听': [MarkStatusEnum.DO, DoubanAlbumScraper, Album, AlbumMark, AlbumTag],
'听过': [MarkStatusEnum.COLLECT, DoubanAlbumScraper, Album, AlbumMark, AlbumTag],
'想玩': [MarkStatusEnum.WISH, DoubanGameScraper, Game, GameMark, GameTag],
'在玩': [MarkStatusEnum.DO, DoubanGameScraper, Game, GameMark, GameTag],
'玩过': [MarkStatusEnum.COLLECT, DoubanGameScraper, Game, GameMark, GameTag],
}
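# Douban export sheet name -> [scraper, entity model, review model]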
review_sheet_config = {
'书评': [DoubanBookScraper, Book, BookReview],
'影评': [DoubanMovieScraper, Movie, MovieReview],
'乐评': [DoubanAlbumScraper, Album, AlbumReview],
'游戏评论&攻略': [DoubanGameScraper, Game, GameReview],
}
mark_data = {}
review_data = {}
entity_lookup = {}
def load_sheets(self):
f = open(self.file, 'rb')
wb = openpyxl.load_workbook(f, read_only=True, data_only=True, keep_links=False)
for data, config in [(self.mark_data, self.mark_sheet_config), (self.review_data, self.review_sheet_config)]:
for name in config:
data[name] = []
if name in wb:
print(f'{self.user} parsing {name}')
for row in wb[name].iter_rows(min_row=2, values_only=True):
cells = list(row)
if len(cells) > 6:
data[name].append(cells)
for sheet in self.mark_data.values():
for cells in sheet:
# entity_lookup["title|rating"] = [(url, time), ...]
k = f'{cells[0]}|{cells[5]}'
v = (cells[3], cells[4])
if k in self.entity_lookup:
self.entity_lookup[k].append(v)
else:
self.entity_lookup[k] = [v]
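# e.g. a hypothetical entry: entity_lookup['三体|5'] = [('https://book.douban.com/subject/123456/', '2022-01-01 08:00:00')]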
# total counts review rows only; mark rows are used just to build entity_lookup
self.total = sum(len(rows) for rows in self.review_data.values())
def guess_entity_url(self, title, rating, timestamp):
k = f'{title}|{rating}'
if k not in self.entity_lookup:
return None
v = self.entity_lookup[k]
# several marks may share a title and rating: prefer the one closest in time to the review
if len(v) > 1 and timestamp is not None:
v.sort(key=lambda c: abs(timestamp - tz_sh.localize(datetime.strptime(c[1], "%Y-%m-%d %H:%M:%S"))))
return v[0][0]
# for sheet in self.mark_data.values():
# for cells in sheet:
# if cells[0] == title and cells[5] == rating:
# return cells[3]
def import_from_file_task(self):
print(f'{self.user} import start')
msg.info(self.user, '开始导入豆瓣评论')
self.update_user_import_status(1)
f = open(self.file, 'rb')
wb = openpyxl.load_workbook(f, read_only=True, data_only=True, keep_links=False)
self.import_sheet(wb['书评'] if '书评' in wb else None, DoubanBookScraper, Book, BookReview)
self.import_sheet(wb['影评'] if '影评' in wb else None, DoubanMovieScraper, Movie, MovieReview)
self.import_sheet(wb['乐评'] if '乐评' in wb else None, DoubanAlbumScraper, Album, AlbumReview)
self.import_sheet(wb['游戏评论&攻略'] if '游戏评论&攻略' in wb else None, DoubanGameScraper, Game, GameReview)
self.load_sheets()
print(f'{self.user} sheet loaded, {self.total} lines total')
self.update_user_import_status(1)
for name, param in self.review_sheet_config.items():
self.import_review_sheet(self.review_data[name], *param)
self.update_user_import_status(0)
msg.success(self.user, f'豆瓣评论导入完成,共处理{self.total}篇,已存在{self.skipped}篇,新增{self.imported}篇。')
if len(self.failed):
msg.error(self.user, f'豆瓣评论导入时未能处理以下网址:\n{" , ".join(self.failed)}')
def import_sheet(self, worksheet, scraper, entity_class, review_class):
def import_review_sheet(self, worksheet, scraper, entity_class, review_class): # worksheet is a list of rows preloaded by load_sheets()
prefix = f'{self.user} |'
if not worksheet: # empty or missing sheet
print(f'{prefix} {review_class.__name__} empty sheet')
return
for row in worksheet.iter_rows(min_row=2, values_only=True):
cells = [cell for cell in row]
for cells in worksheet:
if len(cells) < 7: # need at least 7 columns, content is cells[6]
continue
title = cells[0]
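# strip the 《 》 quotation marks Douban wraps around entity titles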
entity_title = re.sub('^《', '', re.sub('》$', '', cells[1]))
review_url = cells[2]
time = cells[3]
rating = cells[4]
content = cells[6]
self.total += 1
self.processed += 1
if time:
time = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
tz = pytz.timezone('Asia/Shanghai')
time = time.replace(tzinfo=tz)
# tz_sh.localize() attaches the zone correctly; replace(tzinfo=tz_sh) would pick pytz's LMT offset
time = tz_sh.localize(datetime.strptime(time, "%Y-%m-%d %H:%M:%S"))
else:
time = None
if not content:
content = ""
if not title:
title = ""
r = self.import_review(title, review_url, content, time, scraper, entity_class, review_class)
r = self.import_review(entity_title, rating, title, review_url, content, time, scraper, entity_class, review_class)
if r == 1:
self.imported += 1
elif r == 2:
@@ -144,30 +206,31 @@ class DoubanImporter:
self.failed.append(review_url)
self.update_user_import_status(1)
def import_review(self, title, review_url, content, time, scraper, entity_class, review_class):
def import_review(self, entity_title, rating, title, review_url, content, time, scraper, entity_class, review_class):
# return 1: done / 2: skipped / None: failed
prefix = f'{self.user} |'
url = None
print(f'{prefix} fetching {review_url}')
try:
if settings.SCRAPESTACK_KEY is not None:
_review_url = f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={review_url}'
else:
_review_url = review_url
r = requests.get(_review_url, timeout=settings.SCRAPING_TIMEOUT)
if r.status_code != 200:
print(f'{prefix} fetching error {review_url} {r.status_code}')
url = self.guess_entity_url(entity_title, rating, time)
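# fall back to scraping the review page only when no matching mark was found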
if url is None:
print(f'{prefix} fetching {review_url}')
try:
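# route the request through the ScrapeStack proxy when an API key is configured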
if settings.SCRAPESTACK_KEY is not None:
_review_url = f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={review_url}'
else:
_review_url = review_url
r = requests.get(_review_url, timeout=settings.SCRAPING_TIMEOUT)
if r.status_code != 200:
print(f'{prefix} fetching error {review_url} {r.status_code}')
return
h = html.fromstring(r.content.decode('utf-8'))
for u in h.xpath("//header[@class='main-hd']/a/@href"):
if '.douban.com/subject/' in u:
url = u
if not url:
print(f'{prefix} fetching error {review_url} unable to locate entity url')
return
except Exception:
print(f'{prefix} fetching exception {review_url}')
return
h = html.fromstring(r.content.decode('utf-8'))
for u in h.xpath("//header[@class='main-hd']/a/@href"):
if '.douban.com/subject/' in u:
url = u
if not url:
print(f'{prefix} fetching error {review_url} unable to locate entity url')
return
except Exception:
print(f'{prefix} fetching exception {review_url}')
return
try:
entity = entity_class.objects.get(source_url=url)
print(f'{prefix} matched {url}')

View file

@@ -176,10 +176,11 @@
{% elif import_status.douban_pending == 1 %}
正在导入
{% if import_status.douban_total %}
目前已处理{{ import_status.douban_total }}篇,其中已存在{{ import_status.douban_skipped }}篇,新增{{ import_status.douban_imported }}篇
共{{ import_status.douban_total }}篇,目前已处理{{ import_status.douban_processed }}篇,其中已存在{{ import_status.douban_skipped }}篇,新增{{ import_status.douban_imported }}篇
{% endif %}
{% elif import_status.douban_file %}
上次共计处理{{ import_status.douban_total }}篇,其中已存在{{ import_status.douban_skipped }}篇,新增{{ import_status.douban_imported }}篇
上次结果
共计{{ import_status.douban_total }}篇,处理{{ import_status.douban_processed }}篇,其中已存在{{ import_status.douban_skipped }}篇,新增{{ import_status.douban_imported }}篇
{% endif %}
</div>
<div>