lib.itmens/journal/importers/goodreads.py
2023-08-11 18:17:34 -04:00

254 lines
11 KiB
Python

import re
from datetime import datetime
import django_rq
from auditlog.context import set_actor
from django.utils import timezone
from django.utils.timezone import make_aware
from user_messages import api as msg
from catalog.common import *
from catalog.common.downloaders import *
from catalog.models import *
from journal.models import *
# URL patterns for the three kinds of Goodreads pages this module accepts.
# The dots of the host name are escaped so that only the literal host
# "www.goodreads.com" matches; an unescaped "." is a regex wildcard and would
# also accept look-alike hosts such as "wwwXgoodreads.com".
re_list = r"^https://www\.goodreads\.com/list/show/\d+"
# Matches up to (and including) the value of the "shelf" query parameter.
re_shelf = r"^https://www\.goodreads\.com/review/list/\d+[^?]*\?shelf=[^&]+"
# Group 1 captures the numeric user id.
re_profile = r"^https://www\.goodreads\.com/user/show/(\d+)"

# Text found in the title attribute of a shelf row's rating cell, mapped to
# the even-numbered score (2-10) stored with the imported mark.
gr_rating = {
    "did not like it": 2,
    "it was ok": 4,
    "liked it": 6,
    "really liked it": 8,
    "it was amazing": 10,
}
class GoodreadsImporter:
    """Import Goodreads lists, shelves and user profiles for a local user.

    The class holds no state; every method is a classmethod.  Progress and
    errors are written with ``print`` and the final outcome is sent to the
    user through ``msg.success``.
    """

    # strptime formats tried in order by ``_parse_date``: abbreviated month
    # name first, then the full month name.
    _DATE_FORMATS = ("%b %d %Y", "%B %d %Y")

    @classmethod
    def import_from_url(cls, raw_url, user):
        """Enqueue an import job when ``raw_url`` is a supported Goodreads URL.

        Returns:
            True if the URL matched the profile, shelf or list pattern and a
            job was put on the "import" queue, False otherwise.
        """
        patterns = (re_profile, re_shelf, re_list)
        if not any(re.match(pattern, raw_url) for pattern in patterns):
            return False
        django_rq.get_queue("import").enqueue(cls.import_from_url_task, raw_url, user)
        return True

    @classmethod
    def import_from_url_task(cls, url, user):
        """Perform the import of ``url`` on behalf of ``user``.

        A shelf or list URL becomes a new ``Collection`` owned by ``user``.
        A profile URL imports the to-read, currently-reading and read shelves
        as marks, skipping any book whose existing mark is identical or is
        already further along (complete beats progress beats wishlist).

        Entries for which no catalog item could be resolved are skipped and
        not counted in the total reported to the user.
        """
        match_list = re.match(re_list, url)
        match_shelf = re.match(re_shelf, url)
        match_profile = re.match(re_profile, url)
        total = 0
        visibility = user.preference.default_visibility
        with set_actor(user):
            shelf = None
            if match_shelf:
                shelf = cls.parse_shelf(match_shelf[0], user)
            elif match_list:
                shelf = cls.parse_list(match_list[0], user)
            if shelf:
                if shelf["title"] and shelf["books"]:
                    collection = Collection.objects.create(
                        title=shelf["title"],
                        brief=shelf["description"]
                        + "\n\nImported from [Goodreads]("
                        + url
                        + ")",
                        owner=user,
                    )
                    for book in shelf["books"]:
                        if book["book"] is None:
                            print(f'Skip {book["url"]} bc no item was found')
                            continue
                        collection.append_item(book["book"], note=book["review"])
                        total += 1
                    collection.save()
                msg.success(user, f'成功从Goodreads导入包含{total}本书的收藏单{shelf["title"]}')
            elif match_profile:
                uid = match_profile[1]
                shelves = {
                    ShelfType.WISHLIST: f"https://www.goodreads.com/review/list/{uid}?shelf=to-read",
                    ShelfType.PROGRESS: f"https://www.goodreads.com/review/list/{uid}?shelf=currently-reading",
                    ShelfType.COMPLETE: f"https://www.goodreads.com/review/list/{uid}?shelf=read",
                }
                for shelf_type, shelf_url in shelves.items():
                    shelf = cls.parse_shelf(shelf_url, user)
                    for book in shelf["books"]:
                        if book["book"] is None:
                            print(f'Skip {book["url"]} bc no item was found')
                            continue
                        mark = Mark(user, book["book"])
                        if (
                            # nothing would change
                            (
                                mark.shelf_type == shelf_type
                                and mark.comment_text == book["review"]
                            )
                            # never move a completed book back to another shelf
                            or (
                                mark.shelf_type == ShelfType.COMPLETE
                                and shelf_type != ShelfType.COMPLETE
                            )
                            # never move an in-progress book back to the wishlist
                            or (
                                mark.shelf_type == ShelfType.PROGRESS
                                and shelf_type == ShelfType.WISHLIST
                            )
                        ):
                            print(
                                f'Skip {shelf_type}/{book["book"]} bc it was marked {mark.shelf_type}'
                            )
                        else:
                            mark.update(
                                shelf_type,
                                book["review"],
                                book["rating"],
                                visibility=visibility,
                                created_time=book["last_updated"] or timezone.now(),
                            )
                            total += 1
                msg.success(user, f"成功从Goodreads用户主页导入{total}个标记。")

    @classmethod
    def get_book(cls, url, user):
        """Resolve a Goodreads book URL to a catalog item.

        Asks ``SiteManager`` for the site handling ``url``, uses the item the
        site already knows, and otherwise falls back to the item attached to
        ``site.get_resource_ready()``.

        Args:
            url: Goodreads book URL.
            user: Not used here; kept for interface compatibility.

        Returns:
            The item, or None when no site handles the URL or no item could
            be obtained.
        """
        site = SiteManager.get_site_by_url(url)
        if not site:
            return None
        book = site.get_item()
        if not book:
            resource = site.get_resource_ready()
            if resource and resource.item:
                book = resource.item
        return book

    @classmethod
    def _parse_date(cls, text):
        """Return a naive datetime for the first "Month D, YYYY" in ``text``.

        Returns None when ``text`` holds no such date or the date is invalid,
        so that one odd date cannot abort an import.
        """
        matched = re.search(r"(\w+)\s+(\d+),\s+(\d+)", text)
        if not matched:
            return None
        candidate = " ".join(matched.groups())
        for date_format in cls._DATE_FORMATS:
            try:
                return datetime.strptime(candidate, date_format)
            except ValueError:
                continue
        return None

    @classmethod
    def _parse_shelf_row(cls, cell, user):
        """Build the book dict for one row of a shelf table.

        Returns None when the row has no book link or the book lookup raised.
        """
        title_href = cell.xpath(".//td[@class='field title']//a/@href")
        if not title_href:
            print("Shelf row without book link, ignored")
            return None
        url_book = "https://www.goodreads.com" + title_href[0].strip()

        rating_elem = cell.xpath(".//td[@class='field rating']//span/@title")
        rating = gr_rating.get(rating_elem[0].strip()) if rating_elem else None

        # Date from the table row; a date on the review page, if any,
        # overrides it below.
        last_updated = None
        for text in cell.xpath(".//td[@class='field date_added']//span/text()"):
            parsed = cls._parse_date(text)
            if parsed:
                last_updated = make_aware(parsed)

        # ``review`` stays None when the review page could not be loaded and
        # becomes "" when the page loaded but has no review body.
        review = None
        review_href = cell.xpath(".//td[@class='field actions']//a/@href")
        if review_href:
            url_review = "https://www.goodreads.com" + review_href[0].strip()
            try:
                c2 = BasicDownloader(url_review).download().html()
                review_elem = c2.xpath("//div[@itemprop='reviewBody']/text()")
                review = (
                    "\n".join(p.strip() for p in review_elem)  # type:ignore
                    if review_elem
                    else ""
                )
                timeline = c2.xpath("//div[@class='readingTimeline__text']/text()")
                for text in timeline:  # type:ignore
                    parsed = cls._parse_date(text)
                    if parsed:
                        last_updated = make_aware(parsed)
            except Exception:
                print(f"Error loading/parsing review{url_review}, ignored")

        try:
            book = cls.get_book(url_book, user)
        except Exception as e:
            # likely just download error
            print(f"Error adding {url_book} {e}")
            return None
        return {
            "url": url_book,
            "book": book,
            "rating": rating,
            "review": review,
            "last_updated": last_updated,
        }

    @classmethod
    def parse_shelf(cls, url, user):
        """Scrape a Goodreads shelf in table view, following "next" links.

        Paging stops at the first page that fails to download or has no shelf
        title; whatever was collected up to then is still returned.

        Returns:
            ``{"title": str, "description": "", "books": [...]}`` where each
            book is a dict with the keys ``url``, ``book``, ``rating``,
            ``review`` and ``last_updated``.  ``book`` may be None when the
            URL could not be resolved to an item.
        """
        title = ""
        books = []
        url_shelf = url + "&view=table"
        while url_shelf:
            print(f"Shelf loading {url_shelf}")
            try:
                content = BasicDownloader(url_shelf).download().html()
                title_elem = content.xpath("//span[@class='h1Shelf']/text()")
                if not title_elem:
                    print(f"Shelf parsing error {url_shelf}")
                    break
                title = title_elem[0].strip()  # type:ignore
                print("Shelf title: " + title)
            except Exception:
                print(f"Shelf loading/parsing error {url_shelf}")
                break
            cells = content.xpath("//tbody[@id='booksBody']/tr")
            for cell in cells:  # type:ignore
                entry = cls._parse_shelf_row(cell, user)
                if entry is not None:
                    books.append(entry)
            next_elem = content.xpath("//a[@class='next_page']/@href")
            url_shelf = (
                f"https://www.goodreads.com{next_elem[0].strip()}"  # type:ignore
                if next_elem
                else None
            )
        return {"title": title, "description": "", "books": books}

    @classmethod
    def parse_list(cls, url, user):
        """Scrape a Goodreads list, following "next" links.

        Paging stops at the first page that fails to download or has no list
        title; whatever was collected up to then is still returned.

        Returns:
            ``{"title": str, "description": str, "books": [...]}`` where each
            book is a dict with the keys ``url``, ``book`` and ``review``
            (always "").  ``book`` may be None when the URL could not be
            resolved to an item.
        """
        title = ""
        description = ""
        books = []
        url_shelf = url
        while url_shelf:
            print(f"List loading {url_shelf}")
            try:
                content = BasicDownloader(url_shelf).download().html()
            except Exception:
                # Same best-effort policy as parse_shelf: keep what we have.
                print(f"List loading error {url_shelf}")
                break
            title_elem = content.xpath('//h1[@class="gr-h1 gr-h1--serif"]/text()')
            if not title_elem:
                print(f"List parsing error {url_shelf}")
                break
            title = title_elem[0].strip()  # type:ignore
            desc_elem = content.xpath('//div[@class="mediumText"]/text()')
            if desc_elem:
                # A list without a description must not abort the import.
                description = desc_elem[0].strip()  # type:ignore
            print("List title: " + title)
            links = content.xpath('//a[@class="bookTitle"]/@href')
            for link in links:  # type:ignore
                url_book = "https://www.goodreads.com" + link
                try:
                    book = cls.get_book(url_book, user)
                except Exception:
                    # likely just download error
                    print("Error adding " + url_book)
                    continue
                books.append(
                    {
                        "url": url_book,
                        "book": book,
                        "review": "",
                    }
                )
            next_elem = content.xpath("//a[@class='next_page']/@href")
            url_shelf = (
                f"https://www.goodreads.com{next_elem[0].strip()}"  # type:ignore
                if next_elem
                else None
            )
        return {"title": title, "description": description, "books": books}