254 lines
11 KiB
Python
254 lines
11 KiB
Python
import re
|
|
from datetime import datetime
|
|
|
|
import django_rq
|
|
from auditlog.context import set_actor
|
|
from django.utils import timezone
|
|
from django.utils.timezone import make_aware
|
|
from user_messages import api as msg
|
|
|
|
from catalog.common import *
|
|
from catalog.common.downloaders import *
|
|
from catalog.models import *
|
|
from journal.models import *
|
|
|
|
# URL patterns accepted by the importer. The dots in the host name are escaped
# so that only www.goodreads.com itself matches; an unescaped "." would also
# accept look-alike hosts (e.g. "wwwxgoodreads.com") whose URL is later fetched.
re_list = r"^https://www\.goodreads\.com/list/show/\d+"
re_shelf = r"^https://www\.goodreads\.com/review/list/\d+[^?]*\?shelf=[^&]+"
# Group 1 captures the numeric Goodreads user id.
re_profile = r"^https://www\.goodreads\.com/user/show/(\d+)"

# Text of the Goodreads star-rating tooltip mapped to the numeric rating
# stored with an imported mark.
gr_rating = {
    "did not like it": 2,
    "it was ok": 4,
    "liked it": 6,
    "really liked it": 8,
    "it was amazing": 10,
}
|
|
|
|
|
|
class GoodreadsImporter:
    """Import Goodreads lists, shelves and user profiles for a local user.

    A list or shelf URL is turned into a ``Collection``; a profile URL has its
    "to-read", "currently-reading" and "read" shelves turned into marks.
    Pages are scraped with ``BasicDownloader`` and XPath queries.
    """

    _BASE_URL = "https://www.goodreads.com"

    # The shelf table was parsed with abbreviated month names and the review
    # page with full ones. Both are tried everywhere so that an unexpected
    # spelling cannot abort an import with a ValueError.
    _DATE_FORMATS = ("%b %d %Y", "%B %d %Y")

    @classmethod
    def import_from_url(cls, raw_url, user):
        """Queue an import job if ``raw_url`` is a supported Goodreads URL.

        Args:
            raw_url: URL string supplied by the user.
            user: user the imported data will belong to.

        Returns:
            True when the URL matched one of the known patterns and a job was
            put on the "import" queue, False otherwise.
        """
        patterns = (re_profile, re_shelf, re_list)
        if not any(re.match(pattern, raw_url) for pattern in patterns):
            return False
        django_rq.get_queue("import").enqueue(cls.import_from_url_task, raw_url, user)
        return True

    @classmethod
    def import_from_url_task(cls, url, user):
        """Run the import for ``url`` on behalf of ``user`` (queue worker entry).

        Shelf and list URLs create a collection; profile URLs create or update
        marks. A success message is sent to the user when the work is done.
        """
        match_list = re.match(re_list, url)
        match_shelf = re.match(re_shelf, url)
        match_profile = re.match(re_profile, url)
        visibility = user.preference.default_visibility
        with set_actor(user):
            shelf = None
            if match_shelf:
                shelf = cls.parse_shelf(match_shelf[0], user)
            elif match_list:
                shelf = cls.parse_list(match_list[0], user)
            if shelf:
                total = cls._import_collection(shelf, url, user)
                # NOTE(review): indentation was lost in the reviewed source;
                # the message is assumed to be sent even when nothing was
                # imported (total == 0) - confirm against the original file.
                msg.success(user, f'成功从Goodreads导入包含{total}本书的收藏单{shelf["title"]}。')
            elif match_profile:
                total = cls._import_profile(match_profile[1], user, visibility)
                msg.success(user, f"成功从Goodreads用户主页导入{total}个标记。")

    @classmethod
    def _import_collection(cls, shelf, url, user):
        """Create a collection from a parsed shelf/list; return the item count."""
        if not (shelf["title"] and shelf["books"]):
            return 0
        collection = Collection.objects.create(
            title=shelf["title"],
            brief=shelf["description"] + "\n\nImported from [Goodreads](" + url + ")",
            owner=user,
        )
        total = 0
        for entry in shelf["books"]:
            collection.append_item(entry["book"], note=entry["review"])
            total += 1
        collection.save()
        return total

    @classmethod
    def _import_profile(cls, uid, user, visibility):
        """Import the three standard shelves of Goodreads user ``uid`` as marks.

        Returns the number of marks that were created or updated.
        """
        shelves = {
            ShelfType.WISHLIST: f"https://www.goodreads.com/review/list/{uid}?shelf=to-read",
            ShelfType.PROGRESS: f"https://www.goodreads.com/review/list/{uid}?shelf=currently-reading",
            ShelfType.COMPLETE: f"https://www.goodreads.com/review/list/{uid}?shelf=read",
        }
        total = 0
        for shelf_type, shelf_url in shelves.items():
            shelf = cls.parse_shelf(shelf_url, user)
            for entry in shelf["books"]:
                mark = Mark(user, entry["book"])
                unchanged = (
                    mark.shelf_type == shelf_type
                    and mark.comment_text == entry["review"]
                )
                # Never move an existing mark "backwards": complete stays
                # complete, and in-progress is not reset to wishlist.
                would_downgrade = (
                    mark.shelf_type == ShelfType.COMPLETE
                    and shelf_type != ShelfType.COMPLETE
                ) or (
                    mark.shelf_type == ShelfType.PROGRESS
                    and shelf_type == ShelfType.WISHLIST
                )
                if unchanged or would_downgrade:
                    print(
                        f'Skip {shelf_type}/{entry["book"]} bc it was marked {mark.shelf_type}'
                    )
                    continue
                mark.update(
                    shelf_type,
                    entry["review"],
                    entry["rating"],
                    visibility=visibility,
                    created_time=entry["last_updated"] or timezone.now(),
                )
                # NOTE(review): only marks that were actually written are
                # counted, since the user message reports imported marks.
                total += 1
        return total

    @classmethod
    def get_book(cls, url, user):
        """Return the catalog item for a Goodreads book URL, or None.

        Looks up the site handler for ``url``, uses its existing item when
        there is one and otherwise falls back to ``get_resource_ready()``.
        ``user`` is accepted for interface compatibility and is not used.
        """
        site = SiteManager.get_site_by_url(url)
        if not site:
            # Explicit None instead of relying on an unassigned local.
            return None
        book = site.get_item()
        if not book:
            resource = site.get_resource_ready()
            if resource and resource.item:
                book = resource.item
        return book

    @classmethod
    def _parse_date(cls, text):
        """Return the naive datetime of a "Month D, YYYY" date in ``text``.

        Returns None when no such date is present or the month name is not
        recognised by any of the supported formats.
        """
        found = re.search(r"(\w+)\s+(\d+),\s+(\d+)", text)
        if not found:
            return None
        stamp = f"{found[1]} {found[2]} {found[3]}"
        for fmt in cls._DATE_FORMATS:
            try:
                return datetime.strptime(stamp, fmt)
            except ValueError:
                continue
        return None

    @classmethod
    def _latest_date(cls, texts, fallback=None):
        """Return the aware datetime of the last parsable date in ``texts``.

        ``fallback`` is returned unchanged when none of the texts holds a date.
        """
        result = fallback
        for text in texts:
            parsed = cls._parse_date(text)
            if parsed is not None:
                result = make_aware(parsed)
        return result

    @classmethod
    def _next_page_url(cls, content):
        """Return the absolute URL of the "next page" link, or None if absent."""
        hrefs = content.xpath("//a[@class='next_page']/@href")
        if not hrefs:
            return None
        return f"{cls._BASE_URL}{hrefs[0].strip()}"  # type:ignore

    @classmethod
    def _parse_shelf_row(cls, row, user):
        """Build the book entry for one row of the shelf table.

        Returns a dict with the keys ``url``, ``book``, ``rating``, ``review``
        and ``last_updated``, or None when the row has no book link or the
        book cannot be resolved to a catalog item.
        """
        title_links = row.xpath(".//td[@class='field title']//a/@href")
        if not title_links:
            # A malformed row must not abort the whole import.
            print("Shelf row without book link, skipped")
            return None
        url_book = cls._BASE_URL + title_links[0].strip()

        rating_elem = row.xpath(".//td[@class='field rating']//span/@title")
        rating = gr_rating.get(rating_elem[0].strip()) if rating_elem else None

        last_updated = cls._latest_date(
            row.xpath(".//td[@class='field date_added']//span/text()")
        )

        # review stays None when the review page is missing or cannot be read.
        review = None
        review_links = row.xpath(".//td[@class='field actions']//a/@href")
        if review_links:
            url_review = cls._BASE_URL + review_links[0].strip()
            try:
                page = BasicDownloader(url_review).download().html()
                paragraphs = page.xpath("//div[@itemprop='reviewBody']/text()")
                review = "\n".join(p.strip() for p in paragraphs)  # type:ignore
                # A date on the review page overrides the table date.
                last_updated = cls._latest_date(
                    page.xpath("//div[@class='readingTimeline__text']/text()"),  # type:ignore
                    last_updated,
                )
            except Exception:
                print(f"Error loading/parsing review{url_review}, ignored")

        try:
            book = cls.get_book(url_book, user)
        except Exception as e:
            # likely just a download error
            print(f"Error adding {url_book} {e}")
            return None
        if book is None:
            # Without a catalog item the entry cannot be marked or collected.
            print(f"Error adding {url_book} no catalog item found")
            return None
        return {
            "url": url_book,
            "book": book,
            "rating": rating,
            "review": review,
            "last_updated": last_updated,
        }

    @classmethod
    def parse_shelf(cls, url, user):
        """Scrape every page of a Goodreads shelf.

        Returns:
            ``{"title": str, "description": "", "books": [entry, ...]}`` where
            each entry is the dict produced by ``_parse_shelf_row``. Scraping
            stops at the first page that cannot be loaded or has no title, and
            whatever was collected up to that point is returned.
        """
        title = ""
        books = []
        url_shelf = url + "&view=table"
        while url_shelf:
            print(f"Shelf loading {url_shelf}")
            try:
                content = BasicDownloader(url_shelf).download().html()
                title_elem = content.xpath("//span[@class='h1Shelf']/text()")
                if not title_elem:
                    print(f"Shelf parsing error {url_shelf}")
                    break
                title = title_elem[0].strip()  # type:ignore
                print("Shelf title: " + title)
            except Exception:
                print(f"Shelf loading/parsing error {url_shelf}")
                break
            for row in content.xpath("//tbody[@id='booksBody']/tr"):  # type:ignore
                entry = cls._parse_shelf_row(row, user)
                if entry is not None:
                    books.append(entry)
            url_shelf = cls._next_page_url(content)
        return {"title": title, "description": "", "books": books}

    @classmethod
    def parse_list(cls, url, user):
        """Scrape every page of a Goodreads list.

        Returns:
            ``{"title": str, "description": str, "books": [entry, ...]}`` where
            each entry has the keys ``url``, ``book`` and ``review`` (always
            an empty string for lists). Scraping stops at the first page that
            cannot be loaded or has no title.
        """
        title = ""
        description = ""
        books = []
        url_shelf = url
        while url_shelf:
            print(f"List loading {url_shelf}")
            try:
                content = BasicDownloader(url_shelf).download().html()
            except Exception:
                # Same policy as parse_shelf: keep what was gathered so far.
                print(f"List loading error {url_shelf}")
                break
            title_elem = content.xpath('//h1[@class="gr-h1 gr-h1--serif"]/text()')
            if not title_elem:
                print(f"List parsing error {url_shelf}")
                break
            title = title_elem[0].strip()  # type:ignore
            desc_elem = content.xpath('//div[@class="mediumText"]/text()')
            if desc_elem:
                # The description is optional; keep the previous value when a
                # page does not carry one.
                description = desc_elem[0].strip()  # type:ignore
            print("List title: " + title)
            for link in content.xpath('//a[@class="bookTitle"]/@href'):  # type:ignore
                url_book = cls._BASE_URL + link
                try:
                    book = cls.get_book(url_book, user)
                except Exception:
                    # likely just a download error
                    print("Error adding " + url_book)
                    continue
                if book is None:
                    # Unresolved books cannot be added to a collection.
                    print("Error adding " + url_book)
                    continue
                books.append({"url": url_book, "book": book, "review": ""})
            url_shelf = cls._next_page_url(content)
        return {"title": title, "description": description, "books": books}
|