# lib.itmens/journal/importers/goodreads.py
import re
from datetime import datetime
from django.utils import timezone
from django.utils.timezone import make_aware
from catalog.common import *
from catalog.common.downloaders import *
from catalog.models import *
from journal.models import *
from users.models import Task
# URL patterns for the three supported Goodreads page types.
# A public list page, e.g. https://www.goodreads.com/list/show/123
re_list = r"^https://www\.goodreads\.com/list/show/\d+"
# A user's shelf page; requires a ?shelf=... query parameter.
re_shelf = r"^https://www\.goodreads\.com/review/list/\d+[^\?]*\?shelf=[^&]+"
# A user profile page; group 1 captures the numeric user id.
re_profile = r"^https://www\.goodreads\.com/user/show/(\d+)"
# Maps the Goodreads star-rating tooltip text (1..5 stars) to the 1-10 scale
# used by Mark.update() below.
gr_rating = {
    "did not like it": 2,
    "it was ok": 4,
    "liked it": 6,
    "really liked it": 8,
    "it was amazing": 10,
}
class GoodreadsImporter(Task):
    """Background task importing books from Goodreads.

    Three kinds of source URL are supported (see the module-level regexes):
      * a list page    -> imported as a new Collection
      * a shelf page   -> imported as a new Collection
      * a profile page -> its to-read / currently-reading / read shelves are
        imported as WISHLIST / PROGRESS / COMPLETE marks

    Progress is reported through ``self.message`` and ``self.metadata``.
    """

    class Meta:
        app_label = "journal"  # workaround bug in TypedModel

    TaskQueue = "import"
    DefaultMetadata = {
        "total": 0,
        "processed": 0,
        "skipped": 0,
        "imported": 0,
        "failed": 0,
        "visibility": 0,
        "failed_urls": [],
        "url": None,
    }

    @classmethod
    def validate_url(cls, raw_url: str) -> bool:
        """Return True if raw_url matches any supported Goodreads URL form."""
        return bool(
            re.match(re_list, raw_url)
            or re.match(re_shelf, raw_url)
            or re.match(re_profile, raw_url)
        )

    def run(self):
        """Run the import for ``self.metadata["url"]``.

        Shelf and list URLs become a Collection owned by the user; profile
        URLs become shelf marks for the three standard Goodreads shelves.
        """
        url = self.metadata["url"]
        user = self.user
        match_list = re.match(re_list, url)
        match_shelf = re.match(re_shelf, url)
        match_profile = re.match(re_profile, url)
        total = 0
        visibility = user.preference.default_visibility
        shelf = None
        if match_shelf:
            shelf = self.parse_shelf(match_shelf[0])
        elif match_list:
            shelf = self.parse_list(match_list[0])
        if shelf:
            if shelf["title"] and shelf["books"]:
                collection = Collection.objects.create(
                    title=shelf["title"],
                    brief=shelf["description"]
                    + "\n\nImported from [Goodreads]("
                    + url
                    + ")",
                    owner=user.identity,
                )
                for book in shelf["books"]:
                    collection.append_item(book["book"], note=book["review"])
                    total += 1
                collection.save()
                self.message = f"Imported {total} books from Goodreads as a Collection {shelf['title']}."
        elif match_profile:
            uid = match_profile[1]
            # Import the three standard shelves as the corresponding mark types.
            shelves = {
                ShelfType.WISHLIST: f"https://www.goodreads.com/review/list/{uid}?shelf=to-read",
                ShelfType.PROGRESS: f"https://www.goodreads.com/review/list/{uid}?shelf=currently-reading",
                ShelfType.COMPLETE: f"https://www.goodreads.com/review/list/{uid}?shelf=read",
            }
            for shelf_type in shelves:
                shelf_url = shelves.get(shelf_type)
                shelf = self.parse_shelf(shelf_url)
                for book in shelf["books"]:
                    mark = Mark(user.identity, book["book"])
                    # Skip when an existing mark is equal or "further along",
                    # so re-imports never move a mark backwards:
                    #   - same shelf with the same comment (nothing to change)
                    #   - already COMPLETE while importing a non-COMPLETE shelf
                    #   - already PROGRESS while importing the WISHLIST shelf
                    if (
                        (
                            mark.shelf_type == shelf_type
                            and mark.comment_text == book["review"]
                        )
                        or (
                            mark.shelf_type == ShelfType.COMPLETE
                            and shelf_type != ShelfType.COMPLETE
                        )
                        or (
                            mark.shelf_type == ShelfType.PROGRESS
                            and shelf_type == ShelfType.WISHLIST
                        )
                    ):
                        print(
                            f"Skip {shelf_type}/{book['book']} bc it was marked {mark.shelf_type}"
                        )
                    else:
                        mark.update(
                            shelf_type,
                            book["review"],
                            book["rating"],
                            visibility=visibility,
                            created_time=book["last_updated"] or timezone.now(),
                        )
                        total += 1
            self.message = f"Imported {total} records from Goodreads profile."
        self.metadata["total"] = total
        self.save()

    @classmethod
    def get_book(cls, url):
        """Resolve a Goodreads book URL to a catalog item.

        Returns the already-known item when available, otherwise scrapes the
        resource first. Returns None when the URL cannot be resolved to an
        item (unknown site, or scraping produced no item).
        """
        book = None  # fix: was unbound when no site matched the URL
        site = SiteManager.get_site_by_url(url)
        if site:
            book = site.get_item()
            if not book:
                resource = site.get_resource_ready()
                if resource and resource.item:
                    book = resource.item
        return book

    @classmethod
    def parse_shelf(cls, url):
        """Scrape one shelf (following pagination) and return its books.

        Returns ``{'title': str, 'description': '', 'books': [
        {'url', 'book', 'rating', 'review', 'last_updated'}, ...]}``.
        Books that cannot be resolved to a catalog item are skipped so that
        ``run()`` never receives an entry with ``book`` set to None.
        """
        title = ""
        books = []
        url_shelf = url + "&view=table"  # table view exposes rating/date columns
        while url_shelf:
            print(f"Shelf loading {url_shelf}")
            try:
                content = BasicDownloader(url_shelf).download().html()
                title_elem = content.xpath("//span[@class='h1Shelf']/text()")
                if not title_elem:
                    print(f"Shelf parsing error {url_shelf}")
                    break
                title = title_elem[0].strip()  # type:ignore
                print(f"Shelf title: {title}")
            except Exception:
                print(f"Shelf loading/parsing error {url_shelf}")
                break
            cells = content.xpath("//tbody[@id='booksBody']/tr")
            for cell in cells:  # type:ignore
                url_book = (
                    "https://www.goodreads.com"
                    + cell.xpath(".//td[@class='field title']//a/@href")[0].strip()
                )
                # Rating is encoded in the star widget's tooltip text.
                rating_elem = cell.xpath(".//td[@class='field rating']//span/@title")
                rating = gr_rating.get(rating_elem[0].strip()) if rating_elem else None
                url_review = (
                    "https://www.goodreads.com"
                    + cell.xpath(".//td[@class='field actions']//a/@href")[0].strip()
                )
                review = None
                last_updated = None
                # "date added" column, e.g. "Sep 5, 2021" (abbreviated month).
                date_elem = cell.xpath(".//td[@class='field date_added']//span/text()")
                for d in date_elem:
                    date_matched = re.search(r"(\w+)\s+(\d+),\s+(\d+)", d)
                    if date_matched:
                        last_updated = make_aware(
                            datetime.strptime(
                                f"{date_matched[1]} {date_matched[2]} {date_matched[3]}",
                                "%b %d %Y",
                            )
                        )
                try:
                    # The review page carries the full review text and a more
                    # precise timeline date (full month name, hence "%B").
                    c2 = BasicDownloader(url_review).download().html()
                    review_elem = c2.xpath("//div[@itemprop='reviewBody']/text()")
                    review = (
                        "\n".join(p.strip() for p in review_elem)  # type:ignore
                        if review_elem
                        else ""
                    )
                    date_elem = c2.xpath("//div[@class='readingTimeline__text']/text()")
                    for d in date_elem:  # type:ignore
                        date_matched = re.search(r"(\w+)\s+(\d+),\s+(\d+)", d)
                        if date_matched:
                            last_updated = make_aware(
                                datetime.strptime(
                                    f"{date_matched[1]} {date_matched[2]} {date_matched[3]}",
                                    "%B %d %Y",
                                )
                            )
                except Exception:
                    print(f"Error loading/parsing review{url_review}, ignored")
                try:
                    book = cls.get_book(url_book)
                    if book:
                        books.append(
                            {
                                "url": url_book,
                                "book": book,
                                "rating": rating,
                                "review": review,
                                "last_updated": last_updated,
                            }
                        )
                    else:
                        # fix: a None book would crash Mark()/append_item() later
                        print(f"Error resolving {url_book}, skipped")
                except Exception as e:
                    print(f"Error adding {url_book} {e}")  # likely just download error
            next_elem = content.xpath("//a[@class='next_page']/@href")
            url_shelf = (
                f"https://www.goodreads.com{next_elem[0].strip()}"  # type:ignore
                if next_elem
                else None
            )
        return {"title": title, "description": "", "books": books}

    @classmethod
    def parse_list(cls, url):
        """Scrape a public list (following pagination) and return its books.

        Returns ``{'title': str, 'description': str, 'books': [
        {'url', 'book', 'review'}, ...]}`` — list entries carry no rating or
        date, so only these three keys are present. Unresolvable books are
        skipped (see parse_shelf).
        """
        title = ""
        description = ""
        books = []
        url_shelf = url
        while url_shelf:
            print(f"List loading {url_shelf}")
            content = BasicDownloader(url_shelf).download().html()
            title_elem = content.xpath('//h1[@class="gr-h1 gr-h1--serif"]/text()')
            if not title_elem:
                print(f"List parsing error {url_shelf}")
                break
            title = title_elem[0].strip()  # type:ignore
            desc_elem = content.xpath('//div[@class="mediumText"]/text()')
            # fix: the description block may be absent; don't IndexError on it
            if desc_elem:
                description = desc_elem[0].strip()  # type:ignore
            print("List title: " + title)
            links = content.xpath('//a[@class="bookTitle"]/@href')
            for link in links:  # type:ignore
                url_book = "https://www.goodreads.com" + link
                try:
                    book = cls.get_book(url_book)
                    if book:
                        books.append(
                            {
                                "url": url_book,
                                "book": book,
                                "review": "",
                            }
                        )
                    else:
                        # fix: a None book would crash append_item() later
                        print("Error resolving " + url_book + ", skipped")
                except Exception:
                    print("Error adding " + url_book)  # likely just download error
            next_elem = content.xpath("//a[@class='next_page']/@href")
            url_shelf = (
                f"https://www.goodreads.com{next_elem[0].strip()}"  # type:ignore
                if next_elem
                else None
            )
        return {"title": title, "description": description, "books": books}