2022-12-08 05:53:00 +00:00
|
|
|
from catalog.book.models import Edition, Work
|
2022-12-07 19:09:05 -05:00
|
|
|
from catalog.common import *
|
|
|
|
from lxml import html
|
|
|
|
import json
|
|
|
|
import logging
|
|
|
|
|
|
|
|
|
2022-12-08 05:53:00 +00:00
|
|
|
# Module-level logger named after this module's import path.
_logger = logging.getLogger(__name__)
|
2022-12-07 19:09:05 -05:00
|
|
|
|
|
|
|
|
|
|
|
class GoodreadsDownloader(RetryDownloader):
    """Downloader that keeps retrying until the Next.js version of a Goodreads page is served."""

    def validate_response(self, response):
        """Classify *response*; legacy (non-Next.js) pages are treated as retryable errors."""
        if response is None:
            return RESPONSE_NETWORK_ERROR
        if response.status_code != 200:
            return RESPONSE_INVALID_CONTENT
        # Goodreads may return legacy version for a/b testing; the Next.js
        # page is identified by its embedded __NEXT_DATA__ script — retry
        # (report as network error) until we get that version.
        if '__NEXT_DATA__' in response.text:
            return RESPONSE_OK
        return RESPONSE_NETWORK_ERROR
|
|
|
|
|
|
|
|
|
|
|
|
@SiteList.register
class Goodreads(AbstractSite):
    """Scraper for Goodreads book (edition) pages.

    Parses the embedded Next.js ``__NEXT_DATA__`` JSON blob instead of the
    rendered HTML, since the data there is structured.
    """
    ID_TYPE = IdType.Goodreads
    WIKI_PROPERTY_ID = 'P2968'
    DEFAULT_MODEL = Edition
    URL_PATTERNS = [r".+goodreads.com/.*book/show/(\d+)", r".+goodreads.com/.*book/(\d+)"]

    @classmethod
    def id_to_url(cls, id_value):
        """Return the canonical Goodreads book URL for a numeric id string."""
        return "https://www.goodreads.com/book/show/" + id_value

    def scrape(self, response=None):
        """Scrape one edition page into a ResourceContent.

        If *response* is given it is parsed directly; otherwise the page is
        fetched with GoodreadsDownloader (which retries past legacy pages).

        Raises:
            ParseError: when the ``__NEXT_DATA__`` blob, or the Book record
                inside it, cannot be found.
        """
        data = {}
        if response is not None:
            h = html.fromstring(response.text.strip())
        else:
            dl = GoodreadsDownloader(self.url)
            h = dl.download().html()
        # Next.JS version of GoodReads; equivalent of the browser-side
        # JSON.parse(document.getElementById('__NEXT_DATA__').innerHTML)['props']['pageProps']['apolloState']
        elem = h.xpath('//script[@id="__NEXT_DATA__"]/text()')
        src = elem[0].strip() if elem else None
        if not src:
            raise ParseError(self, '__NEXT_DATA__ element')
        d = json.loads(src)['props']['pageProps']['apolloState']
        # Bucket the apollo cache entries by type; only these four types are used.
        o = {'Book': [], 'Work': [], 'Series': [], 'Contributor': []}
        for v in d.values():
            t = v.get('__typename')
            if t and t in o:
                o[t].append(v)
        b = next(filter(lambda x: x.get('title'), o['Book']), None)
        if not b:
            # Goodreads may return empty page template when internal service timeouts
            raise ParseError(self, 'Book in __NEXT_DATA__ json')
        data['title'] = b['title']
        data['brief'] = b['description']
        data['isbn'] = b['details'].get('isbn13')
        asin = b['details'].get('asin')
        # Goodreads sometimes mirrors the ISBN into the asin field; keep only a distinct ASIN.
        if asin and asin != data['isbn']:
            data['asin'] = asin
        data['pages'] = b['details'].get('numPages')
        data['cover_image_url'] = b['imageUrl']
        w = next(filter(lambda x: x.get('details'), o['Work']), None)
        if w:
            # Link this edition to its parent Work so editions can be grouped.
            data['required_resources'] = [{
                'model': 'Work',
                'id_type': IdType.Goodreads_Work,
                'id_value': str(w['legacyId']),
                'title': w['details']['originalTitle'],
                'url': w['editions']['webUrl'],
            }]
        pd = ResourceContent(metadata=data)
        pd.lookup_ids[IdType.ISBN] = data.get('isbn')
        pd.lookup_ids[IdType.ASIN] = data.get('asin')
        if data["cover_image_url"]:
            imgdl = BasicImageDownloader(data["cover_image_url"], self.url)
            try:
                pd.cover_image = imgdl.download().content
                # NOTE: 'extention' spelling matches the downloader's attribute name.
                pd.cover_image_extention = imgdl.extention
            except Exception:
                # Cover art is optional; a failed download must not fail the scrape.
                # Lazy %-args instead of f-string so formatting is skipped when
                # DEBUG is disabled.
                _logger.debug('failed to download cover for %s from %s',
                              self.url, data["cover_image_url"])
        return pd
|
|
|
|
|
|
|
|
|
|
|
|
@SiteList.register
class Goodreads_Work(AbstractSite):
    """Scraper for Goodreads work (editions list) pages."""
    ID_TYPE = IdType.Goodreads_Work
    WIKI_PROPERTY_ID = ''
    DEFAULT_MODEL = Work
    URL_PATTERNS = [r".+goodreads.com/work/editions/(\d+)"]

    @classmethod
    def id_to_url(cls, id_value):
        """Return the canonical Goodreads work-editions URL for a numeric id string."""
        return "https://www.goodreads.com/work/editions/" + id_value

    def scrape(self, response=None):
        """Scrape the work page for title, author and first-published date.

        NOTE(review): *response* is accepted for interface parity with other
        sites but is currently ignored — the page is always re-fetched from
        ``self.url``.

        Raises:
            ParseError: if no title can be extracted from the page.
        """
        content = BasicDownloader(self.url).download().html()
        title_elem = content.xpath("//h1/a/text()")
        title = title_elem[0].strip() if title_elem else None
        if not title:
            raise ParseError(self, 'title')
        author_elem = content.xpath("//h2/a/text()")
        author = author_elem[0].strip() if author_elem else None
        first_published_elem = content.xpath("//h2/span/text()")
        first_published = first_published_elem[0].strip() if first_published_elem else None
        pd = ResourceContent(metadata={
            'title': title,
            'author': author,
            'first_published': first_published,
        })
        return pd
|