new data model: add legacy Book fields to Edition as virtual fields

This commit is contained in:
Your Name 2022-12-14 21:12:37 -05:00
parent 3d6eeab70b
commit 486dd16e1f
7 changed files with 190 additions and 92 deletions

View file

@ -31,13 +31,40 @@ class Edition(Item):
cubn = PrimaryLookupIdDescriptor(IdType.CUBN)
# douban_book = LookupIdDescriptor(IdType.DoubanBook)
# goodreads = LookupIdDescriptor(IdType.Goodreads)
languages = jsondata.ArrayField(_("语言"), null=True, blank=True, default=list)
publish_year = jsondata.IntegerField(_("发表年份"), null=True, blank=True)
publish_month = jsondata.IntegerField(_("发表月份"), null=True, blank=True)
METADATA_COPY_LIST = [
'title',
'brief',
# legacy fields
'subtitle',
'orig_title',
'author',
'translator',
'language',
'pub_house',
'pub_year',
'pub_month',
'binding',
'price',
'pages',
'contents',
'series',
'producer',
]
subtitle = jsondata.CharField(null=True, blank=True, default=None)
orig_title = jsondata.CharField(null=True, blank=True, default=None)
author = jsondata.ArrayField(_('作者'), null=False, blank=False, default=list)
translator = jsondata.ArrayField(_('译者'), null=True, blank=True, default=list)
language = jsondata.ArrayField(_("语言"), null=True, blank=True, default=list)
pub_house = jsondata.ArrayField(_('出版方'), null=True, blank=True, default=list)
pub_year = jsondata.IntegerField(_("发表年份"), null=True, blank=True)
pub_month = jsondata.IntegerField(_("发表月份"), null=True, blank=True)
binding = jsondata.CharField(null=True, blank=True, default=None)
pages = jsondata.IntegerField(blank=True, default=None)
authors = jsondata.ArrayField(_('作者'), null=False, blank=False, default=list)
translaters = jsondata.ArrayField(_('译者'), null=True, blank=True, default=list)
publishers = jsondata.ArrayField(_('出版方'), null=True, blank=True, default=list)
series = jsondata.CharField(null=True, blank=True, default=None)
contents = jsondata.CharField(null=True, blank=True, default=None)
price = jsondata.FloatField(_("发表月份"), null=True, blank=True)
producer = jsondata.FloatField(_("发表月份"), null=True, blank=True)
@property
def isbn10(self):

View file

@ -235,7 +235,7 @@ class MockResponse:
return json.load(StringIO(self.text))
def html(self):
return html.fromstring(self.text) # may throw exception unexpectedly due to OS bug
return html.fromstring(self.text) # may throw exception unexpectedly due to OS bug, see https://github.com/neodb-social/neodb/issues/5
@property
def headers(self):

View file

@ -184,7 +184,7 @@ class Item(PolymorphicModel, SoftDeleteMixin):
self.primary_lookup_id_type = None
def __str__(self):
return f"{self.id}{' ' + self.primary_lookup_id_type + ':' + self.primary_lookup_id_value if self.primary_lookup_id_value else ''} ({self.title})"
return f"{self.id}|{self.url_id}{' ' + self.primary_lookup_id_type + ':' + self.primary_lookup_id_value if self.primary_lookup_id_value else ''} ({self.title})"
@classmethod
def get_best_lookup_id(cls, lookup_ids):
@ -210,9 +210,13 @@ class Item(PolymorphicModel, SoftDeleteMixin):
else:
self.merged_to_item = to_item
@property
def url_id(self):
return base62.encode(self.uid.int)
@property
def url(self):
return f'/{self.url_path}/{base62.encode(self.uid.int)}'
return f'/{self.url_path}/{self.url_id}'
@classmethod
def get_by_url(cls, url_or_b62):
@ -236,6 +240,9 @@ class Item(PolymorphicModel, SoftDeleteMixin):
def copy_metadata(cls, metadata):
return dict((k, v) for k, v in metadata.items() if k in cls.METADATA_COPY_LIST and v is not None)
def has_cover(self):
return self.cover and self.cover != DEFAULT_ITEM_COVER
def merge_data_from_external_resources(self):
"""Subclass may override this"""
lookup_ids = []
@ -245,7 +252,7 @@ class Item(PolymorphicModel, SoftDeleteMixin):
for k in self.METADATA_COPY_LIST:
if not getattr(self, k) and p.metadata.get(k):
setattr(self, k, p.metadata.get(k))
if not self.cover and p.cover:
if not self.has_cover() and p.cover:
self.cover = p.cover
self.update_lookup_ids(lookup_ids)
@ -284,6 +291,10 @@ class ExternalResource(models.Model):
def __str__(self):
return f"{self.id}{':' + self.id_type + ':' + self.id_value if self.id_value else ''} ({self.url})"
@property
def site_name(self):
return self.id_type # TODO change to localized name
def update_content(self, resource_content):
self.other_lookup_ids = resource_content.lookup_ids
self.metadata = resource_content.metadata

View file

@ -11,4 +11,4 @@ DEFAULT_ITEM_COVER = 'item/default.svg'
def item_cover_path(resource, filename):
fn = timezone.now().strftime('%Y/%m/%d/') + str(uuid.uuid4()) + '.' + filename.split('.')[-1]
return 'items/' + resource.id_type + '/' + fn
return 'item/' + resource.id_type + '/' + fn

View file

@ -5,10 +5,15 @@ from catalog.sites import *
class Command(BaseCommand):
help = 'Scrape a catalog item from external resource (but not save it)'
help = 'Scrape a catalog item from external resource (and save it)'
def add_arguments(self, parser):
parser.add_argument('url', type=str, help='URL to scrape')
parser.add_argument(
'--save',
action='store_true',
help='save to database',
)
def handle(self, *args, **options):
url = str(options['url'])
@ -17,7 +22,13 @@ class Command(BaseCommand):
self.stdout.write(self.style.ERROR(f'Unknown site for {url}'))
return
self.stdout.write(f'Fetching from {site}')
resource = site.scrape()
if options['save']:
resource = site.get_resource_ready()
pprint.pp(resource.metadata)
pprint.pp(site.get_item())
pprint.pp(site.get_item().metadata)
else:
resource = site.scrape()
pprint.pp(resource.metadata)
pprint.pp(resource.lookup_ids)
self.stdout.write(self.style.SUCCESS(f'Done.'))
pprint.pp(resource.metadata)
pprint.pp(resource.lookup_ids)

View file

@ -8,27 +8,8 @@ import logging
_logger = logging.getLogger(__name__)
class ScraperMixin:
def set_field(self, field, value=None):
self.data[field] = value
def parse_str(self, query):
elem = self.html.xpath(query)
return elem[0].strip() if elem else None
def parse_field(self, field, query, error_when_missing=False):
elem = self.html.xpath(query)
if elem:
self.data[field] = elem[0].strip()
elif error_when_missing:
raise ParseError(self, field)
else:
self.data[field] = None
return elem
@SiteList.register
class DoubanBook(AbstractSite, ScraperMixin):
class DoubanBook(AbstractSite):
ID_TYPE = IdType.DoubanBook
URL_PATTERNS = [r"\w+://book\.douban\.com/subject/(\d+)/{0,1}", r"\w+://m.douban.com/book/subject/(\d+)/{0,1}"]
WIKI_PROPERTY_ID = '?'
@ -39,28 +20,34 @@ class DoubanBook(AbstractSite, ScraperMixin):
return "https://book.douban.com/subject/" + id_value + "/"
def scrape(self):
self.data = {}
self.html = DoubanDownloader(self.url).download().html()
self.parse_field('title', "/html/body//h1/span/text()")
self.parse_field('isbn', "//div[@id='info']//span[text()='ISBN:']/following::text()")
# TODO does douban store ASIN as ISBN, need more cleanup if so
if not self.data['title']:
if self.data['isbn']:
self.data['title'] = 'isbn: ' + isbn
else:
raise ParseError(self, 'title')
content = DoubanDownloader(self.url).download().html()
self.parse_field('cover_image_url', "//*[@id='mainpic']/a/img/@src")
self.parse_field('brief', "//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()")
self.parse_field('series', "//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()")
self.parse_field('producer', "//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()")
self.parse_field('cubn', "//div[@id='info']//span[text()='统一书号:']/following::text()")
self.parse_field('subtitle', "//div[@id='info']//span[text()='副标题:']/following::text()")
self.parse_field('orig_title', "//div[@id='info']//span[text()='原作名:']/following::text()")
self.parse_field('language', "//div[@id='info']//span[text()='语言:']/following::text()")
self.parse_field('pub_house', "//div[@id='info']//span[text()='出版社:']/following::text()")
self.parse_field('pub_date', "//div[@id='info']//span[text()='出版年:']/following::text()")
year_month_day = RE_NUMBERS.findall(self.data['pub_date']) if self.data['pub_date'] else []
isbn_elem = content.xpath("//div[@id='info']//span[text()='ISBN:']/following::text()")
isbn = isbn_elem[0].strip() if isbn_elem else None
title_elem = content.xpath("/html/body//h1/span/text()")
title = title_elem[0].strip() if title_elem else f"Unknown Title {self.id_value}"
subtitle_elem = content.xpath(
"//div[@id='info']//span[text()='副标题:']/following::text()")
subtitle = subtitle_elem[0].strip()[:500] if subtitle_elem else None
orig_title_elem = content.xpath(
"//div[@id='info']//span[text()='原作名:']/following::text()")
orig_title = orig_title_elem[0].strip()[:500] if orig_title_elem else None
language_elem = content.xpath(
"//div[@id='info']//span[text()='语言:']/following::text()")
language = language_elem[0].strip() if language_elem else None
pub_house_elem = content.xpath(
"//div[@id='info']//span[text()='出版社:']/following::text()")
pub_house = pub_house_elem[0].strip() if pub_house_elem else None
pub_date_elem = content.xpath(
"//div[@id='info']//span[text()='出版年:']/following::text()")
pub_date = pub_date_elem[0].strip() if pub_date_elem else ''
year_month_day = RE_NUMBERS.findall(pub_date)
if len(year_month_day) in (2, 3):
pub_year = int(year_month_day[0])
pub_month = int(year_month_day[1])
@ -77,33 +64,51 @@ class DoubanBook(AbstractSite, ScraperMixin):
pub_month = None if pub_month is not None and pub_month not in range(
1, 12) else pub_month
self.parse_field('binding', "//div[@id='info']//span[text()='装帧:']/following::text()")
self.parse_field('price', "//div[@id='info']//span[text()='定价:']/following::text()")
self.parse_field('pages', "//div[@id='info']//span[text()='页数:']/following::text()")
if self.data['pages'] is not None:
self.data['pages'] = int(RE_NUMBERS.findall(self.data['pages'])[0]) if RE_NUMBERS.findall(self.data['pages']) else None
if self.data['pages'] and (self.data['pages'] > 999999 or self.data['pages'] < 1):
self.data['pages'] = None
binding_elem = content.xpath(
"//div[@id='info']//span[text()='装帧:']/following::text()")
binding = binding_elem[0].strip() if binding_elem else None
price_elem = content.xpath(
"//div[@id='info']//span[text()='定价:']/following::text()")
price = price_elem[0].strip() if price_elem else None
pages_elem = content.xpath(
"//div[@id='info']//span[text()='页数:']/following::text()")
pages = pages_elem[0].strip() if pages_elem else None
if pages is not None:
pages = int(RE_NUMBERS.findall(pages)[
0]) if RE_NUMBERS.findall(pages) else None
if pages and (pages > 999999 or pages < 1):
pages = None
brief_elem = content.xpath(
"//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()")
brief = '\n'.join(p.strip()
for p in brief_elem) if brief_elem else None
contents = None
try:
contents_elem = self.html.xpath(
contents_elem = content.xpath(
"//h2/span[text()='目录']/../following-sibling::div[1]")[0]
# if the id of the next sibling contains `dir`, that would be the full contents
if "dir" in contents_elem.getnext().xpath("@id")[0]:
contents_elem = contents_elem.getnext()
contents = '\n'.join(p.strip() for p in contents_elem.xpath("text()")[:-2]) if len(contents_elem) else None
contents = '\n'.join(p.strip() for p in contents_elem.xpath(
"text()")[:-2]) if contents_elem is not None else None
else:
contents = '\n'.join(p.strip() for p in contents_elem.xpath("text()")) if len(contents_elem) else None
contents = '\n'.join(p.strip() for p in contents_elem.xpath(
"text()")) if contents_elem is not None else None
except Exception:
pass
self.data['contents'] = contents
img_url_elem = content.xpath("//*[@id='mainpic']/a/img/@src")
img_url = img_url_elem[0].strip() if img_url_elem else None
# there are two html formats for authors and translators
authors_elem = self.html.xpath("""//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/
authors_elem = content.xpath("""//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/
preceding-sibling::a[preceding-sibling::span[text()='作者:']]/text()""")
if not authors_elem:
authors_elem = self.html.xpath(
authors_elem = content.xpath(
"""//div[@id='info']//span[text()=' 作者']/following-sibling::a/text()""")
if authors_elem:
authors = []
@ -111,12 +116,11 @@ class DoubanBook(AbstractSite, ScraperMixin):
authors.append(RE_WHITESPACES.sub(' ', author.strip())[:200])
else:
authors = None
self.data['authors'] = authors
translators_elem = self.html.xpath("""//div[@id='info']//span[text()='译者:']/following-sibling::br[1]/
translators_elem = content.xpath("""//div[@id='info']//span[text()='译者:']/following-sibling::br[1]/
preceding-sibling::a[preceding-sibling::span[text()='译者:']]/text()""")
if not translators_elem:
translators_elem = self.html.xpath(
translators_elem = content.xpath(
"""//div[@id='info']//span[text()=' 译者']/following-sibling::a/text()""")
if translators_elem:
translators = []
@ -124,28 +128,56 @@ class DoubanBook(AbstractSite, ScraperMixin):
translators.append(RE_WHITESPACES.sub(' ', translator.strip()))
else:
translators = None
self.data['translators'] = translators
work_link = self.parse_str('//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href')
if work_link:
r = re.match(r'\w+://book.douban.com/works/(\d+)', work_link)
self.data['required_resources'] = [{
cncode_elem = content.xpath(
"//div[@id='info']//span[text()='统一书号:']/following::text()")
cubn = cncode_elem[0].strip() if cncode_elem else None
series_elem = content.xpath(
"//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()")
series = series_elem[0].strip() if series_elem else None
imprint_elem = content.xpath(
"//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()")
producer = imprint_elem[0].strip() if imprint_elem else None
data = {
'title': title,
'subtitle': subtitle,
'orig_title': orig_title,
'author': authors,
'translator': translators,
'language': language,
'pub_house': pub_house,
'pub_year': pub_year,
'pub_month': pub_month,
'binding': binding,
'price': price,
'pages': pages,
'isbn': isbn,
'cubn': cubn,
'brief': brief,
'contents': contents,
'series': series,
'producer': producer,
'cover_image_url': img_url,
}
works_element = content.xpath('//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href')
if works_element:
r = re.match(r'\w+://book.douban.com/works/(\d+)', works_element[0])
data['required_resources'] = [{
'model': 'Work',
'id_type': IdType.DoubanBook_Work,
'id_type': IdType.DoubanBook_Work,
'id_value': r[1] if r else None,
'title': self.data['title'],
'url': work_link,
'title': data['title'],
'url': works_element[0],
}]
pd = ResourceContent(metadata=self.data)
pd.lookup_ids[IdType.ISBN] = self.data.get('isbn')
pd.lookup_ids[IdType.CUBN] = self.data.get('cubn')
if self.data["cover_image_url"]:
imgdl = BasicImageDownloader(self.data["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {self.data["cover_image_url"]}')
pd = ResourceContent(metadata=data)
pd.lookup_ids[IdType.ISBN] = isbn
pd.lookup_ids[IdType.CUBN] = cubn
pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(img_url, self.url)
return pd

View file

@ -14,8 +14,9 @@ from django.core.validators import MaxValueValidator, MinValueValidator
from django.utils.translation import gettext_lazy as _
from django.core.validators import RegexValidator
from functools import cached_property
from django.db.models import Count
from django.db.models import Count, Avg
import django.dispatch
import math
class Piece(PolymorphicModel, UserOwnedObjectMixin):
@ -29,7 +30,7 @@ class Piece(PolymorphicModel, UserOwnedObjectMixin):
class Content(SoftDeleteMixin, Piece):
item: models.ForeignKey(Item, on_delete=models.PROTECT)
item = models.ForeignKey(Item, on_delete=models.PROTECT)
def __str__(self):
return f"{self.id}({self.item})"
@ -50,6 +51,22 @@ class Rating(Content):
grade = models.IntegerField(default=0, validators=[MaxValueValidator(10), MinValueValidator(0)])
class RatingManager:
@staticmethod
def get_rating_for_item(item):
stat = Rating.objects.filter(item=item).aggregate(average=Avg('grade'), count=Count('item'))
return math.ceil(stat['average']) if stat['count'] >= 5 else 0
@staticmethod
def get_rating_count_for_item(item):
stat = Rating.objects.filter(item=item).aggregate(count=Count('item'))
return stat['count']
Item.rating = property(RatingManager.get_rating_for_item)
Item.rating_count = property(RatingManager.get_rating_count_for_item)
class Reply(Content):
reply_to_content = models.ForeignKey(Content, on_delete=models.PROTECT, related_name='replies')
title = models.CharField(max_length=500, null=True)