new data model: add legacy fields for Book to Edition as virtual fields
parent 3d6eeab70b
commit 486dd16e1f
7 changed files with 190 additions and 92 deletions
@@ -31,13 +31,40 @@ class Edition(Item):
     cubn = PrimaryLookupIdDescriptor(IdType.CUBN)
     # douban_book = LookupIdDescriptor(IdType.DoubanBook)
     # goodreads = LookupIdDescriptor(IdType.Goodreads)

-    languages = jsondata.ArrayField(_("语言"), null=True, blank=True, default=list)
-    publish_year = jsondata.IntegerField(_("发表年份"), null=True, blank=True)
-    publish_month = jsondata.IntegerField(_("发表月份"), null=True, blank=True)
-    authors = jsondata.ArrayField(_('作者'), null=False, blank=False, default=list)
-    translaters = jsondata.ArrayField(_('译者'), null=True, blank=True, default=list)
-    publishers = jsondata.ArrayField(_('出版方'), null=True, blank=True, default=list)
+    METADATA_COPY_LIST = [
+        'title',
+        'brief',
+        # legacy fields
+        'subtitle',
+        'orig_title',
+        'author',
+        'translator',
+        'language',
+        'pub_house',
+        'pub_year',
+        'pub_month',
+        'binding',
+        'price',
+        'pages',
+        'contents',
+        'series',
+        'producer',
+    ]
+    subtitle = jsondata.CharField(null=True, blank=True, default=None)
+    orig_title = jsondata.CharField(null=True, blank=True, default=None)
+    author = jsondata.ArrayField(_('作者'), null=False, blank=False, default=list)
+    translator = jsondata.ArrayField(_('译者'), null=True, blank=True, default=list)
+    language = jsondata.ArrayField(_("语言"), null=True, blank=True, default=list)
+    pub_house = jsondata.ArrayField(_('出版方'), null=True, blank=True, default=list)
+    pub_year = jsondata.IntegerField(_("发表年份"), null=True, blank=True)
+    pub_month = jsondata.IntegerField(_("发表月份"), null=True, blank=True)
+    binding = jsondata.CharField(null=True, blank=True, default=None)
+    pages = jsondata.IntegerField(blank=True, default=None)
+    series = jsondata.CharField(null=True, blank=True, default=None)
+    contents = jsondata.CharField(null=True, blank=True, default=None)
+    price = jsondata.FloatField(_("发表月份"), null=True, blank=True)
+    producer = jsondata.FloatField(_("发表月份"), null=True, blank=True)

     @property
     def isbn10(self):
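Note: these fields are "virtual" in the commit-title sense because jsondata fields are assumed to live in the item's JSON metadata column rather than in dedicated database columns, so adding legacy names needs no schema migration. A minimal sketch of the idea in plain Python (the VirtualField descriptor below is hypothetical, not the project's actual jsondata implementation):

# Sketch only: a descriptor that proxies attribute access to a JSON dict,
# the way a jsondata "virtual field" proxies to the model's metadata column.
class VirtualField:
    def __set_name__(self, owner, name):
        self.name = name

    def __get__(self, obj, objtype=None):
        if obj is None:
            return self
        return obj.metadata.get(self.name)

    def __set__(self, obj, value):
        obj.metadata[self.name] = value


class EditionSketch:
    subtitle = VirtualField()
    pub_year = VirtualField()

    def __init__(self):
        self.metadata = {}  # in Django this would be models.JSONField(default=dict)


e = EditionSketch()
e.pub_year = 1923
assert e.metadata == {'pub_year': 1923}  # stored in JSON, no new column needed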
@@ -235,7 +235,7 @@ class MockResponse:
         return json.load(StringIO(self.text))

     def html(self):
-        return html.fromstring(self.text)  # may throw exception unexpectedly due to OS bug
+        return html.fromstring(self.text)  # may throw exception unexpectedly due to OS bug, see https://github.com/neodb-social/neodb/issues/5

     @property
     def headers(self):
@@ -184,7 +184,7 @@ class Item(PolymorphicModel, SoftDeleteMixin):
         self.primary_lookup_id_type = None

     def __str__(self):
-        return f"{self.id}{' ' + self.primary_lookup_id_type + ':' + self.primary_lookup_id_value if self.primary_lookup_id_value else ''} ({self.title})"
+        return f"{self.id}|{self.url_id}{' ' + self.primary_lookup_id_type + ':' + self.primary_lookup_id_value if self.primary_lookup_id_value else ''} ({self.title})"

     @classmethod
     def get_best_lookup_id(cls, lookup_ids):
@@ -210,9 +210,13 @@ class Item(PolymorphicModel, SoftDeleteMixin):
         else:
             self.merged_to_item = to_item

+    @property
+    def url_id(self):
+        return base62.encode(self.uid.int)
+
     @property
     def url(self):
-        return f'/{self.url_path}/{base62.encode(self.uid.int)}'
+        return f'/{self.url_path}/{self.url_id}'

     @classmethod
     def get_by_url(cls, url_or_b62):
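The new url_id property shortens the item's 128-bit UUID into a compact URL-safe string. A self-contained sketch of the encoding, assuming the project's base62 module uses the conventional 0-9A-Za-z alphabet:

import uuid

ALPHABET = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'

def base62_encode(n: int) -> str:
    # repeatedly divide by 62, collecting remainders as digits
    if n == 0:
        return ALPHABET[0]
    digits = []
    while n:
        n, r = divmod(n, 62)
        digits.append(ALPHABET[r])
    return ''.join(reversed(digits))

uid = uuid.uuid4()
print(base62_encode(uid.int))  # ~22 characters instead of the 36-char hyphenated UUID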
@@ -236,6 +240,9 @@ class Item(PolymorphicModel, SoftDeleteMixin):
     def copy_metadata(cls, metadata):
         return dict((k, v) for k, v in metadata.items() if k in cls.METADATA_COPY_LIST and v is not None)

+    def has_cover(self):
+        return self.cover and self.cover != DEFAULT_ITEM_COVER
+
     def merge_data_from_external_resources(self):
         """Subclass may override this"""
         lookup_ids = []
@@ -245,7 +252,7 @@ class Item(PolymorphicModel, SoftDeleteMixin):
             for k in self.METADATA_COPY_LIST:
                 if not getattr(self, k) and p.metadata.get(k):
                     setattr(self, k, p.metadata.get(k))
-            if not self.cover and p.cover:
+            if not self.has_cover() and p.cover:
                 self.cover = p.cover
         self.update_lookup_ids(lookup_ids)
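Taken together, METADATA_COPY_LIST acts as a whitelist: copy_metadata keeps only listed keys and drops None values, and merge_data_from_external_resources fills a field from a resource only when the item has no value yet. The filter in isolation (list and values here are illustrative):

METADATA_COPY_LIST = ['title', 'brief', 'pub_year']
scraped = {'title': '呐喊', 'brief': None, 'isbn': '9787020008759', 'pub_year': 1923}

copied = dict((k, v) for k, v in scraped.items()
              if k in METADATA_COPY_LIST and v is not None)
assert copied == {'title': '呐喊', 'pub_year': 1923}  # unlisted isbn and None brief dropped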
@@ -284,6 +291,10 @@ class ExternalResource(models.Model):
     def __str__(self):
         return f"{self.id}{':' + self.id_type + ':' + self.id_value if self.id_value else ''} ({self.url})"

+    @property
+    def site_name(self):
+        return self.id_type  # TODO change to localized name
+
     def update_content(self, resource_content):
         self.other_lookup_ids = resource_content.lookup_ids
         self.metadata = resource_content.metadata
@@ -11,4 +11,4 @@ DEFAULT_ITEM_COVER = 'item/default.svg'

 def item_cover_path(resource, filename):
     fn = timezone.now().strftime('%Y/%m/%d/') + str(uuid.uuid4()) + '.' + filename.split('.')[-1]
-    return 'items/' + resource.id_type + '/' + fn
+    return 'item/' + resource.id_type + '/' + fn
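The 'items/' to 'item/' change aligns the upload prefix with DEFAULT_ITEM_COVER = 'item/default.svg' (presumably the motivation for this one-word fix). Reproducing the path construction outside Django, with datetime.now() standing in for timezone.now() and the 'douban' id_type purely illustrative:

import uuid
from datetime import datetime

filename = 'cover.jpg'
fn = datetime.now().strftime('%Y/%m/%d/') + str(uuid.uuid4()) + '.' + filename.split('.')[-1]
print('item/' + 'douban' + '/' + fn)
# e.g. item/douban/2022/12/08/6f9619ff-8b86-4011-b42d-00c04fc964ff.jpg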
@@ -5,10 +5,15 @@ from catalog.sites import *


 class Command(BaseCommand):
-    help = 'Scrape a catalog item from external resource (but not save it)'
+    help = 'Scrape a catalog item from external resource (and save it)'

     def add_arguments(self, parser):
         parser.add_argument('url', type=str, help='URL to scrape')
+        parser.add_argument(
+            '--save',
+            action='store_true',
+            help='save to database',
+        )

     def handle(self, *args, **options):
         url = str(options['url'])
@@ -17,7 +22,13 @@ class Command(BaseCommand):
             self.stdout.write(self.style.ERROR(f'Unknown site for {url}'))
             return
         self.stdout.write(f'Fetching from {site}')
-        resource = site.scrape()
+        if options['save']:
+            resource = site.get_resource_ready()
+            pprint.pp(resource.metadata)
+            pprint.pp(site.get_item())
+            pprint.pp(site.get_item().metadata)
+        else:
+            resource = site.scrape()
+            pprint.pp(resource.metadata)
+            pprint.pp(resource.lookup_ids)
         self.stdout.write(self.style.SUCCESS(f'Done.'))
-        pprint.pp(resource.metadata)
-        pprint.pp(resource.lookup_ids)
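With the new flag the command either dry-runs (scrape and print) or persists through get_resource_ready(). An invocation would look like `python manage.py <command> https://book.douban.com/subject/1000045/ --save`, where the management command's own name is not shown in this diff and the subject URL is illustrative.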
@@ -8,27 +8,8 @@ import logging
 _logger = logging.getLogger(__name__)


-class ScraperMixin:
-    def set_field(self, field, value=None):
-        self.data[field] = value
-
-    def parse_str(self, query):
-        elem = self.html.xpath(query)
-        return elem[0].strip() if elem else None
-
-    def parse_field(self, field, query, error_when_missing=False):
-        elem = self.html.xpath(query)
-        if elem:
-            self.data[field] = elem[0].strip()
-        elif error_when_missing:
-            raise ParseError(self, field)
-        else:
-            self.data[field] = None
-        return elem
-
-
 @SiteList.register
-class DoubanBook(AbstractSite, ScraperMixin):
+class DoubanBook(AbstractSite):
     ID_TYPE = IdType.DoubanBook
     URL_PATTERNS = [r"\w+://book\.douban\.com/subject/(\d+)/{0,1}", r"\w+://m.douban.com/book/subject/(\d+)/{0,1}"]
     WIKI_PROPERTY_ID = '?'
@@ -39,28 +20,34 @@ class DoubanBook(AbstractSite, ScraperMixin):
         return "https://book.douban.com/subject/" + id_value + "/"

     def scrape(self):
-        self.data = {}
-        self.html = DoubanDownloader(self.url).download().html()
-        self.parse_field('title', "/html/body//h1/span/text()")
-        self.parse_field('isbn', "//div[@id='info']//span[text()='ISBN:']/following::text()")
-        # TODO does douban store ASIN as ISBN, need more cleanup if so
-        if not self.data['title']:
-            if self.data['isbn']:
-                self.data['title'] = 'isbn: ' + isbn
-            else:
-                raise ParseError(self, 'title')
-
-        self.parse_field('cover_image_url', "//*[@id='mainpic']/a/img/@src")
-        self.parse_field('brief', "//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()")
-        self.parse_field('series', "//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()")
-        self.parse_field('producer', "//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()")
-        self.parse_field('cubn', "//div[@id='info']//span[text()='统一书号:']/following::text()")
-        self.parse_field('subtitle', "//div[@id='info']//span[text()='副标题:']/following::text()")
-        self.parse_field('orig_title', "//div[@id='info']//span[text()='原作名:']/following::text()")
-        self.parse_field('language', "//div[@id='info']//span[text()='语言:']/following::text()")
-        self.parse_field('pub_house', "//div[@id='info']//span[text()='出版社:']/following::text()")
-        self.parse_field('pub_date', "//div[@id='info']//span[text()='出版年:']/following::text()")
-        year_month_day = RE_NUMBERS.findall(self.data['pub_date']) if self.data['pub_date'] else []
+        content = DoubanDownloader(self.url).download().html()
+
+        isbn_elem = content.xpath("//div[@id='info']//span[text()='ISBN:']/following::text()")
+        isbn = isbn_elem[0].strip() if isbn_elem else None
+
+        title_elem = content.xpath("/html/body//h1/span/text()")
+        title = title_elem[0].strip() if title_elem else f"Unknown Title {self.id_value}"
+
+        subtitle_elem = content.xpath(
+            "//div[@id='info']//span[text()='副标题:']/following::text()")
+        subtitle = subtitle_elem[0].strip()[:500] if subtitle_elem else None
+
+        orig_title_elem = content.xpath(
+            "//div[@id='info']//span[text()='原作名:']/following::text()")
+        orig_title = orig_title_elem[0].strip()[:500] if orig_title_elem else None
+
+        language_elem = content.xpath(
+            "//div[@id='info']//span[text()='语言:']/following::text()")
+        language = language_elem[0].strip() if language_elem else None
+
+        pub_house_elem = content.xpath(
+            "//div[@id='info']//span[text()='出版社:']/following::text()")
+        pub_house = pub_house_elem[0].strip() if pub_house_elem else None
+
+        pub_date_elem = content.xpath(
+            "//div[@id='info']//span[text()='出版年:']/following::text()")
+        pub_date = pub_date_elem[0].strip() if pub_date_elem else ''
+        year_month_day = RE_NUMBERS.findall(pub_date)
         if len(year_month_day) in (2, 3):
             pub_year = int(year_month_day[0])
             pub_month = int(year_month_day[1])
@@ -77,33 +64,51 @@ class DoubanBook(AbstractSite, ScraperMixin):
             pub_month = None if pub_month is not None and pub_month not in range(
                 1, 12) else pub_month

-        self.parse_field('binding', "//div[@id='info']//span[text()='装帧:']/following::text()")
-        self.parse_field('price', "//div[@id='info']//span[text()='定价:']/following::text()")
-        self.parse_field('pages', "//div[@id='info']//span[text()='页数:']/following::text()")
-        if self.data['pages'] is not None:
-            self.data['pages'] = int(RE_NUMBERS.findall(self.data['pages'])[0]) if RE_NUMBERS.findall(self.data['pages']) else None
-            if self.data['pages'] and (self.data['pages'] > 999999 or self.data['pages'] < 1):
-                self.data['pages'] = None
+        binding_elem = content.xpath(
+            "//div[@id='info']//span[text()='装帧:']/following::text()")
+        binding = binding_elem[0].strip() if binding_elem else None
+
+        price_elem = content.xpath(
+            "//div[@id='info']//span[text()='定价:']/following::text()")
+        price = price_elem[0].strip() if price_elem else None
+
+        pages_elem = content.xpath(
+            "//div[@id='info']//span[text()='页数:']/following::text()")
+        pages = pages_elem[0].strip() if pages_elem else None
+        if pages is not None:
+            pages = int(RE_NUMBERS.findall(pages)[
+                        0]) if RE_NUMBERS.findall(pages) else None
+            if pages and (pages > 999999 or pages < 1):
+                pages = None
+
+        brief_elem = content.xpath(
+            "//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()")
+        brief = '\n'.join(p.strip()
+                          for p in brief_elem) if brief_elem else None

         contents = None
         try:
-            contents_elem = self.html.xpath(
+            contents_elem = content.xpath(
                 "//h2/span[text()='目录']/../following-sibling::div[1]")[0]
             # if next the id of next sibling contains `dir`, that would be the full contents
             if "dir" in contents_elem.getnext().xpath("@id")[0]:
                 contents_elem = contents_elem.getnext()
-                contents = '\n'.join(p.strip() for p in contents_elem.xpath("text()")[:-2]) if len(contents_elem) else None
+                contents = '\n'.join(p.strip() for p in contents_elem.xpath(
+                    "text()")[:-2]) if contents_elem is not None else None
             else:
-                contents = '\n'.join(p.strip() for p in contents_elem.xpath("text()")) if len(contents_elem) else None
+                contents = '\n'.join(p.strip() for p in contents_elem.xpath(
+                    "text()")) if contents_elem is not None else None
         except Exception:
             pass
-        self.data['contents'] = contents
+
+        img_url_elem = content.xpath("//*[@id='mainpic']/a/img/@src")
+        img_url = img_url_elem[0].strip() if img_url_elem else None

         # there are two html formats for authors and translators
-        authors_elem = self.html.xpath("""//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/
+        authors_elem = content.xpath("""//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/
             preceding-sibling::a[preceding-sibling::span[text()='作者:']]/text()""")
         if not authors_elem:
-            authors_elem = self.html.xpath(
+            authors_elem = content.xpath(
                 """//div[@id='info']//span[text()=' 作者']/following-sibling::a/text()""")
         if authors_elem:
             authors = []
@@ -111,12 +116,11 @@ class DoubanBook(AbstractSite, ScraperMixin):
                 authors.append(RE_WHITESPACES.sub(' ', author.strip())[:200])
         else:
             authors = None
-        self.data['authors'] = authors

-        translators_elem = self.html.xpath("""//div[@id='info']//span[text()='译者:']/following-sibling::br[1]/
+        translators_elem = content.xpath("""//div[@id='info']//span[text()='译者:']/following-sibling::br[1]/
             preceding-sibling::a[preceding-sibling::span[text()='译者:']]/text()""")
         if not translators_elem:
-            translators_elem = self.html.xpath(
+            translators_elem = content.xpath(
                 """//div[@id='info']//span[text()=' 译者']/following-sibling::a/text()""")
         if translators_elem:
             translators = []
@@ -124,28 +128,56 @@ class DoubanBook(AbstractSite, ScraperMixin):
                 translators.append(RE_WHITESPACES.sub(' ', translator.strip()))
         else:
             translators = None
-        self.data['translators'] = translators

-        work_link = self.parse_str('//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href')
-        if work_link:
-            r = re.match(r'\w+://book.douban.com/works/(\d+)', work_link)
-            self.data['required_resources'] = [{
+        cncode_elem = content.xpath(
+            "//div[@id='info']//span[text()='统一书号:']/following::text()")
+        cubn = cncode_elem[0].strip() if cncode_elem else None
+
+        series_elem = content.xpath(
+            "//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()")
+        series = series_elem[0].strip() if series_elem else None
+
+        imprint_elem = content.xpath(
+            "//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()")
+        producer = imprint_elem[0].strip() if imprint_elem else None
+
+        data = {
+            'title': title,
+            'subtitle': subtitle,
+            'orig_title': orig_title,
+            'author': authors,
+            'translator': translators,
+            'language': language,
+            'pub_house': pub_house,
+            'pub_year': pub_year,
+            'pub_month': pub_month,
+            'binding': binding,
+            'price': price,
+            'pages': pages,
+            'isbn': isbn,
+            'cubn': cubn,
+            'brief': brief,
+            'contents': contents,
+            'series': series,
+            'producer': producer,
+            'cover_image_url': img_url,
+        }
+
+        works_element = content.xpath('//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href')
+        if works_element:
+            r = re.match(r'\w+://book.douban.com/works/(\d+)', works_element[0])
+            data['required_resources'] = [{
                 'model': 'Work',
                 'id_type': IdType.DoubanBook_Work,
                 'id_value': r[1] if r else None,
-                'title': self.data['title'],
-                'url': work_link,
+                'title': data['title'],
+                'url': works_element[0],
             }]
-        pd = ResourceContent(metadata=self.data)
-        pd.lookup_ids[IdType.ISBN] = self.data.get('isbn')
-        pd.lookup_ids[IdType.CUBN] = self.data.get('cubn')
-        if self.data["cover_image_url"]:
-            imgdl = BasicImageDownloader(self.data["cover_image_url"], self.url)
-            try:
-                pd.cover_image = imgdl.download().content
-                pd.cover_image_extention = imgdl.extention
-            except Exception:
-                _logger.debug(f'failed to download cover for {self.url} from {self.data["cover_image_url"]}')
+
+        pd = ResourceContent(metadata=data)
+        pd.lookup_ids[IdType.ISBN] = isbn
+        pd.lookup_ids[IdType.CUBN] = cubn
+        pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(img_url, self.url)
         return pd
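The rewritten scrape() drops the ScraperMixin helpers in favor of one inline pattern repeated per field: xpath() returns a list, so `elem[0].strip() if elem else None` is the null-safe way to take the first matching text node. The same pattern on a tiny standalone document, using lxml directly:

from lxml import html

content = html.fromstring(
    "<div id='info'><span>出版社:</span> 人民文学出版社</div>")
pub_house_elem = content.xpath(
    "//div[@id='info']//span[text()='出版社:']/following::text()")
pub_house = pub_house_elem[0].strip() if pub_house_elem else None
assert pub_house == '人民文学出版社'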
@@ -14,8 +14,9 @@ from django.core.validators import MaxValueValidator, MinValueValidator
 from django.utils.translation import gettext_lazy as _
 from django.core.validators import RegexValidator
 from functools import cached_property
-from django.db.models import Count
+from django.db.models import Count, Avg
 import django.dispatch
+import math


 class Piece(PolymorphicModel, UserOwnedObjectMixin):
@@ -29,7 +30,7 @@ class Piece(PolymorphicModel, UserOwnedObjectMixin):


 class Content(SoftDeleteMixin, Piece):
-    item: models.ForeignKey(Item, on_delete=models.PROTECT)
+    item = models.ForeignKey(Item, on_delete=models.PROTECT)

     def __str__(self):
        return f"{self.id}({self.item})"
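The Content.item fix is a one-character bug worth noting: with a colon the line is only a type annotation, so Django never created a ForeignKey column; with `=` it is a real class attribute. The difference in plain Python:

class WithAssignment:
    x = 42          # class attribute exists

class OnlyAnnotation:
    x: int          # annotation only; no attribute is created

assert hasattr(WithAssignment, 'x')
assert not hasattr(OnlyAnnotation, 'x')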
@@ -50,6 +51,22 @@ class Rating(Content):
     grade = models.IntegerField(default=0, validators=[MaxValueValidator(10), MinValueValidator(0)])


+class RatingManager:
+    @staticmethod
+    def get_rating_for_item(item):
+        stat = Rating.objects.filter(item=item).aggregate(average=Avg('grade'), count=Count('item'))
+        return math.ceil(stat['average']) if stat['count'] >= 5 else 0
+
+    @staticmethod
+    def get_rating_count_for_item(item):
+        stat = Rating.objects.filter(item=item).aggregate(count=Count('item'))
+        return stat['count']
+
+
+Item.rating = property(RatingManager.get_rating_for_item)
+Item.rating_count = property(RatingManager.get_rating_count_for_item)
+
+
 class Reply(Content):
     reply_to_content = models.ForeignKey(Content, on_delete=models.PROTECT, related_name='replies')
     title = models.CharField(max_length=500, null=True)
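The property assignments at the end are the interesting part of this hunk: the journal app attaches computed rating attributes onto the catalog's Item from outside, so catalog never imports journal, and the 5-rating minimum keeps small samples from showing a score. A minimal standalone demonstration of the technique (the constant 7 stands in for the real Avg/Count aggregate):

# properties are ordinary class attributes, so one module can decorate
# another module's class after the fact without subclassing or mixins
class ItemSketch:
    pass

def get_rating_for_item(item):
    return 7  # stand-in for math.ceil(Avg('grade')) gated on Count >= 5

ItemSketch.rating = property(get_rating_for_item)

assert ItemSketch().rating == 7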