From 486dd16e1ff93ae34513645ea1858267e908bd89 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Wed, 14 Dec 2022 21:12:37 -0500
Subject: [PATCH] new data model: add legacy fields for Book to Edition as
 virtual fields

---
 catalog/book/models.py             |  39 ++++++-
 catalog/common/downloaders.py      |   2 +-
 catalog/common/models.py           |  17 ++-
 catalog/common/utils.py            |   2 +-
 catalog/management/commands/cat.py |  19 ++-
 catalog/sites/douban_book.py       | 182 +++++++++++++++++------
 journal/models.py                  |  21 +++-
 7 files changed, 190 insertions(+), 92 deletions(-)

diff --git a/catalog/book/models.py b/catalog/book/models.py
index 2d9a9d86..8c0390d6 100644
--- a/catalog/book/models.py
+++ b/catalog/book/models.py
@@ -31,13 +31,40 @@ class Edition(Item):
     cubn = PrimaryLookupIdDescriptor(IdType.CUBN)
     # douban_book = LookupIdDescriptor(IdType.DoubanBook)
     # goodreads = LookupIdDescriptor(IdType.Goodreads)
-    languages = jsondata.ArrayField(_("语言"), null=True, blank=True, default=list)
-    publish_year = jsondata.IntegerField(_("发表年份"), null=True, blank=True)
-    publish_month = jsondata.IntegerField(_("发表月份"), null=True, blank=True)
+
+    METADATA_COPY_LIST = [
+        'title',
+        'brief',
+        # legacy fields
+        'subtitle',
+        'orig_title',
+        'author',
+        'translator',
+        'language',
+        'pub_house',
+        'pub_year',
+        'pub_month',
+        'binding',
+        'price',
+        'pages',
+        'contents',
+        'series',
+        'producer',
+    ]
+    subtitle = jsondata.CharField(null=True, blank=True, default=None)
+    orig_title = jsondata.CharField(null=True, blank=True, default=None)
+    author = jsondata.ArrayField(_('作者'), null=False, blank=False, default=list)
+    translator = jsondata.ArrayField(_('译者'), null=True, blank=True, default=list)
+    language = jsondata.ArrayField(_("语言"), null=True, blank=True, default=list)
+    pub_house = jsondata.ArrayField(_('出版方'), null=True, blank=True, default=list)
+    pub_year = jsondata.IntegerField(_("发表年份"), null=True, blank=True)
+    pub_month = jsondata.IntegerField(_("发表月份"), null=True, blank=True)
+    binding = jsondata.CharField(null=True, blank=True, default=None)
     pages = jsondata.IntegerField(blank=True, default=None)
-    authors = jsondata.ArrayField(_('作者'), null=False, blank=False, default=list)
-    translaters = jsondata.ArrayField(_('译者'), null=True, blank=True, default=list)
-    publishers = jsondata.ArrayField(_('出版方'), null=True, blank=True, default=list)
+    series = jsondata.CharField(null=True, blank=True, default=None)
+    contents = jsondata.CharField(null=True, blank=True, default=None)
+    price = jsondata.CharField(_("定价"), null=True, blank=True, default=None)
+    producer = jsondata.CharField(_("出品方"), null=True, blank=True, default=None)
 
     @property
     def isbn10(self):

diff --git a/catalog/common/downloaders.py b/catalog/common/downloaders.py
index f7d1ed83..31e7818e 100644
--- a/catalog/common/downloaders.py
+++ b/catalog/common/downloaders.py
@@ -235,7 +235,7 @@ class MockResponse:
         return json.load(StringIO(self.text))
 
     def html(self):
-        return html.fromstring(self.text)  # may throw exception unexpectedly due to OS bug
+        return html.fromstring(self.text)  # may throw exception unexpectedly due to OS bug, see https://github.com/neodb-social/neodb/issues/5
 
     @property
     def headers(self):

diff --git a/catalog/common/models.py b/catalog/common/models.py
index 46b3a9d3..f608d304 100644
--- a/catalog/common/models.py
+++ b/catalog/common/models.py
@@ -184,7 +184,7 @@ class Item(PolymorphicModel, SoftDeleteMixin):
         self.primary_lookup_id_type = None
 
     def __str__(self):
-        return f"{self.id}{' ' + self.primary_lookup_id_type + ':' + self.primary_lookup_id_value if self.primary_lookup_id_value else ''} ({self.title})"
+        return f"{self.id}|{self.url_id}{' ' + self.primary_lookup_id_type + ':' + self.primary_lookup_id_value if self.primary_lookup_id_value else ''} ({self.title})"
 
     @classmethod
     def get_best_lookup_id(cls, lookup_ids):
@@ -210,9 +210,13 @@ class Item(PolymorphicModel, SoftDeleteMixin):
         else:
             self.merged_to_item = to_item
 
+    @property
+    def url_id(self):
+        return base62.encode(self.uid.int)
+
     @property
     def url(self):
-        return f'/{self.url_path}/{base62.encode(self.uid.int)}'
+        return f'/{self.url_path}/{self.url_id}'
 
     @classmethod
     def get_by_url(cls, url_or_b62):
@@ -236,6 +240,9 @@ class Item(PolymorphicModel, SoftDeleteMixin):
     def copy_metadata(cls, metadata):
         return dict((k, v) for k, v in metadata.items() if k in cls.METADATA_COPY_LIST and v is not None)
 
+    def has_cover(self):
+        return self.cover and self.cover != DEFAULT_ITEM_COVER
+
     def merge_data_from_external_resources(self):
         """Subclass may override this"""
         lookup_ids = []
@@ -245,7 +252,7 @@ class Item(PolymorphicModel, SoftDeleteMixin):
             for k in self.METADATA_COPY_LIST:
                 if not getattr(self, k) and p.metadata.get(k):
                     setattr(self, k, p.metadata.get(k))
-            if not self.cover and p.cover:
+            if not self.has_cover() and p.cover:
                 self.cover = p.cover
         self.update_lookup_ids(lookup_ids)
 
@@ -284,6 +291,10 @@ class ExternalResource(models.Model):
     def __str__(self):
         return f"{self.id}{':' + self.id_type + ':' + self.id_value if self.id_value else ''} ({self.url})"
 
+    @property
+    def site_name(self):
+        return self.id_type  # TODO change to localized name
+
     def update_content(self, resource_content):
         self.other_lookup_ids = resource_content.lookup_ids
         self.metadata = resource_content.metadata

diff --git a/catalog/common/utils.py b/catalog/common/utils.py
index 39b115a9..5bfc82c4 100644
--- a/catalog/common/utils.py
+++ b/catalog/common/utils.py
@@ -11,4 +11,4 @@ DEFAULT_ITEM_COVER = 'item/default.svg'
 
 def item_cover_path(resource, filename):
     fn = timezone.now().strftime('%Y/%m/%d/') + str(uuid.uuid4()) + '.' + filename.split('.')[-1]
-    return 'items/' + resource.id_type + '/' + fn
+    return 'item/' + resource.id_type + '/' + fn

diff --git a/catalog/management/commands/cat.py b/catalog/management/commands/cat.py
index 1854c08b..9d714693 100644
--- a/catalog/management/commands/cat.py
+++ b/catalog/management/commands/cat.py
@@ -5,10 +5,15 @@ from catalog.sites import *
 
 
 class Command(BaseCommand):
-    help = 'Scrape a catalog item from external resource (but not save it)'
+    help = 'Scrape a catalog item from external resource (and optionally save it)'
 
     def add_arguments(self, parser):
         parser.add_argument('url', type=str, help='URL to scrape')
+        parser.add_argument(
+            '--save',
+            action='store_true',
+            help='save to database',
+        )
 
     def handle(self, *args, **options):
         url = str(options['url'])
@@ -17,7 +22,13 @@ class Command(BaseCommand):
             self.stdout.write(self.style.ERROR(f'Unknown site for {url}'))
             return
         self.stdout.write(f'Fetching from {site}')
-        resource = site.scrape()
+        if options['save']:
+            resource = site.get_resource_ready()
+            pprint.pp(resource.metadata)
+            pprint.pp(site.get_item())
+            pprint.pp(site.get_item().metadata)
+        else:
+            resource = site.scrape()
+            pprint.pp(resource.metadata)
+            pprint.pp(resource.lookup_ids)
         self.stdout.write(self.style.SUCCESS(f'Done.'))
-        pprint.pp(resource.metadata)
-        pprint.pp(resource.lookup_ids)

diff --git a/catalog/sites/douban_book.py b/catalog/sites/douban_book.py
index 021857c5..19aeecaf 100644
--- a/catalog/sites/douban_book.py
+++ b/catalog/sites/douban_book.py
@@ -8,27 +8,8 @@ import logging
 _logger = logging.getLogger(__name__)
 
 
-class ScraperMixin:
-    def set_field(self, field, value=None):
-        self.data[field] = value
-
-    def parse_str(self, query):
-        elem = self.html.xpath(query)
-        return elem[0].strip() if elem else None
-
-    def parse_field(self, field, query, error_when_missing=False):
-        elem = self.html.xpath(query)
-        if elem:
-            self.data[field] = elem[0].strip()
-        elif error_when_missing:
-            raise ParseError(self, field)
-        else:
-            self.data[field] = None
-        return elem
-
-
 @SiteList.register
-class DoubanBook(AbstractSite, ScraperMixin):
+class DoubanBook(AbstractSite):
     ID_TYPE = IdType.DoubanBook
     URL_PATTERNS = [r"\w+://book\.douban\.com/subject/(\d+)/{0,1}", r"\w+://m.douban.com/book/subject/(\d+)/{0,1}"]
     WIKI_PROPERTY_ID = '?'
@@ -39,28 +20,34 @@ class DoubanBook(AbstractSite, ScraperMixin):
         return "https://book.douban.com/subject/" + id_value + "/"
 
     def scrape(self):
-        self.data = {}
-        self.html = DoubanDownloader(self.url).download().html()
-        self.parse_field('title', "/html/body//h1/span/text()")
-        self.parse_field('isbn', "//div[@id='info']//span[text()='ISBN:']/following::text()")
-        # TODO does douban store ASIN as ISBN, need more cleanup if so
-        if not self.data['title']:
-            if self.data['isbn']:
-                self.data['title'] = 'isbn: ' + isbn
-            else:
-                raise ParseError(self, 'title')
+        content = DoubanDownloader(self.url).download().html()
 
-        self.parse_field('cover_image_url', "//*[@id='mainpic']/a/img/@src")
-        self.parse_field('brief', "//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()")
-        self.parse_field('series', "//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()")
-        self.parse_field('producer', "//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()")
-        self.parse_field('cubn', "//div[@id='info']//span[text()='统一书号:']/following::text()")
-        self.parse_field('subtitle', "//div[@id='info']//span[text()='副标题:']/following::text()")
-        self.parse_field('orig_title', "//div[@id='info']//span[text()='原作名:']/following::text()")
-        self.parse_field('language', "//div[@id='info']//span[text()='语言:']/following::text()")
-        self.parse_field('pub_house', "//div[@id='info']//span[text()='出版社:']/following::text()")
-        self.parse_field('pub_date', "//div[@id='info']//span[text()='出版年:']/following::text()")
-        year_month_day = RE_NUMBERS.findall(self.data['pub_date']) if self.data['pub_date'] else []
+        isbn_elem = content.xpath("//div[@id='info']//span[text()='ISBN:']/following::text()")
+        isbn = isbn_elem[0].strip() if isbn_elem else None
+
+        title_elem = content.xpath("/html/body//h1/span/text()")
+        title = title_elem[0].strip() if title_elem else f"Unknown Title {self.id_value}"
+
+        subtitle_elem = content.xpath(
+            "//div[@id='info']//span[text()='副标题:']/following::text()")
+        subtitle = subtitle_elem[0].strip()[:500] if subtitle_elem else None
+
+        orig_title_elem = content.xpath(
+            "//div[@id='info']//span[text()='原作名:']/following::text()")
+        orig_title = orig_title_elem[0].strip()[:500] if orig_title_elem else None
+
+        language_elem = content.xpath(
+            "//div[@id='info']//span[text()='语言:']/following::text()")
+        language = language_elem[0].strip() if language_elem else None
+
+        pub_house_elem = content.xpath(
+            "//div[@id='info']//span[text()='出版社:']/following::text()")
+        pub_house = pub_house_elem[0].strip() if pub_house_elem else None
+
+        pub_date_elem = content.xpath(
+            "//div[@id='info']//span[text()='出版年:']/following::text()")
+        pub_date = pub_date_elem[0].strip() if pub_date_elem else ''
+        year_month_day = RE_NUMBERS.findall(pub_date)
         if len(year_month_day) in (2, 3):
             pub_year = int(year_month_day[0])
             pub_month = int(year_month_day[1])
@@ -77,33 +64,51 @@ class DoubanBook(AbstractSite, ScraperMixin):
         pub_month = None if pub_month is not None and pub_month not in range(
             1, 12) else pub_month
 
-        self.parse_field('binding', "//div[@id='info']//span[text()='装帧:']/following::text()")
-        self.parse_field('price', "//div[@id='info']//span[text()='定价:']/following::text()")
-        self.parse_field('pages', "//div[@id='info']//span[text()='页数:']/following::text()")
-        if self.data['pages'] is not None:
-            self.data['pages'] = int(RE_NUMBERS.findall(self.data['pages'])[0]) if RE_NUMBERS.findall(self.data['pages']) else None
-        if self.data['pages'] and (self.data['pages'] > 999999 or self.data['pages'] < 1):
-            self.data['pages'] = None
+        binding_elem = content.xpath(
+            "//div[@id='info']//span[text()='装帧:']/following::text()")
+        binding = binding_elem[0].strip() if binding_elem else None
+
+        price_elem = content.xpath(
+            "//div[@id='info']//span[text()='定价:']/following::text()")
+        price = price_elem[0].strip() if price_elem else None
+
+        pages_elem = content.xpath(
+            "//div[@id='info']//span[text()='页数:']/following::text()")
+        pages = pages_elem[0].strip() if pages_elem else None
+        if pages is not None:
+            pages = int(RE_NUMBERS.findall(pages)[0]) if RE_NUMBERS.findall(pages) else None
+        if pages and (pages > 999999 or pages < 1):
+            pages = None
+
+        brief_elem = content.xpath(
+            "//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()")
+        brief = '\n'.join(p.strip() for p in brief_elem) if brief_elem else None
 
         contents = None
         try:
-            contents_elem = self.html.xpath(
+            contents_elem = content.xpath(
                 "//h2/span[text()='目录']/../following-sibling::div[1]")[0]
             # if next the id of next sibling contains `dir`, that would be the full contents
             if "dir" in contents_elem.getnext().xpath("@id")[0]:
                 contents_elem = contents_elem.getnext()
-                contents = '\n'.join(p.strip() for p in contents_elem.xpath("text()")[:-2]) if len(contents_elem) else None
+                contents = '\n'.join(p.strip() for p in contents_elem.xpath("text()")[:-2]) if contents_elem is not None else None
             else:
-                contents = '\n'.join(p.strip() for p in contents_elem.xpath("text()")) if len(contents_elem) else None
+                contents = '\n'.join(p.strip() for p in contents_elem.xpath("text()")) if contents_elem is not None else None
         except Exception:
             pass
-        self.data['contents'] = contents
+
+        img_url_elem = content.xpath("//*[@id='mainpic']/a/img/@src")
+        img_url = img_url_elem[0].strip() if img_url_elem else None
 
         # there are two html formats for authors and translators
-        authors_elem = self.html.xpath("""//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/
+        authors_elem = content.xpath("""//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/
             preceding-sibling::a[preceding-sibling::span[text()='作者:']]/text()""")
         if not authors_elem:
-            authors_elem = self.html.xpath(
+            authors_elem = content.xpath(
                 """//div[@id='info']//span[text()=' 作者']/following-sibling::a/text()""")
         if authors_elem:
             authors = []
@@ -111,12 +116,11 @@ class DoubanBook(AbstractSite, ScraperMixin):
                 authors.append(RE_WHITESPACES.sub(' ', author.strip())[:200])
         else:
             authors = None
-        self.data['authors'] = authors
 
-        translators_elem = self.html.xpath("""//div[@id='info']//span[text()='译者:']/following-sibling::br[1]/
+        translators_elem = content.xpath("""//div[@id='info']//span[text()='译者:']/following-sibling::br[1]/
             preceding-sibling::a[preceding-sibling::span[text()='译者:']]/text()""")
         if not translators_elem:
-            translators_elem = self.html.xpath(
+            translators_elem = content.xpath(
                 """//div[@id='info']//span[text()=' 译者']/following-sibling::a/text()""")
         if translators_elem:
             translators = []
@@ -124,28 +128,56 @@ class DoubanBook(AbstractSite, ScraperMixin):
                 translators.append(RE_WHITESPACES.sub(' ', translator.strip()))
         else:
             translators = None
-        self.data['translators'] = translators
 
-        work_link = self.parse_str('//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href')
-        if work_link:
-            r = re.match(r'\w+://book.douban.com/works/(\d+)', work_link)
-            self.data['required_resources'] = [{
+        cncode_elem = content.xpath(
+            "//div[@id='info']//span[text()='统一书号:']/following::text()")
+        cubn = cncode_elem[0].strip() if cncode_elem else None
+
+        series_elem = content.xpath(
+            "//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()")
+        series = series_elem[0].strip() if series_elem else None
+
+        imprint_elem = content.xpath(
+            "//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()")
+        producer = imprint_elem[0].strip() if imprint_elem else None
+
+        data = {
+            'title': title,
+            'subtitle': subtitle,
+            'orig_title': orig_title,
+            'author': authors,
+            'translator': translators,
+            'language': language,
+            'pub_house': pub_house,
+            'pub_year': pub_year,
+            'pub_month': pub_month,
+            'binding': binding,
+            'price': price,
+            'pages': pages,
+            'isbn': isbn,
+            'cubn': cubn,
+            'brief': brief,
+            'contents': contents,
+            'series': series,
+            'producer': producer,
+            'cover_image_url': img_url,
+        }
+
+        works_element = content.xpath('//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href')
+        if works_element:
+            r = re.match(r'\w+://book.douban.com/works/(\d+)', works_element[0])
+            data['required_resources'] = [{
                 'model': 'Work',
-                'id_type': IdType.DoubanBook_Work, 
+                'id_type': IdType.DoubanBook_Work,
                 'id_value': r[1] if r else None,
-                'title': self.data['title'],
-                'url': work_link,
+                'title': data['title'],
+                'url': works_element[0],
             }]
-        pd = ResourceContent(metadata=self.data)
-        pd.lookup_ids[IdType.ISBN] = self.data.get('isbn')
-        pd.lookup_ids[IdType.CUBN] = self.data.get('cubn')
-        if self.data["cover_image_url"]:
-            imgdl = BasicImageDownloader(self.data["cover_image_url"], self.url)
-            try:
-                pd.cover_image = imgdl.download().content
-                pd.cover_image_extention = imgdl.extention
-            except Exception:
-                _logger.debug(f'failed to download cover for {self.url} from {self.data["cover_image_url"]}')
+
+        pd = ResourceContent(metadata=data)
+        pd.lookup_ids[IdType.ISBN] = isbn
+        pd.lookup_ids[IdType.CUBN] = cubn
+        pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(img_url, self.url)
         return pd

diff --git a/journal/models.py b/journal/models.py
index 7839a2c6..8aad9964 100644
--- a/journal/models.py
+++ b/journal/models.py
@@ -14,8 +14,9 @@ from django.core.validators import MaxValueValidator, MinValueValidator
 from django.utils.translation import gettext_lazy as _
 from django.core.validators import RegexValidator
 from functools import cached_property
-from django.db.models import Count
+from django.db.models import Count, Avg
 import django.dispatch
+import math
 
 
 class Piece(PolymorphicModel, UserOwnedObjectMixin):
@@ -29,7 +30,7 @@ class Piece(PolymorphicModel, UserOwnedObjectMixin):
 
 
 class Content(SoftDeleteMixin, Piece):
-    item: models.ForeignKey(Item, on_delete=models.PROTECT)
+    item = models.ForeignKey(Item, on_delete=models.PROTECT)
 
     def __str__(self):
         return f"{self.id}({self.item})"
@@ -50,6 +51,22 @@ class Rating(Content):
     grade = models.IntegerField(default=0, validators=[MaxValueValidator(10), MinValueValidator(0)])
 
 
+class RatingManager:
+    @staticmethod
+    def get_rating_for_item(item):
+        stat = Rating.objects.filter(item=item).aggregate(average=Avg('grade'), count=Count('item'))
+        return math.ceil(stat['average']) if stat['count'] >= 5 else 0
+
+    @staticmethod
+    def get_rating_count_for_item(item):
+        stat = Rating.objects.filter(item=item).aggregate(count=Count('item'))
+        return stat['count']
+
+
+Item.rating = property(RatingManager.get_rating_for_item)
+Item.rating_count = property(RatingManager.get_rating_count_for_item)
+
+
 class Reply(Content):
     reply_to_content = models.ForeignKey(Content, on_delete=models.PROTECT, related_name='replies')
     title = models.CharField(max_length=500, null=True)
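
Usage note (editor's addition, not part of the commit): the patch exposes the
legacy Book fields as virtual jsondata fields on Edition and monkey-patches
`Item.rating` / `Item.rating_count`, so callers read both like ordinary model
attributes. A minimal sketch of the intended read path, assuming a configured
Django project with this patch applied; the import paths, the IdType lookup,
and the ISBN value below are illustrative assumptions, not taken from the diff:

    # sketch only -- assumes Django settings are configured and migrations applied
    from catalog.book.models import Edition   # assumed module path
    from catalog.common.models import IdType  # assumed module path

    book = Edition.objects.filter(
        primary_lookup_id_type=IdType.ISBN,
        primary_lookup_id_value='9787020002207',  # placeholder ISBN
    ).first()
    if book:
        print(book.author, book.pub_house, book.pub_year)  # virtual fields backed by metadata
        print(book.rating_count)  # Count of Rating rows for this item
        print(book.rating)        # ceil(Avg(grade)) once rating_count >= 5, else 0

The reworked cat command follows the same split as the sites API: `scrape()`
only fetches and parses, while `get_resource_ready()` also persists the
resource and its item (the subject id below is a placeholder):

    python3 manage.py cat https://book.douban.com/subject/1000000/         # fetch and print
    python3 manage.py cat https://book.douban.com/subject/1000000/ --save  # fetch and persist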