new data model: add legacy Book fields to Edition as virtual fields

This commit is contained in:
Your Name 2022-12-14 21:12:37 -05:00
parent 3d6eeab70b
commit 486dd16e1f
7 changed files with 190 additions and 92 deletions

View file

@ -31,13 +31,40 @@ class Edition(Item):
cubn = PrimaryLookupIdDescriptor(IdType.CUBN)
# douban_book = LookupIdDescriptor(IdType.DoubanBook)
# goodreads = LookupIdDescriptor(IdType.Goodreads)
languages = jsondata.ArrayField(_("语言"), null=True, blank=True, default=list)
publish_year = jsondata.IntegerField(_("发表年份"), null=True, blank=True)
publish_month = jsondata.IntegerField(_("发表月份"), null=True, blank=True)
METADATA_COPY_LIST = [
'title',
'brief',
# legacy fields
'subtitle',
'orig_title',
'author',
'translator',
'language',
'pub_house',
'pub_year',
'pub_month',
'binding',
'price',
'pages',
'contents',
'series',
'producer',
]
subtitle = jsondata.CharField(null=True, blank=True, default=None)
orig_title = jsondata.CharField(null=True, blank=True, default=None)
author = jsondata.ArrayField(_('作者'), null=False, blank=False, default=list)
translator = jsondata.ArrayField(_('译者'), null=True, blank=True, default=list)
language = jsondata.ArrayField(_("语言"), null=True, blank=True, default=list)
pub_house = jsondata.ArrayField(_('出版方'), null=True, blank=True, default=list)
pub_year = jsondata.IntegerField(_("发表年份"), null=True, blank=True)
pub_month = jsondata.IntegerField(_("发表月份"), null=True, blank=True)
binding = jsondata.CharField(null=True, blank=True, default=None)
pages = jsondata.IntegerField(blank=True, default=None)
authors = jsondata.ArrayField(_('作者'), null=False, blank=False, default=list)
translaters = jsondata.ArrayField(_('译者'), null=True, blank=True, default=list)
publishers = jsondata.ArrayField(_('出版方'), null=True, blank=True, default=list)
series = jsondata.CharField(null=True, blank=True, default=None)
contents = jsondata.CharField(null=True, blank=True, default=None)
price = jsondata.FloatField(_("发表月份"), null=True, blank=True)
producer = jsondata.FloatField(_("发表月份"), null=True, blank=True)
@property
def isbn10(self):

View file

@ -235,7 +235,7 @@ class MockResponse:
return json.load(StringIO(self.text))
def html(self):
return html.fromstring(self.text) # may throw exception unexpectedly due to OS bug
return html.fromstring(self.text) # may throw exception unexpectedly due to OS bug, see https://github.com/neodb-social/neodb/issues/5
@property
def headers(self):

View file

@ -184,7 +184,7 @@ class Item(PolymorphicModel, SoftDeleteMixin):
self.primary_lookup_id_type = None
def __str__(self):
return f"{self.id}{' ' + self.primary_lookup_id_type + ':' + self.primary_lookup_id_value if self.primary_lookup_id_value else ''} ({self.title})"
return f"{self.id}|{self.url_id}{' ' + self.primary_lookup_id_type + ':' + self.primary_lookup_id_value if self.primary_lookup_id_value else ''} ({self.title})"
@classmethod
def get_best_lookup_id(cls, lookup_ids):
@ -210,9 +210,13 @@ class Item(PolymorphicModel, SoftDeleteMixin):
else:
self.merged_to_item = to_item
@property
def url_id(self):
return base62.encode(self.uid.int)
@property
def url(self):
return f'/{self.url_path}/{base62.encode(self.uid.int)}'
return f'/{self.url_path}/{self.url_id}'
@classmethod
def get_by_url(cls, url_or_b62):
@ -236,6 +240,9 @@ class Item(PolymorphicModel, SoftDeleteMixin):
def copy_metadata(cls, metadata):
return dict((k, v) for k, v in metadata.items() if k in cls.METADATA_COPY_LIST and v is not None)
def has_cover(self):
return self.cover and self.cover != DEFAULT_ITEM_COVER
def merge_data_from_external_resources(self):
"""Subclass may override this"""
lookup_ids = []
@ -245,7 +252,7 @@ class Item(PolymorphicModel, SoftDeleteMixin):
for k in self.METADATA_COPY_LIST:
if not getattr(self, k) and p.metadata.get(k):
setattr(self, k, p.metadata.get(k))
if not self.cover and p.cover:
if not self.has_cover() and p.cover:
self.cover = p.cover
self.update_lookup_ids(lookup_ids)
@ -284,6 +291,10 @@ class ExternalResource(models.Model):
def __str__(self):
return f"{self.id}{':' + self.id_type + ':' + self.id_value if self.id_value else ''} ({self.url})"
@property
def site_name(self):
return self.id_type # TODO change to localized name
def update_content(self, resource_content):
self.other_lookup_ids = resource_content.lookup_ids
self.metadata = resource_content.metadata

View file

@ -11,4 +11,4 @@ DEFAULT_ITEM_COVER = 'item/default.svg'
def item_cover_path(resource, filename):
fn = timezone.now().strftime('%Y/%m/%d/') + str(uuid.uuid4()) + '.' + filename.split('.')[-1]
return 'items/' + resource.id_type + '/' + fn
return 'item/' + resource.id_type + '/' + fn

View file

@ -5,10 +5,15 @@ from catalog.sites import *
class Command(BaseCommand):
help = 'Scrape a catalog item from external resource (but not save it)'
help = 'Scrape a catalog item from external resource (and save it)'
def add_arguments(self, parser):
parser.add_argument('url', type=str, help='URL to scrape')
parser.add_argument(
'--save',
action='store_true',
help='save to database',
)
def handle(self, *args, **options):
url = str(options['url'])
@ -17,7 +22,13 @@ class Command(BaseCommand):
self.stdout.write(self.style.ERROR(f'Unknown site for {url}'))
return
self.stdout.write(f'Fetching from {site}')
resource = site.scrape()
if options['save']:
resource = site.get_resource_ready()
pprint.pp(resource.metadata)
pprint.pp(site.get_item())
pprint.pp(site.get_item().metadata)
else:
resource = site.scrape()
pprint.pp(resource.metadata)
pprint.pp(resource.lookup_ids)
self.stdout.write(self.style.SUCCESS(f'Done.'))
pprint.pp(resource.metadata)
pprint.pp(resource.lookup_ids)

View file

@ -8,27 +8,8 @@ import logging
_logger = logging.getLogger(__name__)
class ScraperMixin:
def set_field(self, field, value=None):
self.data[field] = value
def parse_str(self, query):
elem = self.html.xpath(query)
return elem[0].strip() if elem else None
def parse_field(self, field, query, error_when_missing=False):
elem = self.html.xpath(query)
if elem:
self.data[field] = elem[0].strip()
elif error_when_missing:
raise ParseError(self, field)
else:
self.data[field] = None
return elem
@SiteList.register
class DoubanBook(AbstractSite, ScraperMixin):
class DoubanBook(AbstractSite):
ID_TYPE = IdType.DoubanBook
URL_PATTERNS = [r"\w+://book\.douban\.com/subject/(\d+)/{0,1}", r"\w+://m.douban.com/book/subject/(\d+)/{0,1}"]
WIKI_PROPERTY_ID = '?'
@ -39,28 +20,34 @@ class DoubanBook(AbstractSite, ScraperMixin):
return "https://book.douban.com/subject/" + id_value + "/"
def scrape(self):
self.data = {}
self.html = DoubanDownloader(self.url).download().html()
self.parse_field('title', "/html/body//h1/span/text()")
self.parse_field('isbn', "//div[@id='info']//span[text()='ISBN:']/following::text()")
# TODO does douban store ASIN as ISBN, need more cleanup if so
if not self.data['title']:
if self.data['isbn']:
self.data['title'] = 'isbn: ' + isbn
else:
raise ParseError(self, 'title')
content = DoubanDownloader(self.url).download().html()
self.parse_field('cover_image_url', "//*[@id='mainpic']/a/img/@src")
self.parse_field('brief', "//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()")
self.parse_field('series', "//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()")
self.parse_field('producer', "//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()")
self.parse_field('cubn', "//div[@id='info']//span[text()='统一书号:']/following::text()")
self.parse_field('subtitle', "//div[@id='info']//span[text()='副标题:']/following::text()")
self.parse_field('orig_title', "//div[@id='info']//span[text()='原作名:']/following::text()")
self.parse_field('language', "//div[@id='info']//span[text()='语言:']/following::text()")
self.parse_field('pub_house', "//div[@id='info']//span[text()='出版社:']/following::text()")
self.parse_field('pub_date', "//div[@id='info']//span[text()='出版年:']/following::text()")
year_month_day = RE_NUMBERS.findall(self.data['pub_date']) if self.data['pub_date'] else []
isbn_elem = content.xpath("//div[@id='info']//span[text()='ISBN:']/following::text()")
isbn = isbn_elem[0].strip() if isbn_elem else None
title_elem = content.xpath("/html/body//h1/span/text()")
title = title_elem[0].strip() if title_elem else f"Unknown Title {self.id_value}"
subtitle_elem = content.xpath(
"//div[@id='info']//span[text()='副标题:']/following::text()")
subtitle = subtitle_elem[0].strip()[:500] if subtitle_elem else None
orig_title_elem = content.xpath(
"//div[@id='info']//span[text()='原作名:']/following::text()")
orig_title = orig_title_elem[0].strip()[:500] if orig_title_elem else None
language_elem = content.xpath(
"//div[@id='info']//span[text()='语言:']/following::text()")
language = language_elem[0].strip() if language_elem else None
pub_house_elem = content.xpath(
"//div[@id='info']//span[text()='出版社:']/following::text()")
pub_house = pub_house_elem[0].strip() if pub_house_elem else None
pub_date_elem = content.xpath(
"//div[@id='info']//span[text()='出版年:']/following::text()")
pub_date = pub_date_elem[0].strip() if pub_date_elem else ''
year_month_day = RE_NUMBERS.findall(pub_date)
if len(year_month_day) in (2, 3):
pub_year = int(year_month_day[0])
pub_month = int(year_month_day[1])
@ -77,33 +64,51 @@ class DoubanBook(AbstractSite, ScraperMixin):
pub_month = None if pub_month is not None and pub_month not in range(
1, 12) else pub_month
self.parse_field('binding', "//div[@id='info']//span[text()='装帧:']/following::text()")
self.parse_field('price', "//div[@id='info']//span[text()='定价:']/following::text()")
self.parse_field('pages', "//div[@id='info']//span[text()='页数:']/following::text()")
if self.data['pages'] is not None:
self.data['pages'] = int(RE_NUMBERS.findall(self.data['pages'])[0]) if RE_NUMBERS.findall(self.data['pages']) else None
if self.data['pages'] and (self.data['pages'] > 999999 or self.data['pages'] < 1):
self.data['pages'] = None
binding_elem = content.xpath(
"//div[@id='info']//span[text()='装帧:']/following::text()")
binding = binding_elem[0].strip() if binding_elem else None
price_elem = content.xpath(
"//div[@id='info']//span[text()='定价:']/following::text()")
price = price_elem[0].strip() if price_elem else None
pages_elem = content.xpath(
"//div[@id='info']//span[text()='页数:']/following::text()")
pages = pages_elem[0].strip() if pages_elem else None
if pages is not None:
pages = int(RE_NUMBERS.findall(pages)[
0]) if RE_NUMBERS.findall(pages) else None
if pages and (pages > 999999 or pages < 1):
pages = None
brief_elem = content.xpath(
"//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()")
brief = '\n'.join(p.strip()
for p in brief_elem) if brief_elem else None
contents = None
try:
contents_elem = self.html.xpath(
contents_elem = content.xpath(
"//h2/span[text()='目录']/../following-sibling::div[1]")[0]
# if the id of the next sibling contains `dir`, that would be the full contents
if "dir" in contents_elem.getnext().xpath("@id")[0]:
contents_elem = contents_elem.getnext()
contents = '\n'.join(p.strip() for p in contents_elem.xpath("text()")[:-2]) if len(contents_elem) else None
contents = '\n'.join(p.strip() for p in contents_elem.xpath(
"text()")[:-2]) if contents_elem is not None else None
else:
contents = '\n'.join(p.strip() for p in contents_elem.xpath("text()")) if len(contents_elem) else None
contents = '\n'.join(p.strip() for p in contents_elem.xpath(
"text()")) if contents_elem is not None else None
except Exception:
pass
self.data['contents'] = contents
img_url_elem = content.xpath("//*[@id='mainpic']/a/img/@src")
img_url = img_url_elem[0].strip() if img_url_elem else None
# there are two html formats for authors and translators
authors_elem = self.html.xpath("""//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/
authors_elem = content.xpath("""//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/
preceding-sibling::a[preceding-sibling::span[text()='作者:']]/text()""")
if not authors_elem:
authors_elem = self.html.xpath(
authors_elem = content.xpath(
"""//div[@id='info']//span[text()=' 作者']/following-sibling::a/text()""")
if authors_elem:
authors = []
@ -111,12 +116,11 @@ class DoubanBook(AbstractSite, ScraperMixin):
authors.append(RE_WHITESPACES.sub(' ', author.strip())[:200])
else:
authors = None
self.data['authors'] = authors
translators_elem = self.html.xpath("""//div[@id='info']//span[text()='译者:']/following-sibling::br[1]/
translators_elem = content.xpath("""//div[@id='info']//span[text()='译者:']/following-sibling::br[1]/
preceding-sibling::a[preceding-sibling::span[text()='译者:']]/text()""")
if not translators_elem:
translators_elem = self.html.xpath(
translators_elem = content.xpath(
"""//div[@id='info']//span[text()=' 译者']/following-sibling::a/text()""")
if translators_elem:
translators = []
@ -124,28 +128,56 @@ class DoubanBook(AbstractSite, ScraperMixin):
translators.append(RE_WHITESPACES.sub(' ', translator.strip()))
else:
translators = None
self.data['translators'] = translators
work_link = self.parse_str('//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href')
if work_link:
r = re.match(r'\w+://book.douban.com/works/(\d+)', work_link)
self.data['required_resources'] = [{
cncode_elem = content.xpath(
"//div[@id='info']//span[text()='统一书号:']/following::text()")
cubn = cncode_elem[0].strip() if cncode_elem else None
series_elem = content.xpath(
"//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()")
series = series_elem[0].strip() if series_elem else None
imprint_elem = content.xpath(
"//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()")
producer = imprint_elem[0].strip() if imprint_elem else None
data = {
'title': title,
'subtitle': subtitle,
'orig_title': orig_title,
'author': authors,
'translator': translators,
'language': language,
'pub_house': pub_house,
'pub_year': pub_year,
'pub_month': pub_month,
'binding': binding,
'price': price,
'pages': pages,
'isbn': isbn,
'cubn': cubn,
'brief': brief,
'contents': contents,
'series': series,
'producer': producer,
'cover_image_url': img_url,
}
works_element = content.xpath('//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href')
if works_element:
r = re.match(r'\w+://book.douban.com/works/(\d+)', works_element[0])
data['required_resources'] = [{
'model': 'Work',
'id_type': IdType.DoubanBook_Work,
'id_type': IdType.DoubanBook_Work,
'id_value': r[1] if r else None,
'title': self.data['title'],
'url': work_link,
'title': data['title'],
'url': works_element[0],
}]
pd = ResourceContent(metadata=self.data)
pd.lookup_ids[IdType.ISBN] = self.data.get('isbn')
pd.lookup_ids[IdType.CUBN] = self.data.get('cubn')
if self.data["cover_image_url"]:
imgdl = BasicImageDownloader(self.data["cover_image_url"], self.url)
try:
pd.cover_image = imgdl.download().content
pd.cover_image_extention = imgdl.extention
except Exception:
_logger.debug(f'failed to download cover for {self.url} from {self.data["cover_image_url"]}')
pd = ResourceContent(metadata=data)
pd.lookup_ids[IdType.ISBN] = isbn
pd.lookup_ids[IdType.CUBN] = cubn
pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(img_url, self.url)
return pd

View file

@ -14,8 +14,9 @@ from django.core.validators import MaxValueValidator, MinValueValidator
from django.utils.translation import gettext_lazy as _
from django.core.validators import RegexValidator
from functools import cached_property
from django.db.models import Count
from django.db.models import Count, Avg
import django.dispatch
import math
class Piece(PolymorphicModel, UserOwnedObjectMixin):
@ -29,7 +30,7 @@ class Piece(PolymorphicModel, UserOwnedObjectMixin):
class Content(SoftDeleteMixin, Piece):
item: models.ForeignKey(Item, on_delete=models.PROTECT)
item = models.ForeignKey(Item, on_delete=models.PROTECT)
def __str__(self):
return f"{self.id}({self.item})"
@ -50,6 +51,22 @@ class Rating(Content):
grade = models.IntegerField(default=0, validators=[MaxValueValidator(10), MinValueValidator(0)])
class RatingManager:
@staticmethod
def get_rating_for_item(item):
stat = Rating.objects.filter(item=item).aggregate(average=Avg('grade'), count=Count('item'))
return math.ceil(stat['average']) if stat['count'] >= 5 else 0
@staticmethod
def get_rating_count_for_item(item):
stat = Rating.objects.filter(item=item).aggregate(count=Count('item'))
return stat['count']
Item.rating = property(RatingManager.get_rating_for_item)
Item.rating_count = property(RatingManager.get_rating_count_for_item)
class Reply(Content):
reply_to_content = models.ForeignKey(Content, on_delete=models.PROTECT, related_name='replies')
title = models.CharField(max_length=500, null=True)