From 486dd16e1ff93ae34513645ea1858267e908bd89 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Wed, 14 Dec 2022 21:12:37 -0500
Subject: [PATCH] new data model: add legacy fields for Book to Edition as
 virtual fields

---
 catalog/book/models.py             |  39 ++++++-
 catalog/common/downloaders.py      |   2 +-
 catalog/common/models.py           |  17 ++-
 catalog/common/utils.py            |   2 +-
 catalog/management/commands/cat.py |  19 ++-
 catalog/sites/douban_book.py       | 182 +++++++++++++++++------
 journal/models.py                  |  21 +++-
 7 files changed, 190 insertions(+), 92 deletions(-)

diff --git a/catalog/book/models.py b/catalog/book/models.py
index 2d9a9d86..8c0390d6 100644
--- a/catalog/book/models.py
+++ b/catalog/book/models.py
@@ -31,13 +31,40 @@ class Edition(Item):
     cubn = PrimaryLookupIdDescriptor(IdType.CUBN)
     # douban_book = LookupIdDescriptor(IdType.DoubanBook)
     # goodreads = LookupIdDescriptor(IdType.Goodreads)
-    languages = jsondata.ArrayField(_("语言"), null=True, blank=True, default=list)
-    publish_year = jsondata.IntegerField(_("发表年份"), null=True, blank=True)
-    publish_month = jsondata.IntegerField(_("发表月份"), null=True, blank=True)
+
+    METADATA_COPY_LIST = [
+        'title',
+        'brief',
+        # legacy fields
+        'subtitle',
+        'orig_title',
+        'author',
+        'translator',
+        'language',
+        'pub_house',
+        'pub_year',
+        'pub_month',
+        'binding',
+        'price',
+        'pages',
+        'contents',
+        'series',
+        'producer',
+    ]
+    subtitle = jsondata.CharField(null=True, blank=True, default=None)
+    orig_title = jsondata.CharField(null=True, blank=True, default=None)
+    author = jsondata.ArrayField(_('作者'), null=False, blank=False, default=list)
+    translator = jsondata.ArrayField(_('译者'), null=True, blank=True, default=list)
+    language = jsondata.ArrayField(_("语言"), null=True, blank=True, default=list)
+    pub_house = jsondata.ArrayField(_('出版方'), null=True, blank=True, default=list)
+    pub_year = jsondata.IntegerField(_("发表年份"), null=True, blank=True)
+    pub_month = jsondata.IntegerField(_("发表月份"), null=True, blank=True)
+    binding = jsondata.CharField(null=True, blank=True, default=None)
     pages = jsondata.IntegerField(blank=True, default=None)
-    authors = jsondata.ArrayField(_('作者'), null=False, blank=False, default=list)
-    translaters = jsondata.ArrayField(_('译者'), null=True, blank=True, default=list)
-    publishers = jsondata.ArrayField(_('出版方'), null=True, blank=True, default=list)
+    series = jsondata.CharField(null=True, blank=True, default=None)
+    contents = jsondata.CharField(null=True, blank=True, default=None)
+    price = jsondata.CharField(_("定价"), null=True, blank=True, default=None)
+    producer = jsondata.CharField(_("出品方"), null=True, blank=True, default=None)
 
     @property
     def isbn10(self):

diff --git a/catalog/common/downloaders.py b/catalog/common/downloaders.py
index f7d1ed83..31e7818e 100644
--- a/catalog/common/downloaders.py
+++ b/catalog/common/downloaders.py
@@ -235,7 +235,7 @@ class MockResponse:
         return json.load(StringIO(self.text))
 
     def html(self):
-        return html.fromstring(self.text)  # may throw exception unexpectedly due to OS bug
+        return html.fromstring(self.text)  # may throw exception unexpectedly due to OS bug, see https://github.com/neodb-social/neodb/issues/5
 
     @property
     def headers(self):

diff --git a/catalog/common/models.py b/catalog/common/models.py
index 46b3a9d3..f608d304 100644
--- a/catalog/common/models.py
+++ b/catalog/common/models.py
@@ -184,7 +184,7 @@ class Item(PolymorphicModel, SoftDeleteMixin):
         self.primary_lookup_id_type = None
 
     def __str__(self):
-        return f"{self.id}{' ' + self.primary_lookup_id_type + ':' + self.primary_lookup_id_value if self.primary_lookup_id_value else ''} ({self.title})"
+        return f"{self.id}|{self.url_id}{' ' + self.primary_lookup_id_type + ':' + self.primary_lookup_id_value if self.primary_lookup_id_value else ''} ({self.title})"
 
     @classmethod
     def get_best_lookup_id(cls, lookup_ids):
@@ -210,9 +210,13 @@ class Item(PolymorphicModel, SoftDeleteMixin):
         else:
             self.merged_to_item = to_item
 
+    @property
+    def url_id(self):
+        return base62.encode(self.uid.int)
+
     @property
     def url(self):
-        return f'/{self.url_path}/{base62.encode(self.uid.int)}'
+        return f'/{self.url_path}/{self.url_id}'
 
     @classmethod
     def get_by_url(cls, url_or_b62):
@@ -236,6 +240,9 @@ class Item(PolymorphicModel, SoftDeleteMixin):
     def copy_metadata(cls, metadata):
         return dict((k, v) for k, v in metadata.items() if k in cls.METADATA_COPY_LIST and v is not None)
 
+    def has_cover(self):
+        return self.cover and self.cover != DEFAULT_ITEM_COVER
+
     def merge_data_from_external_resources(self):
         """Subclass may override this"""
         lookup_ids = []
@@ -245,7 +252,7 @@ class Item(PolymorphicModel, SoftDeleteMixin):
             for k in self.METADATA_COPY_LIST:
                 if not getattr(self, k) and p.metadata.get(k):
                     setattr(self, k, p.metadata.get(k))
-            if not self.cover and p.cover:
+            if not self.has_cover() and p.cover:
                 self.cover = p.cover
         self.update_lookup_ids(lookup_ids)
 
@@ -284,6 +291,10 @@ class ExternalResource(models.Model):
     def __str__(self):
         return f"{self.id}{':' + self.id_type + ':' + self.id_value if self.id_value else ''} ({self.url})"
 
+    @property
+    def site_name(self):
+        return self.id_type  # TODO change to localized name
+
     def update_content(self, resource_content):
         self.other_lookup_ids = resource_content.lookup_ids
         self.metadata = resource_content.metadata

diff --git a/catalog/common/utils.py b/catalog/common/utils.py
index 39b115a9..5bfc82c4 100644
--- a/catalog/common/utils.py
+++ b/catalog/common/utils.py
@@ -11,4 +11,4 @@ DEFAULT_ITEM_COVER = 'item/default.svg'
 
 def item_cover_path(resource, filename):
     fn = timezone.now().strftime('%Y/%m/%d/') + str(uuid.uuid4()) + '.' + filename.split('.')[-1]
-    return 'items/' + resource.id_type + '/' + fn
+    return 'item/' + resource.id_type + '/' + fn

diff --git a/catalog/management/commands/cat.py b/catalog/management/commands/cat.py
index 1854c08b..9d714693 100644
--- a/catalog/management/commands/cat.py
+++ b/catalog/management/commands/cat.py
@@ -5,10 +5,15 @@ from catalog.sites import *
 
 
 class Command(BaseCommand):
-    help = 'Scrape a catalog item from external resource (but not save it)'
+    help = 'Scrape a catalog item from external resource (and optionally save it)'
 
     def add_arguments(self, parser):
         parser.add_argument('url', type=str, help='URL to scrape')
+        parser.add_argument(
+            '--save',
+            action='store_true',
+            help='save to database',
+        )
 
     def handle(self, *args, **options):
         url = str(options['url'])
@@ -17,7 +22,13 @@ class Command(BaseCommand):
             self.stdout.write(self.style.ERROR(f'Unknown site for {url}'))
             return
         self.stdout.write(f'Fetching from {site}')
-        resource = site.scrape()
+        if options['save']:
+            resource = site.get_resource_ready()
+            pprint.pp(resource.metadata)
+            pprint.pp(site.get_item())
+            pprint.pp(site.get_item().metadata)
+        else:
+            resource = site.scrape()
+            pprint.pp(resource.metadata)
+            pprint.pp(resource.lookup_ids)
         self.stdout.write(self.style.SUCCESS(f'Done.'))
-        pprint.pp(resource.metadata)
-        pprint.pp(resource.lookup_ids)

diff --git a/catalog/sites/douban_book.py b/catalog/sites/douban_book.py
index 021857c5..19aeecaf 100644
--- a/catalog/sites/douban_book.py
+++ b/catalog/sites/douban_book.py
@@ -8,27 +8,8 @@ import logging
 _logger = logging.getLogger(__name__)
 
 
-class ScraperMixin:
-    def set_field(self, field, value=None):
-        self.data[field] = value
-
-    def parse_str(self, query):
-        elem = self.html.xpath(query)
-        return elem[0].strip() if elem else None
-
-    def parse_field(self, field, query, error_when_missing=False):
-        elem = self.html.xpath(query)
-        if elem:
-            self.data[field] = elem[0].strip()
-        elif error_when_missing:
-            raise ParseError(self, field)
-        else:
-            self.data[field] = None
-        return elem
-
-
 @SiteList.register
-class DoubanBook(AbstractSite, ScraperMixin):
+class DoubanBook(AbstractSite):
     ID_TYPE = IdType.DoubanBook
     URL_PATTERNS = [r"\w+://book\.douban\.com/subject/(\d+)/{0,1}", r"\w+://m.douban.com/book/subject/(\d+)/{0,1}"]
     WIKI_PROPERTY_ID = '?'
@@ -39,28 +20,34 @@ class DoubanBook(AbstractSite, ScraperMixin):
         return "https://book.douban.com/subject/" + id_value + "/"
 
     def scrape(self):
-        self.data = {}
-        self.html = DoubanDownloader(self.url).download().html()
-        self.parse_field('title', "/html/body//h1/span/text()")
-        self.parse_field('isbn', "//div[@id='info']//span[text()='ISBN:']/following::text()")
-        # TODO does douban store ASIN as ISBN, need more cleanup if so
-        if not self.data['title']:
-            if self.data['isbn']:
-                self.data['title'] = 'isbn: ' + isbn
-            else:
-                raise ParseError(self, 'title')
+        content = DoubanDownloader(self.url).download().html()
 
-        self.parse_field('cover_image_url', "//*[@id='mainpic']/a/img/@src")
-        self.parse_field('brief', "//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()")
-        self.parse_field('series', "//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()")
-        self.parse_field('producer', "//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()")
-        self.parse_field('cubn', "//div[@id='info']//span[text()='统一书号:']/following::text()")
-        self.parse_field('subtitle', "//div[@id='info']//span[text()='副标题:']/following::text()")
-        self.parse_field('orig_title', "//div[@id='info']//span[text()='原作名:']/following::text()")
-        self.parse_field('language', "//div[@id='info']//span[text()='语言:']/following::text()")
-        self.parse_field('pub_house', "//div[@id='info']//span[text()='出版社:']/following::text()")
-        self.parse_field('pub_date', "//div[@id='info']//span[text()='出版年:']/following::text()")
-        year_month_day = RE_NUMBERS.findall(self.data['pub_date']) if self.data['pub_date'] else []
+        isbn_elem = content.xpath("//div[@id='info']//span[text()='ISBN:']/following::text()")
+        isbn = isbn_elem[0].strip() if isbn_elem else None
+
+        title_elem = content.xpath("/html/body//h1/span/text()")
+        title = title_elem[0].strip() if title_elem else f"Unknown Title {self.id_value}"
+
+        subtitle_elem = content.xpath(
+            "//div[@id='info']//span[text()='副标题:']/following::text()")
+        subtitle = subtitle_elem[0].strip()[:500] if subtitle_elem else None
+
+        orig_title_elem = content.xpath(
+            "//div[@id='info']//span[text()='原作名:']/following::text()")
+        orig_title = orig_title_elem[0].strip()[:500] if orig_title_elem else None
+
+        language_elem = content.xpath(
+            "//div[@id='info']//span[text()='语言:']/following::text()")
+        language = language_elem[0].strip() if language_elem else None
+
+        pub_house_elem = content.xpath(
+            "//div[@id='info']//span[text()='出版社:']/following::text()")
+        pub_house = pub_house_elem[0].strip() if pub_house_elem else None
+
+        pub_date_elem = content.xpath(
+            "//div[@id='info']//span[text()='出版年:']/following::text()")
+        pub_date = pub_date_elem[0].strip() if pub_date_elem else ''
+        year_month_day = RE_NUMBERS.findall(pub_date)
         if len(year_month_day) in (2, 3):
             pub_year = int(year_month_day[0])
             pub_month = int(year_month_day[1])
@@ -77,33 +64,51 @@ class DoubanBook(AbstractSite, ScraperMixin):
         pub_month = None if pub_month is not None and pub_month not in range(
             1, 12) else pub_month
 
-        self.parse_field('binding', "//div[@id='info']//span[text()='装帧:']/following::text()")
-        self.parse_field('price', "//div[@id='info']//span[text()='定价:']/following::text()")
-        self.parse_field('pages', "//div[@id='info']//span[text()='页数:']/following::text()")
-        if self.data['pages'] is not None:
-            self.data['pages'] = int(RE_NUMBERS.findall(self.data['pages'])[0]) if RE_NUMBERS.findall(self.data['pages']) else None
-        if self.data['pages'] and (self.data['pages'] > 999999 or self.data['pages'] < 1):
-            self.data['pages'] = None
+        binding_elem = content.xpath(
+            "//div[@id='info']//span[text()='装帧:']/following::text()")
+        binding = binding_elem[0].strip() if binding_elem else None
+
+        price_elem = content.xpath(
+            "//div[@id='info']//span[text()='定价:']/following::text()")
+        price = price_elem[0].strip() if price_elem else None
+
+        pages_elem = content.xpath(
+            "//div[@id='info']//span[text()='页数:']/following::text()")
+        pages = pages_elem[0].strip() if pages_elem else None
+        if pages is not None:
+            pages = int(RE_NUMBERS.findall(pages)[0]) if RE_NUMBERS.findall(pages) else None
+        if pages and (pages > 999999 or pages < 1):
+            pages = None
+
+        brief_elem = content.xpath(
+            "//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()")
+        brief = '\n'.join(p.strip() for p in brief_elem) if brief_elem else None
 
         contents = None
         try:
-            contents_elem = self.html.xpath(
+            contents_elem = content.xpath(
                 "//h2/span[text()='目录']/../following-sibling::div[1]")[0]
             # if next the id of next sibling contains `dir`, that would be the full contents
             if "dir" in contents_elem.getnext().xpath("@id")[0]:
                 contents_elem = contents_elem.getnext()
-                contents = '\n'.join(p.strip() for p in contents_elem.xpath("text()")[:-2]) if len(contents_elem) else None
+                contents = '\n'.join(p.strip() for p in contents_elem.xpath("text()")[:-2]) if contents_elem is not None else None
             else:
-                contents = '\n'.join(p.strip() for p in contents_elem.xpath("text()")) if len(contents_elem) else None
+                contents = '\n'.join(p.strip() for p in contents_elem.xpath("text()")) if contents_elem is not None else None
         except Exception:
             pass
-        self.data['contents'] = contents
+
+        img_url_elem = content.xpath("//*[@id='mainpic']/a/img/@src")
+        img_url = img_url_elem[0].strip() if img_url_elem else None
 
         # there are two html formats for authors and translators
-        authors_elem = self.html.xpath("""//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/
+        authors_elem = content.xpath("""//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/
             preceding-sibling::a[preceding-sibling::span[text()='作者:']]/text()""")
         if not authors_elem:
-            authors_elem = self.html.xpath(
+            authors_elem = content.xpath(
                 """//div[@id='info']//span[text()=' 作者']/following-sibling::a/text()""")
         if authors_elem:
             authors = []
@@ -111,12 +116,11 @@ class DoubanBook(AbstractSite, ScraperMixin):
                 authors.append(RE_WHITESPACES.sub(' ', author.strip())[:200])
         else:
             authors = None
-        self.data['authors'] = authors
 
-        translators_elem = self.html.xpath("""//div[@id='info']//span[text()='译者:']/following-sibling::br[1]/
+        translators_elem = content.xpath("""//div[@id='info']//span[text()='译者:']/following-sibling::br[1]/
             preceding-sibling::a[preceding-sibling::span[text()='译者:']]/text()""")
         if not translators_elem:
-            translators_elem = self.html.xpath(
+            translators_elem = content.xpath(
                 """//div[@id='info']//span[text()=' 译者']/following-sibling::a/text()""")
         if translators_elem:
             translators = []
@@ -124,28 +128,56 @@ class DoubanBook(AbstractSite, ScraperMixin):
                 translators.append(RE_WHITESPACES.sub(' ', translator.strip()))
         else:
             translators = None
-        self.data['translators'] = translators
 
-        work_link = self.parse_str('//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href')
-        if work_link:
-            r = re.match(r'\w+://book.douban.com/works/(\d+)', work_link)
-            self.data['required_resources'] = [{
+        cncode_elem = content.xpath(
+            "//div[@id='info']//span[text()='统一书号:']/following::text()")
+        cubn = cncode_elem[0].strip() if cncode_elem else None
+
+        series_elem = content.xpath(
+            "//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()")
+        series = series_elem[0].strip() if series_elem else None
+
+        imprint_elem = content.xpath(
+            "//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()")
+        producer = imprint_elem[0].strip() if imprint_elem else None
+
+        data = {
+            'title': title,
+            'subtitle': subtitle,
+            'orig_title': orig_title,
+            'author': authors,
+            'translator': translators,
+            'language': language,
+            'pub_house': pub_house,
+            'pub_year': pub_year,
+            'pub_month': pub_month,
+            'binding': binding,
+            'price': price,
+            'pages': pages,
+            'isbn': isbn,
+            'cubn': cubn,
+            'brief': brief,
+            'contents': contents,
+            'series': series,
+            'producer': producer,
+            'cover_image_url': img_url,
+        }
+
+        works_element = content.xpath('//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href')
+        if works_element:
+            r = re.match(r'\w+://book.douban.com/works/(\d+)', works_element[0])
+            data['required_resources'] = [{
                 'model': 'Work',
-                'id_type': IdType.DoubanBook_Work, 
+                'id_type': IdType.DoubanBook_Work,
                 'id_value': r[1] if r else None,
-                'title': self.data['title'],
-                'url': work_link,
+                'title': data['title'],
+                'url': works_element[0],
             }]
-        pd = ResourceContent(metadata=self.data)
-        pd.lookup_ids[IdType.ISBN] = self.data.get('isbn')
-        pd.lookup_ids[IdType.CUBN] = self.data.get('cubn')
-        if self.data["cover_image_url"]:
-            imgdl = BasicImageDownloader(self.data["cover_image_url"], self.url)
-            try:
-                pd.cover_image = imgdl.download().content
-                pd.cover_image_extention = imgdl.extention
-            except Exception:
-                _logger.debug(f'failed to download cover for {self.url} from {self.data["cover_image_url"]}')
+
+        pd = ResourceContent(metadata=data)
+        pd.lookup_ids[IdType.ISBN] = isbn
+        pd.lookup_ids[IdType.CUBN] = cubn
+        pd.cover_image, pd.cover_image_extention = BasicImageDownloader.download_image(img_url, self.url)
         return pd

diff --git a/journal/models.py b/journal/models.py
index 7839a2c6..8aad9964 100644
--- a/journal/models.py
+++ b/journal/models.py
@@ -14,8 +14,9 @@ from django.core.validators import MaxValueValidator, MinValueValidator
 from django.utils.translation import gettext_lazy as _
 from django.core.validators import RegexValidator
 from functools import cached_property
-from django.db.models import Count
+from django.db.models import Count, Avg
 import django.dispatch
+import math
 
 
 class Piece(PolymorphicModel, UserOwnedObjectMixin):
@@ -29,7 +30,7 @@ class Piece(PolymorphicModel, UserOwnedObjectMixin):
 
 
 class Content(SoftDeleteMixin, Piece):
-    item: models.ForeignKey(Item, on_delete=models.PROTECT)
+    item = models.ForeignKey(Item, on_delete=models.PROTECT)
 
     def __str__(self):
         return f"{self.id}({self.item})"
@@ -50,6 +51,22 @@ class Rating(Content):
     grade = models.IntegerField(default=0, validators=[MaxValueValidator(10), MinValueValidator(0)])
 
 
+class RatingManager:
+    @staticmethod
+    def get_rating_for_item(item):
+        stat = Rating.objects.filter(item=item).aggregate(average=Avg('grade'), count=Count('item'))
+        return math.ceil(stat['average']) if stat['count'] >= 5 else 0
+
+    @staticmethod
+    def get_rating_count_for_item(item):
+        stat = Rating.objects.filter(item=item).aggregate(count=Count('item'))
+        return stat['count']
+
+
+Item.rating = property(RatingManager.get_rating_for_item)
+Item.rating_count = property(RatingManager.get_rating_count_for_item)
+
+
 class Reply(Content):
     reply_to_content = models.ForeignKey(Content, on_delete=models.PROTECT, related_name='replies')
     title = models.CharField(max_length=500, null=True)
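
Usage note (editor's addition, not part of the commit): the patch exposes the
legacy Book fields as virtual jsondata fields on Edition and monkey-patches
`Item.rating` / `Item.rating_count`, so callers read both like ordinary model
attributes. A minimal sketch of the intended read path, assuming a configured
Django project with this patch applied; the import paths, the IdType lookup,
and the ISBN value below are illustrative assumptions, not taken from the diff:

    # sketch only -- assumes Django settings are configured and migrations applied
    from catalog.book.models import Edition   # assumed module path
    from catalog.common.models import IdType  # assumed module path

    book = Edition.objects.filter(
        primary_lookup_id_type=IdType.ISBN,
        primary_lookup_id_value='9787020002207',  # placeholder ISBN
    ).first()
    if book:
        print(book.author, book.pub_house, book.pub_year)  # virtual fields backed by metadata
        print(book.rating_count)  # Count of Rating rows for this item
        print(book.rating)        # ceil(Avg(grade)) once rating_count >= 5, else 0

The reworked cat command follows the same split as the sites API: `scrape()`
only fetches and parses, while `get_resource_ready()` also persists the
resource and its item (the subject id below is a placeholder):

    python3 manage.py cat https://book.douban.com/subject/1000000/         # fetch and print
    python3 manage.py cat https://book.douban.com/subject/1000000/ --save  # fetch and persist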