From fda32aee733aa7852bab4bdf42c98b44e43e25cd Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Sun, 17 Oct 2021 22:43:56 -0400
Subject: [PATCH] search will show external results from Goodreads, TMDB and
 Spotify

---
 common/models.py                           |   1 +
 common/scraper.py                          |  25 +++-
 common/searcher.py                         | 149 +++++++++++++++++++++
 common/templates/common/search_result.html |  39 +++++-
 common/views.py                            |   3 +
 5 files changed, 211 insertions(+), 6 deletions(-)
 create mode 100644 common/searcher.py

diff --git a/common/models.py b/common/models.py
index 3a8631ac..1d231548 100644
--- a/common/models.py
+++ b/common/models.py
@@ -26,6 +26,7 @@ class SourceSiteEnum(models.TextChoices):
     BANGUMI = 'bangumi', _("bangumi")
     GOODREADS = "goodreads", _("goodreads")
     TMDB = "tmdb", _("The Movie Database")
+    GOOGLEBOOKS = "googlebooks", _("Google Books")
 
 
 class Entity(models.Model):
diff --git a/common/scraper.py b/common/scraper.py
index 5bf62d8f..54fadbdb 100644
--- a/common/scraper.py
+++ b/common/scraper.py
@@ -1016,6 +1016,13 @@ class SpotifyAlbumScraper(AbstractScraper):
         ])
 
 
+def get_spotify_token():
+    """Return the module-level spotify_token, calling invoke_spotify_token() first if it is None or expired."""
+    if spotify_token is None or is_spotify_token_expired():
+        invoke_spotify_token()
+    return spotify_token
+
+    
 def is_spotify_token_expired():
     global spotify_token_expire_time
     return True if spotify_token_expire_time <= time.time() else False
@@ -1486,13 +1493,16 @@ class GoodreadsScraper(AbstractScraper):
         u = re.match(r"https://www\.goodreads\.com/book/show/\d+", raw_url)
         return u[0] if u else None
 
-    def scrape(self, url):
+    def scrape(self, url, response=None):
         """
         This is the scraping portal
         """
-        headers = DEFAULT_REQUEST_HEADERS.copy()
-        headers['Host'] = self.host
-        content = self.download_page(url, headers)
+        if response is not None:
+            content = html.fromstring(response.content.decode('utf-8'))
+        else:
+            headers = DEFAULT_REQUEST_HEADERS.copy()
+            headers['Host'] = self.host
+            content = self.download_page(url, headers)
 
         try:
             title = content.xpath("//h1[@id='bookTitle']/text()")[0].strip()
@@ -1542,7 +1552,11 @@ class GoodreadsScraper(AbstractScraper):
         isbn = isbn_elem[0].strip() if isbn_elem else None
 
         brief_elem = content.xpath('//div[@id="description"]/span[@style="display:none"]/text()')
-        brief = '\n'.join(p.strip() for p in brief_elem) if brief_elem else None
+        if brief_elem:
+            brief = '\n'.join(p.strip() for p in brief_elem)
+        else:
+            brief_elem = content.xpath('//div[@id="description"]/span/text()')
+            brief = '\n'.join(p.strip() for p in brief_elem) if brief_elem else None
 
         genre = content.xpath('//div[@class="bigBoxBody"]/div/div/div/a/text()')
         genre = genre[0] if genre else None
@@ -1596,6 +1610,7 @@ class GoodreadsScraper(AbstractScraper):
             'brief': brief,
             'contents': contents,
             'other_info': other,
+            'cover_url': img_url,
             'source_site': self.site_name,
             'source_url': self.get_effective_url(url),
         }
diff --git a/common/searcher.py b/common/searcher.py
new file mode 100644
index 00000000..6792be1f
--- /dev/null
+++ b/common/searcher.py
@@ -0,0 +1,149 @@
+from urllib.parse import quote_plus
+from enum import Enum
+from common.models import SourceSiteEnum
+from django.conf import settings
+from common.scraper import GoodreadsScraper, get_spotify_token
+import requests
+from lxml import html
+
+SEARCH_PAGE_SIZE = 5  # not all apis support page size
+
+
+class Category(Enum):  # Categories of external search results; the values are Chinese display names.
+    Book = '书籍'  # "books"
+    Movie = '电影'  # "movies"
+    Music = '音乐'  # "music"
+    Game = '游戏'  # "games"
+    TV = '剧集'  # "TV series"
+
+
+class SearchResultItem:  # One hit from an external source, exposed as plain attributes plus a few properties.
+    def __init__(self, category, source_site, source_url, title, subtitle, brief, cover_url):
+        self.category = category  # a Category member; verbose_category_name reads its .value
+        self.source_site = source_site  # the searchers in this file pass a SourceSiteEnum member
+        self.source_url = source_url  # URL of the item on the external site; also used to build `link`
+        self.title = title
+        self.subtitle = subtitle
+        self.brief = brief
+        self.cover_url = cover_url
+
+    @property
+    def verbose_category_name(self):  # display name of the category (the enum member's value)
+        return self.category.value
+
+    @property
+    def link(self):  # in-site search URL whose query is the URL-encoded source_url
+        return f"/search?q={quote_plus(self.source_url)}"
+
+    @property
+    def scraped(self):  # always False for these external results
+        return False
+
+
+class Goodreads:
+    @classmethod
+    def search(self, q, page=1):
+        """Search Goodreads for `q` (result page `page`) and return a list of SearchResultItem."""
+        search_url = f'https://www.goodreads.com/search?page={page}&q={quote_plus(q)}'
+        r = requests.get(search_url)
+        if r.url.startswith('https://www.goodreads.com/book/show/'):
+            # Goodreads will 302 if only one result matches ISBN,
+            # so the already-fetched book page is handed to the scraper.
+            data, _img = GoodreadsScraper.scrape(r.url, r)
+            subtitle = f"{data['pub_year']} {', '.join(data['author'])} {', '.join(data['translator'])}"
+            return [SearchResultItem(Category.Book, SourceSiteEnum.GOODREADS, data['source_url'], data['title'], subtitle, data['brief'], data['cover_url'])]
+        # Otherwise the response is a result list: one schema.org/Book table row per hit.
+        found = []
+        page_tree = html.fromstring(r.content.decode('utf-8'))
+        for row in page_tree.xpath('//tr[@itemtype="http://schema.org/Book"]'):
+            cover_srcs = row.xpath('.//img[@class="bookCover"]/@src')
+            title_parts = row.xpath('.//a[@class="bookTitle"]//text()')
+            hrefs = row.xpath('.//a[@class="bookTitle"]/@href')
+            author_names = row.xpath('.//a[@class="authorName"]//text()')
+            cover = cover_srcs[0] if cover_srcs else None
+            title = ''.join(title_parts).strip() if title_parts else None
+            url = 'https://www.goodreads.com' + hrefs[0] if hrefs else None
+            subtitle = ', '.join(author_names) if author_names else None
+            found.append(SearchResultItem(Category.Book, SourceSiteEnum.GOODREADS, url, title, subtitle, '', cover))
+        return found
+
+
+class GoogleBooks:
+    @classmethod
+    def search(self, q, page=1):
+        """Search the Google Books volumes API for `q` (`page` is 1-based) and return a list of SearchResultItem."""
+        results = []
+        api_url = f'https://www.googleapis.com/books/v1/volumes?q={quote_plus(q)}&startIndex={SEARCH_PAGE_SIZE*(page-1)}&maxResults={SEARCH_PAGE_SIZE}&maxAllowedMaturityRating=MATURE'
+        j = requests.get(api_url).json()
+        for b in j.get('items', []):  # a response without 'items' yields no results
+            info = b['volumeInfo']
+            # publishedDate and authors are optional in the API response; do not raise KeyError on them.
+            subtitle = f"{info.get('publishedDate', '')} {', '.join(info.get('authors', []))}"
+            if 'description' in info:
+                brief = info['description']
+            elif 'textSnippet' in b.get('searchInfo', {}):
+                # The snippet lives under the volume's top-level searchInfo, not under volumeInfo.
+                brief = b['searchInfo']['textSnippet']
+            else:
+                brief = ''
+            cover = info.get('imageLinks', {}).get('thumbnail')  # None when there is no thumbnail
+            results.append(SearchResultItem(Category.Book, SourceSiteEnum.GOOGLEBOOKS, info['infoLink'], info['title'], subtitle, brief, cover))
+        return results
+
+
+class TheMovieDatabase:
+    @classmethod
+    def search(self, q, page=1):
+        """Query TMDB's multi-search for `q` and return the movie and TV hits as a list of SearchResultItem."""
+        results = []
+        api_url = f'https://api.themoviedb.org/3/search/multi?query={quote_plus(q)}&page={page}&api_key={settings.TMDB_API3_KEY}&language=zh-CN&include_adult=true'
+        j = requests.get(api_url).json()
+        for m in j['results']:
+            if m['media_type'] in ['tv', 'movie']:  # other media types are ignored
+                url = f"https://www.themoviedb.org/{m['media_type']}/{m['id']}"
+                if m['media_type'] == 'tv':
+                    cat, title = Category.TV, m['name']
+                    subtitle = f"{m.get('first_air_date', '')} {m['original_name']}"  # date may be absent
+                else:
+                    cat, title = Category.Movie, m['title']
+                    subtitle = f"{m.get('release_date', '')} {m['original_title']}"  # date may be absent
+                # poster_path is null for entries without artwork; do not build a ".../None" image URL.
+                cover = f"https://image.tmdb.org/t/p/w500/{m['poster_path']}" if m.get('poster_path') else None
+                results.append(SearchResultItem(cat, SourceSiteEnum.TMDB, url, title, subtitle, m['overview'], cover))
+        return results
+
+
+class Spotify:
+    @classmethod
+    def search(self, q, page=1):
+        """Search Spotify albums for `q` (`page` is 1-based) and return a list of SearchResultItem."""
+        # Pages are 1-based, so page 1 must start at offset 0 (page * size skipped the first page of hits).
+        offset = (page - 1) * SEARCH_PAGE_SIZE
+        # URL-encode the query, as the other searchers do, so '&', '#', spaces etc. cannot break the URL.
+        api_url = f"https://api.spotify.com/v1/search?q={quote_plus(q)}&type=album&limit={SEARCH_PAGE_SIZE}&offset={offset}"
+        headers = {'Authorization': f"Bearer {get_spotify_token()}"}
+        j = requests.get(api_url, headers=headers).json()
+        results = []
+        for a in j['albums']['items']:
+            # Same text as before: release date followed by the space-separated artist names.
+            subtitle = ' '.join([a['release_date']] + [artist['name'] for artist in a['artists']])
+            # An album can come back with an empty image list.
+            cover = a['images'][0]['url'] if a['images'] else None
+            results.append(SearchResultItem(Category.Music, SourceSiteEnum.SPOTIFY, a['external_urls']['spotify'], a['name'], subtitle, '', cover))
+        return results
+
+
+class ExternalSources:
+    @classmethod
+    def search(self, c, q, page=1):
+        """Gather external results for category `c` ('all', 'movie', 'book' or 'music'; ''/None means 'all')."""
+        wanted = 'all' if c is None or c == '' else c
+        everything = wanted == 'all'
+        found = []
+        if everything or wanted == 'movie':
+            found += TheMovieDatabase.search(q, page)
+        if everything or wanted == 'book':
+            found += Goodreads.search(q, page)  # GoogleBooks.search exists but is not enabled here
+        if everything or wanted == 'music':
+            found += Spotify.search(q, page)
+        return found
diff --git a/common/templates/common/search_result.html b/common/templates/common/search_result.html
index 90c7f17d..8ed2309d 100644
--- a/common/templates/common/search_result.html
+++ b/common/templates/common/search_result.html
@@ -411,9 +411,46 @@
                                         
 
                                     {% empty %}
-                                    {% trans '无结果' %}
+                                    {% trans '无站内条目匹配' %}
                                     {% endfor %}
+
+                                    {% for item in external_items %}
+                                        <li class="entity-list__entity">
+                                            <div class="entity-list__entity-img-wrapper">
+                                                <a href="{{ item.link }}">
+                                                    <img src="{{ item.cover_url }}" alt="" class="entity-list__entity-img">
+                                                </a>
+                                            </div>
+                                            <div class="entity-list__entity-text">
+                                                <div class="entity-list__entity-title">
+                                                    <a href="{{ item.link }}" class="entity-list__entity-link">
+                                                        {% if request.GET.q %}
+                                                            {{ item.title | highlight:request.GET.q }}
+                                                        {% else %}
+                                                            {{ item.title }}
+                                                        {% endif %}
+                                                    </a>
+                                                    
+                                                    {% if not request.GET.c or not request.GET.c in categories %}
+                                                    <span class="entity-list__entity-category">[{{item.verbose_category_name}}]</span>
+                                                    {% endif %}
+                                                    <a href="{{ item.source_url }}">
+                                                        <span class="source-label source-label__{{ item.source_site }}">{{ item.source_site.label }}</span>
+                                                    </a>
+                                                </div>
+
+                                                <span class="entity-list__entity-info entity-list__entity-info--full-length">
+                                                    {{item.subtitle}}
+                                                </span>
+                                                <p class="entity-list__entity-brief">
+                                                    {{ item.brief }}
+                                                </p>
+                                                <div class="tag-collection">
+                                                </div>
+                                            </div>
                                         
+                                        </li>
+                                    {% endfor %}
                                 </ul>
                             </div>
                             <div class="pagination" >
diff --git a/common/views.py b/common/views.py
index e03da393..eb188b31 100644
--- a/common/views.py
+++ b/common/views.py
@@ -21,11 +21,13 @@ from common.models import MarkStatusEnum
 from common.utils import PageLinksGenerator
 from common.scraper import scraper_registry
 from common.config import *
+from common.searcher import ExternalSources
 from management.models import Announcement
 from django.conf import settings
 
 logger = logging.getLogger(__name__)
 
+
 @login_required
 def home(request):
     return user_home(request, request.user.id)
@@ -302,6 +304,7 @@ def search(request):
             "common/search_result.html",
             {
                 "items": items,
+                "external_items": ExternalSources.search(category, input_string, page_number),
                 "categories": categories,
             }
         )