add imdb scraper

2021-02-18 15:44:10 +01:00 · 2021-02-18 15:44:10 +01:00 · b192f32d4d
commit b192f32d4d
parent 6b07963b51
5 changed files with 118 additions and 7 deletions
--- a/common/models.py
+++ b/common/models.py
@ -21,6 +21,7 @@ class SourceSiteEnum(models.TextChoices):
    IN_SITE = "in-site", CLIENT_NAME
    DOUBAN = "douban",  _("豆瓣")
    SPOTIFY = "spotify", _("Spotify")
+    IMDB = "imdb", _("IMDb")


 class Entity(models.Model):
--- a/common/scraper.py
+++ b/common/scraper.py
@ -9,7 +9,7 @@ import time
 from lxml import html
 from mimetypes import guess_extension
 from threading import Thread
-from boofilsic.settings import LUMINATI_USERNAME, LUMINATI_PASSWORD, DEBUG
+from boofilsic.settings import LUMINATI_USERNAME, LUMINATI_PASSWORD, DEBUG, IMDB_API_KEY
 from boofilsic.settings import SPOTIFY_CREDENTIAL
 from django.utils import timezone
 from django.utils.translation import ugettext_lazy as _
@ -645,7 +645,6 @@ spotify_token_expire_time = time.time()

 class SpotifyTrackScraper(AbstractScraper):
    site_name = SourceSiteEnum.SPOTIFY.value
-    # API URL
    host = 'https://open.spotify.com/track/'
    data_class = Song
    form_class = SongForm
@ -918,3 +917,100 @@ def invoke_spotify_token():
    # minus 2 for execution time error
    spotify_token_expire_time = int(data['expires_in']) + time.time() - 2
    spotify_token = data['access_token']
+
+
+class ImdbMovieScraper(AbstractScraper):
+    """Scraper for movie / TV-series entries identified by an IMDb title URL.
+
+    The metadata is not parsed from imdb.com pages; it is requested as JSON
+    from the imdb-api.com ``Title`` endpoint (see ``get_api_url``).
+    """
+
+    site_name = SourceSiteEnum.IMDB.value
+    host = 'https://www.imdb.com/title/'
+    data_class = Movie
+    form_class = MovieForm
+
+    # Matches the alphanumeric title id that directly follows the
+    # ``https://www.imdb.com/title/`` prefix.
+    regex = re.compile(r"(?<=https://www\.imdb\.com/title/)[a-zA-Z0-9]+")
+
+    # Seconds to wait for the API; without a timeout a stalled connection
+    # would block the calling thread indefinitely.
+    api_timeout = 30
+
+    @staticmethod
+    def _names(entries):
+        """Return the ``name`` of each entry; a missing (``None``) list is empty."""
+        return [entry['name'] for entry in entries or []]
+
+    @staticmethod
+    def _split_list(value):
+        """Split a ``', '``-separated string; ``None`` or ``''`` gives ``[]``."""
+        return value.split(', ') if value else []
+
+    def scrape(self, url):
+        """Fetch the title referenced by ``url`` and build the movie data dict.
+
+        Args:
+            url: any URL containing ``https://www.imdb.com/title/<id>``.
+
+        Returns:
+            A ``(data, raw_img)`` tuple. ``data`` is the dict of movie fields
+            assembled below; ``raw_img`` is the first value returned by
+            ``self.download_image``. Both, plus the image extension, are
+            also stored on ``self.raw_data`` / ``self.raw_img`` /
+            ``self.img_ext``.
+
+        Raises:
+            ValueError: if ``url`` contains no IMDb title id, or the API
+                reports a type other than ``Movie`` / ``TVSeries``.
+        """
+        effective_url = self.get_effective_url(url)
+        if effective_url is None:
+            raise ValueError("not valid url")
+
+        api_url = self.get_api_url(effective_url)
+        r = requests.get(api_url, timeout=self.api_timeout)
+        res_data = r.json()
+
+        item_type = res_data['type']
+        if item_type not in ('Movie', 'TVSeries'):
+            raise ValueError("not movie/series item")
+        is_series = item_type == 'TVSeries'
+
+        # Fields may come back empty/null (the original code already guarded
+        # some of them); every optional field is now treated the same way
+        # instead of crashing on ``None``.
+        release_date = res_data['releaseDate']
+        showtime = [{release_date: "发布日期"}] if release_date else None
+        # NOTE(review): assumes a non-empty 'year' is a plain integer string
+        # -- confirm against the API for series with year ranges.
+        year = int(res_data['year']) if res_data['year'] else None
+
+        other_info = {
+            '分级': res_data['contentRating'] or None,
+            'IMDb评分': res_data['imDbRating'] or None,
+            'Metacritic评分': res_data['metacriticRating'] or None,
+            '奖项': res_data['awards'] or None,
+        }
+
+        raw_img, ext = self.download_image(res_data['image'])
+
+        data = {
+            'title': res_data['title'],
+            'orig_title': res_data['originalTitle'],
+            'other_title': None,
+            'imdb_code': self.regex.findall(effective_url)[0],
+            'director': self._names(res_data['directorList']),
+            'playwright': self._names(res_data['writerList']),
+            'actor': self._names(res_data['actorList']),
+            'genre': self._split_list(res_data['genres']),
+            'showtime': showtime,
+            'site': None,
+            'area': self._split_list(res_data['countries']),
+            'language': self._split_list(res_data['languages']),
+            'year': year,
+            'duration': res_data['runtimeStr'],
+            'season': None,
+            'episodes': None,
+            'single_episode_length': None,
+            # Prefer the localized plot, falling back to the default one.
+            'brief': res_data['plotLocal'] or res_data['plot'],
+            'is_series': is_series,
+            'other_info': other_info,
+            'source_site': self.site_name,
+            'source_url': effective_url,
+        }
+        self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
+        return data, raw_img
+
+    @classmethod
+    def get_effective_url(cls, raw_url):
+        """Return the canonical ``https://www.imdb.com/title/<id>/`` URL.
+
+        Returns ``None`` when ``raw_url`` contains no IMDb title id.
+        """
+        code = cls.regex.findall(raw_url)
+        if not code:
+            return None
+        return f"https://www.imdb.com/title/{code[0]}/"
+
+    @classmethod
+    def get_api_url(cls, url):
+        """Build the imdb-api.com ``Title`` request URL for an IMDb title URL.
+
+        ``url`` must contain a title id (e.g. a value returned by
+        ``get_effective_url``); otherwise the ``[0]`` lookup raises
+        ``IndexError``.
+        """
+        return f"https://imdb-api.com/zh/API/Title/{IMDB_API_KEY}/{cls.regex.findall(url)[0]}/FullActor,"
--- a/common/static/css/boofilsic.css
+++ b/common/static/css/boofilsic.css
@ -1216,7 +1216,7 @@ select::placeholder {
  font-weight: lighter;
  letter-spacing: 0.1rem;
  word-break: keep-all;
-  opacity: 0.8;
+  opacity: 1;
  position: relative;
  top: -1px;
 }
@ -1239,6 +1239,13 @@ select::placeholder {
  font-weight: bold;
 }

+/* Source label variant for IMDb entries: yellow (#F5C518) background with near-black (#121212) bold text and no border. */
+.source-label.source-label__imdb {
+  background-color: #F5C518;
+  color: #121212;
+  border: none;
+  font-weight: bold;
+}
+
 .main-section-wrapper {
  padding: 32px 48px 32px 36px;
  background-color: #f7f7f7;
--- a/common/static/css/boofilsic.min.css
+++ b/common/static/css/boofilsic.min.css
--- a/common/static/sass/_Label.sass
+++ b/common/static/sass/_Label.sass
@ -1,10 +1,12 @@
 // source label name should match the enum value in `common.models.SourceSiteEnum`

+$in-site-color: $color-primary
 $douban-color-primary: #319840
 $douban-color-secondary: white
-$in-site-color: $color-primary
 $spotify-color-primary: #1ed760
 $spotify-color-secondary: black
+$imdb-color-primary: #F5C518
+$imdb-color-secondary: #121212

 .source-label
    display: inline
@ -20,7 +22,7 @@ $spotify-color-secondary: black
    font-weight: lighter
    letter-spacing: 0.1rem
    word-break: keep-all
-    opacity: 0.8
+    opacity: 1


    position: relative
@ -38,4 +40,9 @@ $spotify-color-secondary: black
        background-color: $spotify-color-primary
        color: $spotify-color-secondary
        border: none
-        font-weight: bold
+        font-weight: bold
+    &.source-label__imdb
+        background-color: $imdb-color-primary
+        color: $imdb-color-secondary
+        border: none
+        font-weight: bold