From dbdae6d58c7d913dee39269e118fc4aa1febe861 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 9 Dec 2021 22:00:09 -0500 Subject: [PATCH] add missing MovieGenre --- common/scraper.py | 34 +++++++++++++++++++++------------- movies/models.py | 1 + 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/common/scraper.py b/common/scraper.py index f812edec..7e8ac673 100644 --- a/common/scraper.py +++ b/common/scraper.py @@ -79,10 +79,11 @@ def parse_date(raw_str): return dateparser.parse( raw_str, settings={ - "RELATIVE_BASE": datetime.datetime(1900, 1, 1) + "RELATIVE_BASE": datetime.datetime(1900, 1, 1) } ) + class AbstractScraper: """ Scrape entities. The entities means those defined in the models.py file, @@ -249,9 +250,12 @@ class DoubanScrapperMixin: if r.status_code == 200: content = r.content.decode('utf-8') if content.find('关于豆瓣') == -1: + if content.find('你的 IP 发出') == -1: + error = error + 'Content not authentic' # response is garbage + else: + error = error + 'IP banned' content = None last_error = 'network' - error = error + 'Content not authentic' # response is garbage elif re.search('不存在[^<]+', content, re.MULTILINE): content = None last_error = 'censorship' @@ -313,12 +317,12 @@ class DoubanScrapperMixin: if settings.SCRAPESTACK_KEY is not None: error = error + '\nScrapeStack: ' get(f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={url}', 30) - elif settings.SCRAPERAPI_KEY is None: - error = error + '\nDirect: ' - get(url, 30) - else: + elif settings.SCRAPERAPI_KEY is not None: error = error + '\nScraperAPI: ' get(f'http://api.scraperapi.com?api_key={settings.SCRAPERAPI_KEY}&url={url}', 30) + else: + error = error + '\nDirect: ' + get(url, 30) check_content() if last_error == 'network' and settings.PROXYCRAWL_KEY is not None: error = error + '\nProxyCrawl: ' @@ -340,11 +344,12 @@ class DoubanScrapperMixin: raw_img = None ext = None - dl_url = url if settings.SCRAPESTACK_KEY is not None: dl_url = f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={url}' elif settings.SCRAPERAPI_KEY is not None: dl_url = f'http://api.scraperapi.com?api_key={settings.SCRAPERAPI_KEY}&url={url}' + else: + dl_url = url try: img_response = requests.get(dl_url, timeout=30) @@ -361,6 +366,7 @@ class DoubanScrapperMixin: raw_img = None ext = None logger.error(f"Douban: download image failed {e} {dl_url} {item_url}") + if raw_img is None and settings.PROXYCRAWL_KEY is not None: try: dl_url = f'https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={url}' @@ -430,9 +436,9 @@ class DoubanBookScraper(DoubanScrapperMixin, AbstractScraper): pub_month = None if pub_year and pub_month and pub_year < pub_month: pub_year, pub_month = pub_month, pub_year - pub_year = None if pub_year is not None and not pub_year in range( + pub_year = None if pub_year is not None and pub_year not in range( 0, 3000) else pub_year - pub_month = None if pub_month is not None and not pub_month in range( + pub_month = None if pub_month is not None and pub_month not in range( 1, 12) else pub_month binding_elem = content.xpath( @@ -598,7 +604,7 @@ class DoubanMovieScraper(DoubanScrapperMixin, AbstractScraper): # construct genre translator genre_translator = {} - attrs = [attr for attr in dir(MovieGenreEnum) if not '__' in attr] + attrs = [attr for attr in dir(MovieGenreEnum) if '__' not in attr] for attr in attrs: genre_translator[getattr(MovieGenreEnum, attr).label] = getattr( MovieGenreEnum, attr).value @@ -738,8 +744,7 @@ class DoubanAlbumScraper(DoubanScrapperMixin, AbstractScraper): if not title: raise ValueError("given url contains no album info") - - artists_elem = content.xpath("""//div[@id='info']/span/span[@class='pl']/a/text()""") + artists_elem = content.xpath("//div[@id='info']/span/span[@class='pl']/a/text()") artist = None if not artists_elem else artists_elem genre_elem = content.xpath( @@ -1648,6 +1653,7 @@ class GoodreadsScraper(AbstractScraper): self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext return data, raw_img + class TmdbMovieScraper(AbstractScraper): site_name = SourceSiteEnum.TMDB.value host = 'https://www.themoviedb.org/' @@ -1777,6 +1783,7 @@ class TmdbMovieScraper(AbstractScraper): else: return None + # https://developers.google.com/youtube/v3/docs/?apix=true # https://developers.google.com/books/docs/v1/using class GoogleBooksScraper(AbstractScraper): @@ -1855,4 +1862,5 @@ class GoogleBooksScraper(AbstractScraper): self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext return data, raw_img -from common.scrapers.bandcamp import BandcampAlbumScraper \ No newline at end of file + +from common.scrapers.bandcamp import BandcampAlbumScraper diff --git a/movies/models.py b/movies/models.py index bf592b9d..84fc9270 100644 --- a/movies/models.py +++ b/movies/models.py @@ -52,6 +52,7 @@ class MovieGenreEnum(models.TextChoices): NEWS = 'News', _('新闻') SOAP = 'Soap', _('肥皂剧') TV_MOVIE = 'TV Movie', _('电视电影') + THEATRE = 'Theatre', _('舞台艺术') OTHER = 'Other', _('其他')