add missing MovieGenre
This commit is contained in:
parent
829c23f94e
commit
dbdae6d58c
2 changed files with 22 additions and 13 deletions
|
@ -79,10 +79,11 @@ def parse_date(raw_str):
|
|||
return dateparser.parse(
|
||||
raw_str,
|
||||
settings={
|
||||
"RELATIVE_BASE": datetime.datetime(1900, 1, 1)
|
||||
"RELATIVE_BASE": datetime.datetime(1900, 1, 1)
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
class AbstractScraper:
|
||||
"""
|
||||
Scrape entities. The entities means those defined in the models.py file,
|
||||
|
@ -249,9 +250,12 @@ class DoubanScrapperMixin:
|
|||
if r.status_code == 200:
|
||||
content = r.content.decode('utf-8')
|
||||
if content.find('关于豆瓣') == -1:
|
||||
if content.find('你的 IP 发出') == -1:
|
||||
error = error + 'Content not authentic' # response is garbage
|
||||
else:
|
||||
error = error + 'IP banned'
|
||||
content = None
|
||||
last_error = 'network'
|
||||
error = error + 'Content not authentic' # response is garbage
|
||||
elif re.search('不存在[^<]+</title>', content, re.MULTILINE):
|
||||
content = None
|
||||
last_error = 'censorship'
|
||||
|
@ -313,12 +317,12 @@ class DoubanScrapperMixin:
|
|||
if settings.SCRAPESTACK_KEY is not None:
|
||||
error = error + '\nScrapeStack: '
|
||||
get(f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={url}', 30)
|
||||
elif settings.SCRAPERAPI_KEY is None:
|
||||
error = error + '\nDirect: '
|
||||
get(url, 30)
|
||||
else:
|
||||
elif settings.SCRAPERAPI_KEY is not None:
|
||||
error = error + '\nScraperAPI: '
|
||||
get(f'http://api.scraperapi.com?api_key={settings.SCRAPERAPI_KEY}&url={url}', 30)
|
||||
else:
|
||||
error = error + '\nDirect: '
|
||||
get(url, 30)
|
||||
check_content()
|
||||
if last_error == 'network' and settings.PROXYCRAWL_KEY is not None:
|
||||
error = error + '\nProxyCrawl: '
|
||||
|
@ -340,11 +344,12 @@ class DoubanScrapperMixin:
|
|||
raw_img = None
|
||||
ext = None
|
||||
|
||||
dl_url = url
|
||||
if settings.SCRAPESTACK_KEY is not None:
|
||||
dl_url = f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={url}'
|
||||
elif settings.SCRAPERAPI_KEY is not None:
|
||||
dl_url = f'http://api.scraperapi.com?api_key={settings.SCRAPERAPI_KEY}&url={url}'
|
||||
else:
|
||||
dl_url = url
|
||||
|
||||
try:
|
||||
img_response = requests.get(dl_url, timeout=30)
|
||||
|
@ -361,6 +366,7 @@ class DoubanScrapperMixin:
|
|||
raw_img = None
|
||||
ext = None
|
||||
logger.error(f"Douban: download image failed {e} {dl_url} {item_url}")
|
||||
|
||||
if raw_img is None and settings.PROXYCRAWL_KEY is not None:
|
||||
try:
|
||||
dl_url = f'https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={url}'
|
||||
|
@ -430,9 +436,9 @@ class DoubanBookScraper(DoubanScrapperMixin, AbstractScraper):
|
|||
pub_month = None
|
||||
if pub_year and pub_month and pub_year < pub_month:
|
||||
pub_year, pub_month = pub_month, pub_year
|
||||
pub_year = None if pub_year is not None and not pub_year in range(
|
||||
pub_year = None if pub_year is not None and pub_year not in range(
|
||||
0, 3000) else pub_year
|
||||
pub_month = None if pub_month is not None and not pub_month in range(
|
||||
pub_month = None if pub_month is not None and pub_month not in range(
|
||||
1, 12) else pub_month
|
||||
|
||||
binding_elem = content.xpath(
|
||||
|
@ -598,7 +604,7 @@ class DoubanMovieScraper(DoubanScrapperMixin, AbstractScraper):
|
|||
|
||||
# construct genre translator
|
||||
genre_translator = {}
|
||||
attrs = [attr for attr in dir(MovieGenreEnum) if not '__' in attr]
|
||||
attrs = [attr for attr in dir(MovieGenreEnum) if '__' not in attr]
|
||||
for attr in attrs:
|
||||
genre_translator[getattr(MovieGenreEnum, attr).label] = getattr(
|
||||
MovieGenreEnum, attr).value
|
||||
|
@ -738,8 +744,7 @@ class DoubanAlbumScraper(DoubanScrapperMixin, AbstractScraper):
|
|||
if not title:
|
||||
raise ValueError("given url contains no album info")
|
||||
|
||||
|
||||
artists_elem = content.xpath("""//div[@id='info']/span/span[@class='pl']/a/text()""")
|
||||
artists_elem = content.xpath("//div[@id='info']/span/span[@class='pl']/a/text()")
|
||||
artist = None if not artists_elem else artists_elem
|
||||
|
||||
genre_elem = content.xpath(
|
||||
|
@ -1648,6 +1653,7 @@ class GoodreadsScraper(AbstractScraper):
|
|||
self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
|
||||
return data, raw_img
|
||||
|
||||
|
||||
class TmdbMovieScraper(AbstractScraper):
|
||||
site_name = SourceSiteEnum.TMDB.value
|
||||
host = 'https://www.themoviedb.org/'
|
||||
|
@ -1777,6 +1783,7 @@ class TmdbMovieScraper(AbstractScraper):
|
|||
else:
|
||||
return None
|
||||
|
||||
|
||||
# https://developers.google.com/youtube/v3/docs/?apix=true
|
||||
# https://developers.google.com/books/docs/v1/using
|
||||
class GoogleBooksScraper(AbstractScraper):
|
||||
|
@ -1855,4 +1862,5 @@ class GoogleBooksScraper(AbstractScraper):
|
|||
self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
|
||||
return data, raw_img
|
||||
|
||||
from common.scrapers.bandcamp import BandcampAlbumScraper
|
||||
|
||||
from common.scrapers.bandcamp import BandcampAlbumScraper
|
||||
|
|
|
@ -52,6 +52,7 @@ class MovieGenreEnum(models.TextChoices):
|
|||
NEWS = 'News', _('新闻')
|
||||
SOAP = 'Soap', _('肥皂剧')
|
||||
TV_MOVIE = 'TV Movie', _('电视电影')
|
||||
THEATRE = 'Theatre', _('舞台艺术')
|
||||
OTHER = 'Other', _('其他')
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue