From 2be843a1873848ebe3b9aced888d3558e65cf3d8 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Wed, 7 Sep 2022 09:44:08 -0400
Subject: [PATCH] use IGDB for Steam

---
Review notes (this area is ignored when the patch is applied):

* Line structure, indentation and blank context lines were restored from a
  whitespace-collapsed copy; blank-line positions follow the hunk header
  counts. Verify against the target tree before applying.
* SteamGameScraper.scrape() now returns right after copying the IGDB
  result, so the Steam-store scraping code that remains below the new
  `return` (including the `'source_url': effective_url` change in the last
  steam.py hunk) is unreachable.
* IgdbGameScraper.scrape() guards 'summary', 'storyline', 'platforms',
  'first_release_date' and 'genres' with `in r`, but indexes
  r['cover']['url'] unconditionally -- presumably a KeyError for a game
  without a cover; confirm against the IGDB response schema.
* When a game has a 'storyline' but no 'summary', `brief` starts with the
  "\n\n" separator.

 common/scrapers/igdb.py  | 18 ++++++++++++++----
 common/scrapers/steam.py | 26 ++++++++++++++++++++++++--
 2 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/common/scrapers/igdb.py b/common/scrapers/igdb.py
index 6e2e2b25..eb635f5c 100644
--- a/common/scrapers/igdb.py
+++ b/common/scrapers/igdb.py
@@ -20,6 +20,13 @@ class IgdbGameScraper(AbstractScraper):
     form_class = GameForm
     regex = re.compile(r"https://www\.igdb\.com/games/([a-zA-Z0-9\-_]+)")
 
+    def scrape_steam(self, steam_url):
+        r = json.loads(wrapper.api_request('websites', f'fields *, game.*; where url = "{steam_url}";'))
+        if not r:
+            raise ValueError("Cannot find steam url in IGDB")
+        r = sorted(r, key=lambda w: w['game']['id'])
+        return self.scrape(r[0]['game']['url'])
+
     def scrape(self, url):
         m = self.regex.match(url)
         if m:
@@ -30,7 +37,8 @@ class IgdbGameScraper(AbstractScraper):
         slug = m[1]
         fields = '*, cover.url, genres.name, platforms.name, involved_companies.*, involved_companies.company.name'
         r = json.loads(wrapper.api_request('games', f'fields {fields}; where url = "{effective_url}";'))[0]
-        raw_img, ext = self.download_image('https:' + r['cover']['url'].replace('t_thumb', 't_cover_big'), url)
+        brief = r['summary'] if 'summary' in r else ''
+        brief += "\n\n" + r['storyline'] if 'storyline' in r else ''
         developer = None
         publisher = None
         release_date = None
@@ -40,7 +48,8 @@ class IgdbGameScraper(AbstractScraper):
             developer = next(iter([c['company']['name'] for c in r['involved_companies'] if c['developer'] == True]), None)
             publisher = next(iter([c['company']['name'] for c in r['involved_companies'] if c['publisher'] == True]), None)
         if 'platforms' in r:
-            platform = [p['name'] for p in r['platforms']]
+            ps = sorted(r['platforms'], key=lambda p: p['id'])
+            platform = [(p['name'] if p['id'] != 6 else 'Windows') for p in ps]
         if 'first_release_date' in r:
             release_date = datetime.datetime.fromtimestamp(r['first_release_date'], datetime.timezone.utc)
         if 'genres' in r:
@@ -60,11 +69,12 @@ class IgdbGameScraper(AbstractScraper):
             'release_date': release_date,
             'genre': genre,
             'platform': platform,
-            'brief': r['storyline'],
+            'brief': brief,
             'other_info': other_info,
             'source_site': self.site_name,
             'source_url': self.get_effective_url(url),
         }
+        raw_img, ext = self.download_image('https:' + r['cover']['url'].replace('t_thumb', 't_cover_big'), url)
 
         self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
         return data, raw_img
@@ -72,7 +82,7 @@ class IgdbGameScraper(AbstractScraper):
     @classmethod
     def get_effective_url(cls, raw_url):
         m = cls.regex.match(raw_url)
-        if raw_url:
+        if m:
             return m[0]
         else:
             return None
diff --git a/common/scrapers/steam.py b/common/scrapers/steam.py
index 41c97b43..f500bbc6 100644
--- a/common/scrapers/steam.py
+++ b/common/scrapers/steam.py
@@ -3,6 +3,7 @@ from common.models import SourceSiteEnum
 from games.models import Game
 from games.forms import GameForm
 from common.scraper import *
+from common.scrapers.igdb import IgdbGameScraper
 
 
 class SteamGameScraper(AbstractScraper):
@@ -11,9 +12,22 @@ class SteamGameScraper(AbstractScraper):
     data_class = Game
     form_class = GameForm
 
-    regex = re.compile(r"https://store\.steampowered\.com/app/\d+/{0,1}")
+    regex = re.compile(r"https://store\.steampowered\.com/app/\d+")
 
     def scrape(self, url):
+        m = self.regex.match(url)
+        if m:
+            effective_url = m[0]
+        else:
+            raise ValueError("not valid url")
+        s = IgdbGameScraper()
+        s.scrape_steam(effective_url)
+        self.raw_data = s.raw_data
+        self.raw_img = s.raw_img
+        self.img_ext = s.img_ext
+        self.raw_data['source_site'] = self.site_name
+        self.raw_data['source_url'] = effective_url
+        return self.raw_data, self.raw_img
         headers = DEFAULT_REQUEST_HEADERS.copy()
         headers['Host'] = self.host
         headers['Cookie'] = "wants_mature_content=1; birthtime=754700401;"
@@ -56,8 +70,16 @@ class SteamGameScraper(AbstractScraper):
             'brief': brief,
             'other_info': None,
             'source_site': self.site_name,
-            'source_url': self.get_effective_url(url),
+            'source_url': effective_url
         }
 
         self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
         return data, raw_img
+
+    @classmethod
+    def get_effective_url(cls, raw_url):
+        m = cls.regex.match(raw_url)
+        if m:
+            return m[0]
+        else:
+            return None