lib.itmens/common/scrapers/douban.py

import requests
import re
import filetype
from lxml import html
from common.models import SourceSiteEnum
from movies.models import Movie, MovieGenreEnum
from movies.forms import MovieForm
from books.models import Book
from books.forms import BookForm
from music.models import Album
from music.forms import AlbumForm
from games.models import Game
from games.forms import GameForm
from django.core.validators import URLValidator
from django.conf import settings
from PIL import Image
from io import BytesIO
from common.scraper import *


class DoubanScrapperMixin:
    @classmethod
    def download_page(cls, url, headers):
        url = cls.get_effective_url(url)
        r = None
        error = 'DoubanScrapper: error occured when downloading ' + url
        content = None
        last_error = None

        def get(url):
            nonlocal r
            # print('Douban GET ' + url)
            try:
                r = requests.get(url, timeout=settings.SCRAPING_TIMEOUT)
            except Exception as e:
                r = requests.Response()
                r.status_code = f"Exception when GET {url} {e}" + url
            # print('Douban CODE ' + str(r.status_code))
            return r

        def check_content():
            nonlocal r, error, content, last_error
            content = None
            last_error = None
            if r.status_code == 200:
                content = r.content.decode('utf-8')
                if content.find('关于豆瓣') == -1:
                    if content.find('你的 IP 发出') == -1:
                        error = error + 'Content not authentic'  # response is garbage
                    else:
                        error = error + 'IP banned'
                    content = None
                    last_error = 'network'
                elif content.find('<title>页面不存在</title>') != -1 or content.find('呃... 你想访问的条目豆瓣不收录。') != -1:  # re.search('不存在[^<]+</title>', content, re.MULTILINE):
                    content = None
                    last_error = 'censorship'
                    error = error + 'Not found or hidden by Douban'
            elif r.status_code == 204:
                content = None
                last_error = 'censorship'
                error = error + 'Not found or hidden by Douban'
            else:
                content = None
                last_error = 'network'
                error = error + str(r.status_code)

        def fix_wayback_links():
            nonlocal content
            # fix links
            content = re.sub(r'href="http[^"]+http', r'href="http', content)
            # https://img9.doubanio.com/view/subject/{l|m|s}/public/s1234.jpg
            content = re.sub(r'src="[^"]+/(s\d+\.\w+)"',
                             r'src="https://img9.doubanio.com/view/subject/m/public/\1"', content)
            # https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2681329386.jpg
            # https://img9.doubanio.com/view/photo/{l|m|s}/public/p1234.webp
            content = re.sub(r'src="[^"]+/(p\d+\.\w+)"',
                             r'src="https://img9.doubanio.com/view/photo/m/public/\1"', content)

        # Wayback Machine: get latest available
        def wayback():
            nonlocal r, error, content
            error = error + '\nWayback: '
            get('http://archive.org/wayback/available?url=' + url)
            if r.status_code == 200:
                w = r.json()
                if w['archived_snapshots'] and w['archived_snapshots']['closest']:
                    get(w['archived_snapshots']['closest']['url'])
                    check_content()
                    if content is not None:
                        fix_wayback_links()
                else:
                    error = error + 'No snapshot available'
            else:
                error = error + str(r.status_code)

        # Wayback Machine: guess via CDX API
        def wayback_cdx():
            nonlocal r, error, content
            error = error + '\nWayback: '
            get('http://web.archive.org/cdx/search/cdx?url=' + url)
            if r.status_code == 200:
                dates = re.findall(r'[^\s]+\s+(\d+)\s+[^\s]+\s+[^\s]+\s+\d+\s+[^\s]+\s+\d{5,}',
                                   r.content.decode('utf-8'))
                # assume snapshots whose size >9999 contain real content, use the latest one of them
                if len(dates) > 0:
                    get('http://web.archive.org/web/' + dates[-1] + '/' + url)
                    check_content()
                    if content is not None:
                        fix_wayback_links()
                else:
                    error = error + 'No snapshot available'
            else:
                error = error + str(r.status_code)

        def latest():
            nonlocal r, error, content
            if settings.SCRAPESTACK_KEY is not None:
                error = error + '\nScrapeStack: '
                get(f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={url}')
            elif settings.SCRAPERAPI_KEY is not None:
                error = error + '\nScraperAPI: '
                get(f'http://api.scraperapi.com?api_key={settings.SCRAPERAPI_KEY}&url={url}')
            else:
                error = error + '\nDirect: '
                get(url)
            check_content()
            if last_error == 'network' and settings.PROXYCRAWL_KEY is not None:
                error = error + '\nProxyCrawl: '
                get(f'https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={url}')
                check_content()
            if last_error == 'censorship' and settings.LOCAL_PROXY is not None:
                error = error + '\nLocal: '
                get(f'{settings.LOCAL_PROXY}?url={url}')
                check_content()

        latest()
        if content is None:
            wayback_cdx()

        if content is None:
            raise RuntimeError(error)
        # with open('/tmp/temp.html', 'w', encoding='utf-8') as fp:
        #     fp.write(content)
        return html.fromstring(content)

    @classmethod
    def download_image(cls, url, item_url=None):
        raw_img = None
        ext = None

        if settings.SCRAPESTACK_KEY is not None:
            dl_url = f'http://api.scrapestack.com/scrape?access_key={settings.SCRAPESTACK_KEY}&url={url}'
        elif settings.SCRAPERAPI_KEY is not None:
            dl_url = f'http://api.scraperapi.com?api_key={settings.SCRAPERAPI_KEY}&url={url}'
        else:
            dl_url = url

        try:
            img_response = requests.get(dl_url, timeout=settings.SCRAPING_TIMEOUT)
            if img_response.status_code == 200:
                raw_img = img_response.content
                img = Image.open(BytesIO(raw_img))
                img.load()  # corrupted image will trigger exception
                content_type = img_response.headers.get('Content-Type')
                ext = filetype.get_type(mime=content_type.partition(';')[0].strip()).extension
            else:
                logger.error(f"Douban: download image failed {img_response.status_code} {dl_url} {item_url}")
                # raise RuntimeError(f"Douban: download image failed {img_response.status_code} {dl_url}")
        except Exception as e:
            raw_img = None
            ext = None
            logger.error(f"Douban: download image failed {e} {dl_url} {item_url}")

        if raw_img is None and settings.PROXYCRAWL_KEY is not None:
            try:
                dl_url = f'https://api.proxycrawl.com/?token={settings.PROXYCRAWL_KEY}&url={url}'
                img_response = requests.get(dl_url, timeout=settings.SCRAPING_TIMEOUT)
                if img_response.status_code == 200:
                    raw_img = img_response.content
                    img = Image.open(BytesIO(raw_img))
                    img.load()  # corrupted image will trigger exception
                    content_type = img_response.headers.get('Content-Type')
                    ext = filetype.get_type(mime=content_type.partition(';')[0].strip()).extension
                else:
                    logger.error(f"Douban: download image failed {img_response.status_code} {dl_url} {item_url}")
            except Exception as e:
                raw_img = None
                ext = None
                logger.error(f"Douban: download image failed {e} {dl_url} {item_url}")
        return raw_img, ext


class DoubanBookScraper(DoubanScrapperMixin, AbstractScraper):
    site_name = SourceSiteEnum.DOUBAN.value
    host = "book.douban.com"
    data_class = Book
    form_class = BookForm

    regex = re.compile(r"https://book\.douban\.com/subject/\d+/{0,1}")

    def scrape(self, url):
        headers = DEFAULT_REQUEST_HEADERS.copy()
        headers['Host'] = self.host
        content = self.download_page(url, headers)

        isbn_elem = content.xpath("//div[@id='info']//span[text()='ISBN:']/following::text()")
        isbn = isbn_elem[0].strip() if isbn_elem else None
        title_elem = content.xpath("/html/body//h1/span/text()")
        title = title_elem[0].strip() if title_elem else None
        if not title:
            if isbn:
                title = 'isbn: ' + isbn
            else:
                raise ValueError("given url contains no book title or isbn")

        subtitle_elem = content.xpath(
            "//div[@id='info']//span[text()='副标题:']/following::text()")
        subtitle = subtitle_elem[0].strip()[:500] if subtitle_elem else None

        orig_title_elem = content.xpath(
            "//div[@id='info']//span[text()='原作名:']/following::text()")
        orig_title = orig_title_elem[0].strip()[:500] if orig_title_elem else None

        language_elem = content.xpath(
            "//div[@id='info']//span[text()='语言:']/following::text()")
        language = language_elem[0].strip() if language_elem else None

        pub_house_elem = content.xpath(
            "//div[@id='info']//span[text()='出版社:']/following::text()")
        pub_house = pub_house_elem[0].strip() if pub_house_elem else None

        pub_date_elem = content.xpath(
            "//div[@id='info']//span[text()='出版年:']/following::text()")
        pub_date = pub_date_elem[0].strip() if pub_date_elem else ''
        year_month_day = RE_NUMBERS.findall(pub_date)
        if len(year_month_day) in (2, 3):
            pub_year = int(year_month_day[0])
            pub_month = int(year_month_day[1])
        elif len(year_month_day) == 1:
            pub_year = int(year_month_day[0])
            pub_month = None
        else:
            pub_year = None
            pub_month = None
        if pub_year and pub_month and pub_year < pub_month:
            pub_year, pub_month = pub_month, pub_year
        pub_year = None if pub_year is not None and pub_year not in range(
            0, 3000) else pub_year
        pub_month = None if pub_month is not None and pub_month not in range(
            1, 12) else pub_month

        binding_elem = content.xpath(
            "//div[@id='info']//span[text()='装帧:']/following::text()")
        binding = binding_elem[0].strip() if binding_elem else None

        price_elem = content.xpath(
            "//div[@id='info']//span[text()='定价:']/following::text()")
        price = price_elem[0].strip() if price_elem else None

        pages_elem = content.xpath(
            "//div[@id='info']//span[text()='页数:']/following::text()")
        pages = pages_elem[0].strip() if pages_elem else None
        if pages is not None:
            pages = int(RE_NUMBERS.findall(pages)[
                        0]) if RE_NUMBERS.findall(pages) else None
            if pages and (pages > 999999 or pages < 1):
                pages = None

        brief_elem = content.xpath(
            "//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()")
        brief = '\n'.join(p.strip()
                          for p in brief_elem) if brief_elem else None

        contents = None
        try:
            contents_elem = content.xpath(
                "//h2/span[text()='目录']/../following-sibling::div[1]")[0]
            # if next the id of next sibling contains `dir`, that would be the full contents
            if "dir" in contents_elem.getnext().xpath("@id")[0]:
                contents_elem = contents_elem.getnext()
                contents = '\n'.join(p.strip() for p in contents_elem.xpath(
                    "text()")[:-2]) if contents_elem else None
            else:
                contents = '\n'.join(p.strip() for p in contents_elem.xpath(
                    "text()")) if contents_elem else None
        except Exception:
            pass

        img_url_elem = content.xpath("//*[@id='mainpic']/a/img/@src")
        img_url = img_url_elem[0].strip() if img_url_elem else None
        raw_img, ext = self.download_image(img_url, url)

        # there are two html formats for authors and translators
        authors_elem = content.xpath("""//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/
            preceding-sibling::a[preceding-sibling::span[text()='作者:']]/text()""")
        if not authors_elem:
            authors_elem = content.xpath(
                """//div[@id='info']//span[text()=' 作者']/following-sibling::a/text()""")
        if authors_elem:
            authors = []
            for author in authors_elem:
                authors.append(RE_WHITESPACES.sub(' ', author.strip())[:200])
        else:
            authors = None

        translators_elem = content.xpath("""//div[@id='info']//span[text()='译者:']/following-sibling::br[1]/
            preceding-sibling::a[preceding-sibling::span[text()='译者:']]/text()""")
        if not translators_elem:
            translators_elem = content.xpath(
                """//div[@id='info']//span[text()=' 译者']/following-sibling::a/text()""")
        if translators_elem:
            translators = []
            for translator in translators_elem:
                translators.append(RE_WHITESPACES.sub(' ', translator.strip()))
        else:
            translators = None

        other = {}
        cncode_elem = content.xpath(
            "//div[@id='info']//span[text()='统一书号:']/following::text()")
        if cncode_elem:
            other['统一书号'] = cncode_elem[0].strip()
        series_elem = content.xpath(
            "//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()")
        if series_elem:
            other['丛书'] = series_elem[0].strip()
        imprint_elem = content.xpath(
            "//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()")
        if imprint_elem:
            other['出品方'] = imprint_elem[0].strip()

        data = {
            'title': title,
            'subtitle': subtitle,
            'orig_title': orig_title,
            'author': authors,
            'translator': translators,
            'language': language,
            'pub_house': pub_house,
            'pub_year': pub_year,
            'pub_month': pub_month,
            'binding': binding,
            'price': price,
            'pages': pages,
            'isbn': isbn,
            'brief': brief,
            'contents': contents,
            'other_info': other,
            'source_site': self.site_name,
            'source_url': self.get_effective_url(url),
        }
        self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
        return data, raw_img


class DoubanMovieScraper(DoubanScrapperMixin, AbstractScraper):
    site_name = SourceSiteEnum.DOUBAN.value
    host = 'movie.douban.com'
    data_class = Movie
    form_class = MovieForm

    regex = re.compile(r"https://movie\.douban\.com/subject/\d+/{0,1}")

    def scrape(self, url):
        headers = DEFAULT_REQUEST_HEADERS.copy()
        headers['Host'] = self.host
        content = self.download_page(url, headers)

        # parsing starts here
        try:
            raw_title = content.xpath(
                "//span[@property='v:itemreviewed']/text()")[0].strip()
        except IndexError:
            raise ValueError("given url contains no movie info")

        orig_title = content.xpath(
            "//img[@rel='v:image']/@alt")[0].strip()
        title = raw_title.split(orig_title)[0].strip()
        # if has no chinese title
        if title == '':
            title = orig_title

        if title == orig_title:
            orig_title = None

        # there are two html formats for authors and translators
        other_title_elem = content.xpath(
            "//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]")
        other_title = other_title_elem[0].strip().split(
            ' / ') if other_title_elem else None

        imdb_elem = content.xpath(
            "//div[@id='info']//span[text()='IMDb链接:']/following-sibling::a[1]/text()")
        if not imdb_elem:
            imdb_elem = content.xpath(
                "//div[@id='info']//span[text()='IMDb:']/following-sibling::text()[1]")
        imdb_code = imdb_elem[0].strip() if imdb_elem else None

        director_elem = content.xpath(
            "//div[@id='info']//span[text()='导演']/following-sibling::span[1]/a/text()")
        director = director_elem if director_elem else None

        playwright_elem = content.xpath(
            "//div[@id='info']//span[text()='编剧']/following-sibling::span[1]/a/text()")
        playwright = list(map(lambda a: a[:200], playwright_elem)) if playwright_elem else None

        actor_elem = content.xpath(
            "//div[@id='info']//span[text()='主演']/following-sibling::span[1]/a/text()")
        actor = list(map(lambda a: a[:200], actor_elem)) if actor_elem else None

        # construct genre translator
        genre_translator = {}
        attrs = [attr for attr in dir(MovieGenreEnum) if '__' not in attr]
        for attr in attrs:
            genre_translator[getattr(MovieGenreEnum, attr).label] = getattr(
                MovieGenreEnum, attr).value

        genre_elem = content.xpath("//span[@property='v:genre']/text()")
        if genre_elem:
            genre = []
            for g in genre_elem:
                g = g.split(' ')[0]
                if g == '紀錄片':  # likely some original data on douban was corrupted
                    g = '纪录片'
                elif g == '鬼怪':
                    g = '惊悚'
                if g in genre_translator:
                    genre.append(genre_translator[g])
                elif g in genre_translator.values():
                    genre.append(g)
                else:
                    logger.error(f'unable to map genre {g}')
        else:
            genre = None

        showtime_elem = content.xpath(
            "//span[@property='v:initialReleaseDate']/text()")
        if showtime_elem:
            showtime = []
            for st in showtime_elem:
                parts = st.split('(')
                if len(parts) == 1:
                    time = st.split('(')[0]
                    region = ''
                else:
                    time = st.split('(')[0]
                    region = st.split('(')[1][0:-1]
                showtime.append({time: region})
        else:
            showtime = None

        site_elem = content.xpath(
            "//div[@id='info']//span[text()='官方网站:']/following-sibling::a[1]/@href")
        site = site_elem[0].strip()[:200] if site_elem else None
        try:
            validator = URLValidator()
            validator(site)
        except ValidationError:
            site = None

        area_elem = content.xpath(
            "//div[@id='info']//span[text()='制片国家/地区:']/following-sibling::text()[1]")
        if area_elem:
            area = [a.strip()[:100] for a in area_elem[0].split('/')]
        else:
            area = None

        language_elem = content.xpath(
            "//div[@id='info']//span[text()='语言:']/following-sibling::text()[1]")
        if language_elem:
            language = [a.strip() for a in language_elem[0].split(' / ')]
        else:
            language = None

        year_elem = content.xpath("//span[@class='year']/text()")
        year = int(re.search(r'\d+', year_elem[0])[0]) if year_elem and re.search(r'\d+', year_elem[0]) else None

        duration_elem = content.xpath("//span[@property='v:runtime']/text()")
        other_duration_elem = content.xpath(
            "//span[@property='v:runtime']/following-sibling::text()[1]")
        if duration_elem:
            duration = duration_elem[0].strip()
            if other_duration_elem:
                duration += other_duration_elem[0].rstrip()
            duration = duration.split('/')[0].strip()
        else:
            duration = None

        season_elem = content.xpath(
            "//*[@id='season']/option[@selected='selected']/text()")
        if not season_elem:
            season_elem = content.xpath(
                "//div[@id='info']//span[text()='季数:']/following-sibling::text()[1]")
            season = int(season_elem[0].strip()) if season_elem else None
        else:
            season = int(season_elem[0].strip())

        episodes_elem = content.xpath(
            "//div[@id='info']//span[text()='集数:']/following-sibling::text()[1]")
        episodes = int(episodes_elem[0].strip()) if episodes_elem and episodes_elem[0].isdigit() else None

        single_episode_length_elem = content.xpath(
            "//div[@id='info']//span[text()='单集片长:']/following-sibling::text()[1]")
        single_episode_length = single_episode_length_elem[0].strip(
        )[:100] if single_episode_length_elem else None

        # if has field `episodes` not none then must be series
        is_series = True if episodes else False

        brief_elem = content.xpath("//span[@class='all hidden']")
        if not brief_elem:
            brief_elem = content.xpath("//span[@property='v:summary']")
        brief = '\n'.join([e.strip() for e in brief_elem[0].xpath(
            './text()')]) if brief_elem else None

        img_url_elem = content.xpath("//img[@rel='v:image']/@src")
        img_url = img_url_elem[0].strip() if img_url_elem else None
        raw_img, ext = self.download_image(img_url, url)

        data = {
            'title': title,
            'orig_title': orig_title,
            'other_title': other_title,
            'imdb_code': imdb_code,
            'director': director,
            'playwright': playwright,
            'actor': actor,
            'genre': genre,
            'showtime': showtime,
            'site': site,
            'area': area,
            'language': language,
            'year': year,
            'duration': duration,
            'season': season,
            'episodes': episodes,
            'single_episode_length': single_episode_length,
            'brief': brief,
            'is_series': is_series,
            'source_site': self.site_name,
            'source_url': self.get_effective_url(url),
        }
        self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
        return data, raw_img


class DoubanAlbumScraper(DoubanScrapperMixin, AbstractScraper):
    site_name = SourceSiteEnum.DOUBAN.value
    host = 'music.douban.com'
    data_class = Album
    form_class = AlbumForm

    regex = re.compile(r"https://music\.douban\.com/subject/\d+/{0,1}")

    def scrape(self, url):
        headers = DEFAULT_REQUEST_HEADERS.copy()
        headers['Host'] = self.host
        content = self.download_page(url, headers)

        # parsing starts here
        try:
            title = content.xpath("//h1/span/text()")[0].strip()
        except IndexError:
            raise ValueError("given url contains no album info")
        if not title:
            raise ValueError("given url contains no album info")

        artists_elem = content.xpath("//div[@id='info']/span/span[@class='pl']/a/text()")
        artist = None if not artists_elem else list(map(lambda a: a[:200], artists_elem))

        genre_elem = content.xpath(
            "//div[@id='info']//span[text()='流派:']/following::text()[1]")
        genre = genre_elem[0].strip() if genre_elem else None

        date_elem = content.xpath(
            "//div[@id='info']//span[text()='发行时间:']/following::text()[1]")
        release_date = parse_date(date_elem[0].strip()) if date_elem else None

        company_elem = content.xpath(
            "//div[@id='info']//span[text()='出版者:']/following::text()[1]")
        company = company_elem[0].strip() if company_elem else None

        track_list_elem = content.xpath(
            "//div[@class='track-list']/div[@class='indent']/div/text()"
        )
        if track_list_elem:
            track_list = '\n'.join([track.strip() for track in track_list_elem])
        else:
            track_list = None

        brief_elem = content.xpath("//span[@class='all hidden']")
        if not brief_elem:
            brief_elem = content.xpath("//span[@property='v:summary']")
        brief = '\n'.join([e.strip() for e in brief_elem[0].xpath(
            './text()')]) if brief_elem else None

        other_info = {}
        other_elem = content.xpath(
            "//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]")
        if other_elem:
            other_info['又名'] = other_elem[0].strip()
        other_elem = content.xpath(
            "//div[@id='info']//span[text()='专辑类型:']/following-sibling::text()[1]")
        if other_elem:
            other_info['专辑类型'] = other_elem[0].strip()
        other_elem = content.xpath(
            "//div[@id='info']//span[text()='介质:']/following-sibling::text()[1]")
        if other_elem:
            other_info['介质'] = other_elem[0].strip()
        other_elem = content.xpath(
            "//div[@id='info']//span[text()='ISRC:']/following-sibling::text()[1]")
        if other_elem:
            other_info['ISRC'] = other_elem[0].strip()
        other_elem = content.xpath(
            "//div[@id='info']//span[text()='条形码:']/following-sibling::text()[1]")
        if other_elem:
            other_info['条形码'] = other_elem[0].strip()
        other_elem = content.xpath(
            "//div[@id='info']//span[text()='碟片数:']/following-sibling::text()[1]")
        if other_elem:
            other_info['碟片数'] = other_elem[0].strip()

        img_url_elem = content.xpath("//div[@id='mainpic']//img/@src")
        img_url = img_url_elem[0].strip() if img_url_elem else None
        raw_img, ext = self.download_image(img_url, url)

        data = {
            'title': title,
            'artist': artist,
            'genre': genre,
            'release_date': release_date,
            'duration': None,
            'company': company,
            'track_list': track_list,
            'brief': brief,
            'other_info': other_info,
            'source_site': self.site_name,
            'source_url': self.get_effective_url(url),
        }
        self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
        return data, raw_img


class DoubanGameScraper(DoubanScrapperMixin, AbstractScraper):
    site_name = SourceSiteEnum.DOUBAN.value
    host = 'www.douban.com/game/'
    data_class = Game
    form_class = GameForm

    regex = re.compile(r"https://www\.douban\.com/game/\d+/{0,1}")

    def scrape(self, url):
        headers = DEFAULT_REQUEST_HEADERS.copy()
        headers['Host'] = 'www.douban.com'
        content = self.download_page(url, headers)

        try:
            raw_title = content.xpath(
                "//div[@id='content']/h1/text()")[0].strip()
        except IndexError:
            raise ValueError("given url contains no game info")

        title = raw_title

        other_title_elem = content.xpath(
            "//dl[@class='game-attr']//dt[text()='别名:']/following-sibling::dd[1]/text()")
        other_title = other_title_elem[0].strip().split(' / ') if other_title_elem else None

        developer_elem = content.xpath(
            "//dl[@class='game-attr']//dt[text()='开发商:']/following-sibling::dd[1]/text()")
        developer = developer_elem[0].strip().split(' / ') if developer_elem else None

        publisher_elem = content.xpath(
            "//dl[@class='game-attr']//dt[text()='发行商:']/following-sibling::dd[1]/text()")
        publisher = publisher_elem[0].strip().split(' / ') if publisher_elem else None

        platform_elem = content.xpath(
            "//dl[@class='game-attr']//dt[text()='平台:']/following-sibling::dd[1]/a/text()")
        platform = platform_elem if platform_elem else None

        genre_elem = content.xpath(
            "//dl[@class='game-attr']//dt[text()='类型:']/following-sibling::dd[1]/a/text()")
        genre = None
        if genre_elem:
            genre = [g for g in genre_elem if g != '游戏']

        date_elem = content.xpath(
            "//dl[@class='game-attr']//dt[text()='发行日期:']/following-sibling::dd[1]/text()")
        release_date = parse_date(date_elem[0].strip()) if date_elem else None

        brief_elem = content.xpath("//div[@class='mod item-desc']/p/text()")
        brief = '\n'.join(brief_elem) if brief_elem else None

        img_url_elem = content.xpath(
            "//div[@class='item-subject-info']/div[@class='pic']//img/@src")
        img_url = img_url_elem[0].strip() if img_url_elem else None
        raw_img, ext = self.download_image(img_url, url)

        data = {
            'title': title,
            'other_title': other_title,
            'developer': developer,
            'publisher': publisher,
            'release_date': release_date,
            'genre': genre,
            'platform': platform,
            'brief': brief,
            'other_info': None,
            'source_site': self.site_name,
            'source_url': self.get_effective_url(url),
        }

        self.raw_data, self.raw_img, self.img_ext = data, raw_img, ext
        return data, raw_img