# catalog/sites/douban.py
import json
import re
from catalog.common import *
from catalog.search.models import ExternalSearchResultItem
# Matches a run of consecutive digits. NOTE(review): the trailing `\d*` is
# redundant — `\d+` alone matches exactly the same strings.
RE_NUMBERS = re.compile(r"\d+\d*")
# Matches any run of whitespace characters.
RE_WHITESPACES = re.compile(r"\s+")
class DoubanDownloader(ProxiedDownloader):
    """Downloader that classifies Douban responses for censorship/ban signals."""

    def validate_response(self, response) -> int:
        """Classify *response* into one of the ``RESPONSE_*`` codes.

        Returns network-error for a missing or inauthentic response,
        censorship for blocked/removed/under-review pages, OK for a genuine
        content page, and invalid-content for any other HTTP status.
        """
        if response is None:
            return RESPONSE_NETWORK_ERROR
        if response.status_code == 204:
            return RESPONSE_CENSORSHIP
        if response.status_code != 200:
            return RESPONSE_INVALID_CONTENT

        body = response.content.decode("utf-8")

        # A real Douban page contains one of these boilerplate markers;
        # their absence suggests the response is not authentic Douban
        # content (garbage page, or possibly an IP ban).
        if "关于豆瓣" not in body and "豆瓣评分" not in body:
            return RESPONSE_NETWORK_ERROR

        # Markers for entries that are missing, unlisted, or held for review.
        blocked_markers = (
            "<title>页面不存在</title>",
            "呃... 你想访问的条目豆瓣不收录。",
            "根据相关法律法规,当前条目正在等待审核。",
        )
        if any(marker in body for marker in blocked_markers):
            return RESPONSE_CENSORSHIP

        return RESPONSE_OK
class DoubanSearcher:
    """Scrapes Douban's subject-search page into external search results."""

    @classmethod
    def search(cls, cat: ItemCategory, c: str, q: str, p: int = 1):
        """Search Douban and return a list of ``ExternalSearchResultItem``.

        Args:
            cat: category attached to each returned result item.
            c: Douban site section used as the URL path segment
               (presumably e.g. "movie" / "book" — determined by callers).
            q: query text; URL-encoded before building the request.
            p: 1-based page number; Douban pages results in steps of 15.
        """
        from urllib.parse import quote_plus

        # Encode the query: an unescaped q with spaces or non-ASCII text
        # would otherwise produce a malformed request URL.
        url = (
            f"https://search.douban.com/{c}/subject_search"
            f"?search_text={quote_plus(q)}&start={15 * (p - 1)}"
        )
        content = DoubanDownloader(url).download().html()
        # Result data is embedded in an inline <script> as
        # `window.__DATA__ = {...};` — slice out the object literal and
        # restore the closing brace lost to the `};` split.
        script = content.xpath(
            "//script[text()[contains(.,'window.__DATA__')]]/text()"
        )[0]  # type:ignore
        j = json.loads(
            script.split("window.__DATA__ = ")[1].split("};")[0] + "}"  # type:ignore
        )
        # Bug fix: the original comprehension repeated `for item in
        # j["items"]` twice, iterating a cartesian product and returning
        # each matching item len(j["items"]) times. Iterate once.
        return [
            ExternalSearchResultItem(
                cat,
                SiteName.Douban,
                item["url"],
                item["title"],
                item["abstract"],
                item["abstract_2"],
                item["cover_url"],
            )
            for item in j["items"]
            if item.get("tpl_name") == "search_subject"
        ]