# catalog/sites/douban.py
import json
import re
from catalog.common import *
from catalog.search.models import ExternalSearchResultItem
# Matches a run of consecutive digits. NOTE(review): the trailing `\d*` is
# redundant — `\d+` alone matches exactly the same strings.
RE_NUMBERS = re.compile(r"\d+\d*")
# Matches any run of whitespace characters.
RE_WHITESPACES = re.compile(r"\s+")
class DoubanDownloader(ProxiedDownloader):
    """Downloader that classifies Douban responses for censorship/ban signals."""

    def validate_response(self, response) -> int:
        """Classify *response* into one of the ``RESPONSE_*`` codes.

        Returns network-error for a missing or inauthentic response,
        censorship for blocked/removed/under-review pages, OK for a genuine
        content page, and invalid-content for any other HTTP status.
        """
        if response is None:
            return RESPONSE_NETWORK_ERROR
        if response.status_code == 204:
            return RESPONSE_CENSORSHIP
        if response.status_code != 200:
            return RESPONSE_INVALID_CONTENT

        body = response.content.decode("utf-8")

        # A real Douban page contains one of these boilerplate markers;
        # their absence suggests the response is not authentic Douban
        # content (garbage page, or possibly an IP ban).
        if "关于豆瓣" not in body and "豆瓣评分" not in body:
            return RESPONSE_NETWORK_ERROR

        # Markers for entries that are missing, unlisted, or held for review.
        blocked_markers = (
            "<title>页面不存在</title>",
            "呃... 你想访问的条目豆瓣不收录。",
            "根据相关法律法规,当前条目正在等待审核。",
        )
        if any(marker in body for marker in blocked_markers):
            return RESPONSE_CENSORSHIP

        return RESPONSE_OK
class DoubanSearcher:
    """Scrapes Douban's subject-search page into external search results."""

    @classmethod
    def search(cls, cat: ItemCategory, c: str, q: str, p: int = 1):
        """Search Douban and return a list of ``ExternalSearchResultItem``.

        Args:
            cat: category attached to each returned result item.
            c: Douban site section used as the URL path segment
               (presumably e.g. "movie" / "book" — determined by callers).
            q: query text; URL-encoded before building the request.
            p: 1-based page number; Douban pages results in steps of 15.
        """
        from urllib.parse import quote_plus

        # Encode the query: an unescaped q with spaces or non-ASCII text
        # would otherwise produce a malformed request URL.
        url = (
            f"https://search.douban.com/{c}/subject_search"
            f"?search_text={quote_plus(q)}&start={15 * (p - 1)}"
        )
        content = DoubanDownloader(url).download().html()
        # Result data is embedded in an inline <script> as
        # `window.__DATA__ = {...};` — slice out the object literal and
        # restore the closing brace lost to the `};` split.
        script = content.xpath(
            "//script[text()[contains(.,'window.__DATA__')]]/text()"
        )[0]  # type:ignore
        j = json.loads(
            script.split("window.__DATA__ = ")[1].split("};")[0] + "}"  # type:ignore
        )
        # Bug fix: the original comprehension repeated `for item in
        # j["items"]` twice, iterating a cartesian product and returning
        # each matching item len(j["items"]) times. Iterate once.
        return [
            ExternalSearchResultItem(
                cat,
                SiteName.Douban,
                item["url"],
                item["title"],
                item["abstract"],
                item["abstract_2"],
                item["cover_url"],
            )
            for item in j["items"]
            if item.get("tpl_name") == "search_subject"
        ]