summaryrefslogtreecommitdiff
path: root/scraper
diff options
context:
space:
mode:
author gesang <gesang@itinerariummentis.org> 2024-07-15 08:45:19 +0000
committer gesang <gesang@itinerariummentis.org> 2024-07-15 08:45:19 +0000
commit cd8115bb1a79fcbb8853e3c5130d090b42fa3941 (patch)
tree d846f74ea8e5b6490b3c069900c496b032000b13 /scraper
parent edfa4b96dcdf950eb9983a3886b5a0a2bf0674b1 (diff)
parent 029af216d48c56ad310f4ba82ac0ed2fb57d5e32 (diff)
Merge branch 'master' of https://git.lolcat.ca/lolcat/4get
Diffstat (limited to 'scraper')
-rw-r--r-- scraper/googlealt.php 182
-rw-r--r-- scraper/startpage.php 802
2 files changed, 903 insertions, 81 deletions
diff --git a/scraper/googlealt.php b/scraper/googlealt.php
index aa523db..d7878cf 100644
--- a/scraper/googlealt.php
+++ b/scraper/googlealt.php
@@ -535,6 +535,8 @@ class googlealt{
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
curl_setopt($curlproc, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V6);
+
+
// use http2
curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
@@ -704,6 +706,43 @@ class googlealt{
// reset
$this->fuckhtml->load($result_div);
+ }else{
+
+ // get the "Did you mean?" prompt
+ $taw =
+ $this->fuckhtml
+ ->getElementById(
+ "taw"
+ );
+
+ if($taw){
+
+ $this->fuckhtml->load($taw);
+
+ $as =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ if(count($as) !== 0){
+
+ $text =
+ $this->fuckhtml
+ ->getTextContent(
+ $as[0]
+ );
+
+ // @TODO implement did_you_mean
+ $out["spelling"] = [
+ "type" => "including",
+ "using" => $search,
+ "correction" => $text
+ ];
+ }
+ }
+
+ $this->fuckhtml->load($result_div);
}
//
@@ -895,36 +934,10 @@ class googlealt{
// get "Related Searches" and "People also search for"
//
$relateds =
- array_merge(
- $this->fuckhtml
- ->getElementsByClassName(
- $this->getstyle(
- [
- "align-items" => "center",
- "background-color" => "#28292a",
- "border-radius" => "100px",
- "box-sizing" => "border-box",
- "display" => "flex",
- "max-height" => "none",
- "min-height" => "48px",
- "padding-left" => "17px",
- "padding-right" => "17px",
- "position" => "relative"
- ]
- ) . " " .
- $this->getstyle(
- [
- "margin-left" => "8px",
- "margin-right" => "8px"
- ]
- ),
- "a"
- ),
- $this->fuckhtml
- ->getElementsByClassName(
- "wyccme",
- "div"
- )
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "wyccme",
+ "div"
);
foreach($relateds as $related){
@@ -1354,7 +1367,7 @@ class googlealt{
"font-size" => "12px",
"line-height" => "1.34",
"display" => "inline-block",
- "font-family" => "Google Sans,arial,sans-serif",
+ "font-family" => "google sans,arial,sans-serif",
"padding-right" => "0",
"white-space" => "nowrap"
]
@@ -1401,7 +1414,7 @@ class googlealt{
"line-height" => "22px",
"overflow" => "hidden",
"word-break" => "break-word",
- "color" => "#bdc1c6"
+ "color" => "#4d5156"
]
),
"div"
@@ -1415,12 +1428,9 @@ class googlealt{
->getElementsByClassName(
$this->getstyle(
[
- "border-radius" => "10px",
- "font-family" => "arial,sans-serif-medium,sans-serif",
- "font-size" => "12px",
- "line-height" => "16px",
- "padding-block" => "2px",
- "padding-inline" => "8px"
+ "background-color" => "rgba(0,0,0,0.6)",
+ "color" => "#fff",
+ "fill" => "#fff"
]
),
"div"
@@ -1433,14 +1443,6 @@ class googlealt{
->getTextContent(
$duration[0]
);
-
- // remove duration from description
- $description[0]["innerHTML"] =
- str_replace(
- $duration[0]["outerHTML"],
- "",
- $description[0]["innerHTML"]
- );
}
$web["description"] =
@@ -1979,7 +1981,7 @@ class googlealt{
"font-size" => "12px",
"line-height" => "1.34",
"display" => "inline-block",
- "font-family" => "Google Sans,arial,sans-serif",
+ "font-family" => "google sans,arial,sans-serif",
"padding-right" => "0",
"white-space" => "nowrap"
]
@@ -2211,7 +2213,7 @@ class googlealt{
->getElementsByClassName(
$this->getstyle(
[
- "font-family" => "Google Sans,arial,sans-serif",
+ "font-family" => "google sans,arial,sans-serif",
"font-size" => "28px",
"line-height" => "36px"
]
@@ -2801,7 +2803,22 @@ class googlealt{
}
}
- // get thumbnail
+ // get heading element
+ $heading =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "role",
+ "heading",
+ "div"
+ );
+
+ if(count($heading) === 0){
+
+ // no heading, fuck this.
+ continue;
+ }
+
+ // get thumbnail before loading heading object
$image =
$this->fuckhtml
->getElementsByAttributeName(
@@ -2823,35 +2840,6 @@ class googlealt{
];
}
- // get title
- $title =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->getstyle(
- [
- "font-family" => "arial,sans-serif",
- "font-size" => "16px",
- "font-weight" => "400",
- "line-height" => "24px"
- ]
- ),
- "div"
- );
-
- if(count($title) === 0){
-
- // ?? no title
- continue;
- }
-
- $title =
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $title[0]
- )
- );
-
// get duration
$duration_div =
$this->fuckhtml
@@ -2908,6 +2896,38 @@ class googlealt{
}
}
+ // load heading
+ $this->fuckhtml->load($heading[0]);
+
+ // get title
+ $title =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "font-family" => "arial,sans-serif",
+ "font-size" => "16px",
+ "font-weight" => "400",
+ "line-height" => "24px"
+ ]
+ ),
+ "div"
+ );
+
+ if(count($title) === 0){
+
+ // ?? no title
+ continue;
+ }
+
+ $title =
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $title[0]
+ )
+ );
+
// get date
$date_div =
$this->fuckhtml
@@ -3940,7 +3960,7 @@ class googlealt{
for($k=0; $k<count($values_regex[1]); $k++){
$values[trim($values_regex[1][$k])] =
- trim($values_regex[2][$k]);
+ strtolower(trim($values_regex[2][$k]));
}
$names = explode(",", $matches[1][$i]);
@@ -3971,7 +3991,7 @@ class googlealt{
foreach($this->styles[":root"] as $key => $value){
- $this->css_colors[$value] = $key;
+ $this->css_colors[$value] = strtolower($key);
}
}
}
@@ -4206,7 +4226,7 @@ class googlealt{
throw new Exception("Failed to get HTML");
}
- //$html = file_get_contents("scraper/google-video.html");
+ //$html = file_get_contents("scraper/google.html");
$response = $this->parsepage($html, "videos", $search, $proxy, $params);
$out = [
diff --git a/scraper/startpage.php b/scraper/startpage.php
new file mode 100644
index 0000000..fe63dfd
--- /dev/null
+++ b/scraper/startpage.php
@@ -0,0 +1,802 @@
+<?php
+
+class startpage{
+
+	/**
+	 * Pull in the backend and HTML-parser libraries and create one instance
+	 * of each for later use by this scraper.
+	 */
+	public function __construct(){
+
+		// load both library files up front, then build the helpers
+		include "lib/backend.php";
+		include "lib/fuckhtml.php";
+
+		$this->backend = new backend("startpage");
+		$this->fuckhtml = new fuckhtml();
+	}
+
+	/**
+	 * Describe the filters offered for a given result page.
+	 *
+	 * Only the "web" page is known; any other value yields null.
+	 * A filter without a "display" key is not meant to be shown in the
+	 * frontend.
+	 *
+	 * @param mixed $page Page identifier (compared loosely against "web").
+	 * @return array|null Filter definitions keyed by filter name, or null.
+	 */
+	public function getfilters($page){
+
+		if($page != "web"){
+
+			// unknown page: nothing to offer
+			return null;
+		}
+
+		$regions = [
+			"any" => "All Regions",
+			"es_AR" => "Argentina",
+			"en_AU" => "Australia",
+			"de_AT" => "Austria",
+			"ru_BY" => "Belarus",
+			"fr_BE" => "Belgium (FR)",
+			"nl_BE" => "Belgium (NL)",
+			"bg_BG" => "Bulgaria",
+			"en_CA" => "Canada (EN)",
+			"fr_CA" => "Canada (FR)",
+			"es_CL" => "Chile",
+			"es_CO" => "Colombia",
+			"cs_CZ" => "Czech Republic",
+			"da_DK" => "Denmark",
+			"ar_EG" => "Egypt",
+			"et_EE" => "Estonia",
+			"fi_FI" => "Finland",
+			"fr_FR" => "France",
+			"de_DE" => "Germany",
+			"el_GR" => "Greece",
+			"hu_HU" => "Hungary",
+			"hi_IN" => "India (HI)",
+			"en_IN" => "India (EN)",
+			"id_ID" => "Indonesia (ID)",
+			"en_ID" => "Indonesia (EN)",
+			"en_IE" => "Ireland",
+			"it_IT" => "Italy",
+			"ja_JP" => "Japan",
+			"ko_KR" => "Korea",
+			"ms_MY" => "Malaysia (MS)",
+			"en_MY" => "Malaysia (EN)",
+			"es_MX" => "Mexico",
+			"nl_NL" => "Netherlands",
+			"en_NZ" => "New Zealand",
+			"no_NO" => "Norway",
+			"es_PE" => "Peru",
+			"fil_PH" => "Philippines (FIL)",
+			"en_PH" => "Philippines (EN)",
+			"pl_PL" => "Poland",
+			"pt_PT" => "Portugal",
+			"ro_RO" => "Romania",
+			"ru_RU" => "Russia",
+			"ms_SG" => "Singapore (MS)",
+			"en_SG" => "Singapore (EN)",
+			"es_ES" => "Spain (ES)",
+			"ca_ES" => "Spain (CA)",
+			"sv_SE" => "Sweden",
+			"de_CH" => "Switzerland (DE)",
+			"fr_CH" => "Switzerland (FR)",
+			"it_CH" => "Switzerland (IT)",
+			"tr_TR" => "Turkey",
+			"uk_UA" => "Ukraine",
+			"en_US" => "US (EN)",
+			"es_US" => "US (ES)",
+			"es_UY" => "Uruguay",
+			"es_VE" => "Venezuela",
+			"vi_VN" => "Vietnam (VI)",
+			"en_VN" => "Vietnam (EN)",
+			"en_ZA" => "South Africa"
+		];
+
+		// request parameter: qadf (yes => qadf=none, no => qadf=heavy)
+		$nsfw = [
+			"yes" => "Yes",
+			"no" => "No"
+		];
+
+		// request parameter: with_date
+		$ages = [
+			"any" => "Any time",
+			"d" => "Past 24 hours",
+			"w" => "Past week",
+			"m" => "Past month",
+			"y" => "Past year"
+		];
+
+		$toggle = [
+			"yes" => "Yes",
+			"no" => "No"
+		];
+
+		return [
+			"country" => [
+				"display" => "Country",
+				"option" => $regions
+			],
+			"nsfw" => [
+				"display" => "NSFW",
+				"option" => $nsfw
+			],
+			"time" => [
+				"display" => "Time fetched",
+				"option" => $ages
+			],
+			// no "display" key on purpose, so it wont show in frontend
+			"extendedsearch" => [
+				"option" => $toggle
+			]
+		];
+	}
+
+	/**
+	 * Perform one HTTP request through cURL and return the response body.
+	 *
+	 * @param mixed  $proxy  Handed unchanged to backend::assign_proxy().
+	 * @param string $url    Target URL.
+	 * @param mixed  $get    GET: array appended to $url as a query string.
+	 *                       POST: request body given to CURLOPT_POSTFIELDS;
+	 *                       its strlen() is sent as Content-Length, so the
+	 *                       POST and XHR paths expect a string here.
+	 * @param bool   $post   true to send a POST request.
+	 * @param bool   $is_xhr true to send the JSON (XHR-like) header set.
+	 * @return string|bool   Whatever curl_exec() returned.
+	 * @throws Exception     Carrying curl_error() when cURL reports an error.
+	 */
+	private function get($proxy, $url, $get = [], $post = false, $is_xhr = false){
+
+		$curlproc = curl_init();
+
+		if($post === true){
+
+			curl_setopt($curlproc, CURLOPT_POST, true);
+			curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get);
+
+		}elseif($get !== []){
+
+			$get = http_build_query($get);
+			$url .= "?" . $get;
+		}
+
+		curl_setopt($curlproc, CURLOPT_URL, $url);
+
+		// http2 bypass
+		curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+
+		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
+
+		// header values shared by every request flavour, defined once so
+		// the three header sets below cannot drift apart
+		$user_agent = "User-Agent: " . config::USER_AGENT;
+		$accept_html = "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8";
+		$cookie = "Cookie: preferences=date_timeEEEworldN1Ndisable_family_filterEEE1N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE0N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fdevice%2FenN1NlanguageEEEenglishN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE20N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius";
+
+		if($is_xhr === true){
+
+			// JSON request
+			$headers = [
+				$user_agent,
+				"Accept: application/json",
+				"Accept-Language: en-US,en;q=0.5",
+				"Accept-Encoding: gzip",
+				"Referer: https://www.startpage.com/",
+				"Content-Type: application/json",
+				"Content-Length: " . strlen($get),
+				"Origin: https://www.startpage.com/",
+				"DNT: 1",
+				"Connection: keep-alive",
+				$cookie,
+				"Sec-Fetch-Dest: empty",
+				"Sec-Fetch-Mode: cors",
+				"Sec-Fetch-Site: same-origin",
+				"TE: trailers"
+			];
+
+		}elseif($post === true){
+
+			// form submission
+			$headers = [
+				$user_agent,
+				$accept_html,
+				"Accept-Language: en-US,en;q=0.5",
+				"Accept-Encoding: gzip",
+				"Referer: https://www.startpage.com/",
+				"Content-Type: application/x-www-form-urlencoded",
+				"Content-Length: " . strlen($get),
+				"DNT: 1",
+				"Connection: keep-alive",
+				$cookie,
+				"Upgrade-Insecure-Requests: 1",
+				"Sec-Fetch-Dest: document",
+				"Sec-Fetch-Mode: navigate",
+				"Sec-Fetch-Site: none",
+				"Sec-Fetch-User: ?1",
+				"Priority: u=0, i",
+				"TE: trailers"
+			];
+		}else{
+
+			// plain page load
+			$headers = [
+				$user_agent,
+				$accept_html,
+				"Accept-Language: en-US,en;q=0.5",
+				"Accept-Encoding: gzip",
+				"DNT: 1",
+				"Connection: keep-alive",
+				$cookie,
+				"Sec-Fetch-Dest: document",
+				"Sec-Fetch-Mode: navigate",
+				"Sec-Fetch-Site: none",
+				"Sec-Fetch-User: ?1",
+				"Priority: u=0, i",
+				"TE: trailers"
+			];
+		}
+
+		curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
+
+		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
+		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
+		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
+		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
+		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+
+		$this->backend->assign_proxy($curlproc, $proxy);
+
+		$data = curl_exec($curlproc);
+
+		if(curl_errno($curlproc)){
+
+			// read the message first, then release the handle so the
+			// error path does not leak it
+			$message = curl_error($curlproc);
+			curl_close($curlproc);
+
+			throw new Exception($message);
+		}
+
+		curl_close($curlproc);
+		return $data;
+	}
+
+	/**
+	 * Run a web search and normalize the response.
+	 *
+	 * Keys read from $get: "npt" (next-page token; when truthy, the request
+	 * body and proxy come from the backend store), "s" (query), "nsfw",
+	 * "country", "time" and "extendedsearch".
+	 *
+	 * The result page embeds its data as a JSON argument to
+	 * React.createElement(UIStartpage.AppSerpWeb, ...); that object is
+	 * extracted and mapped onto the output structure. When
+	 * "extendedsearch" is "yes" and the safe-search filter is off on a
+	 * first-page request, a second request fetches the knowledge panel.
+	 * Failures there are ignored on purpose, so the web results survive.
+	 *
+	 * @param array $get Request parameters.
+	 * @return array Keys: status, spelling, npt, answer, web, image, video,
+	 *               news, related.
+	 * @throws Exception When the search page cannot be fetched, or the
+	 *                   embedded JSON cannot be located or decoded.
+	 */
+	public function web($get){
+
+		if($get["npt"]){
+
+			// follow-up page: request body and proxy come from the store
+			[$post, $proxy] = $this->backend->get($get["npt"], "web");
+
+			try{
+				$html = $this->get(
+					$proxy,
+					"https://www.startpage.com/sp/search",
+					$post,
+					true
+				);
+			}catch(Exception $error){
+
+				throw new Exception("Failed to fetch search page");
+			}
+
+			$get_instant_answer = false;
+
+		}else{
+
+			$proxy = $this->backend->get_ip();
+
+			$params = [
+				"query" => $get["s"],
+				"cat" => "web",
+				"pl" => "opensearch"
+			];
+
+			if($get["nsfw"] == "no"){
+
+				$params["qadf"] = "heavy";
+				$get_instant_answer = false;
+			}else{
+
+				$get_instant_answer = true;
+			}
+
+			if($get["country"] !== "any"){
+
+				$params["qsr"] = $get["country"];
+			}
+
+			if($get["time"] !== "any"){
+
+				$params["with_date"] = $get["time"];
+			}
+
+			try{
+				$html = $this->get(
+					$proxy,
+					"https://www.startpage.com/sp/search",
+					$params
+				);
+			}catch(Exception $error){
+
+				throw new Exception("Failed to fetch search page");
+			}
+
+			//$html = file_get_contents("scraper/startpage.html");
+		}
+
+		// preg_match returns 1 on a match, 0 on no match and false on a
+		// PCRE error; anything but 1 leaves $matches unusable
+		if(
+			preg_match(
+				'/React\.createElement\(UIStartpage\.AppSerpWeb, ?(.+)\),$/m',
+				$html,
+				$matches
+			) !== 1
+		){
+
+			throw new Exception("Failed to grep JSON object");
+		}
+
+		$json = json_decode($matches[1], true);
+
+		// a scalar would decode fine but cannot be indexed below
+		if(!is_array($json)){
+
+			throw new Exception("Failed to decode JSON");
+		}
+
+		$out = [
+			"status" => "ok",
+			"spelling" => [
+				"type" => "no_correction",
+				"using" => null,
+				"correction" => null
+			],
+			"npt" => null,
+			"answer" => [],
+			"web" => [],
+			"image" => [],
+			"video" => [],
+			"news" => [],
+			"related" => []
+		];
+
+		// get npt
+		$pages = $json["render"]["presenter"]["pagination"]["pages"] ?? [];
+
+		if(!is_array($pages)){
+
+			$pages = [];
+		}
+
+		foreach($pages as $page){
+
+			if(
+				!isset($page["name"], $page["url"]) ||
+				$page["name"] != "Next"
+			){
+
+				continue;
+			}
+
+			$url_parts = explode("?", $page["url"], 2);
+
+			if(isset($url_parts[1])){
+
+				parse_str($url_parts[1], $str);
+
+				$out["npt"] =
+					$this->backend->store(
+						http_build_query(
+							[
+								"lui" => "english",
+								"language" => "english",
+								"query" => $str["q"] ?? null,
+								"cat" => "web",
+								"sc" => $str["sc"] ?? null,
+								"t" => "device",
+								"segment" => "startpage.udog",
+								"page" => $str["page"] ?? null
+							]
+						),
+						"web",
+						$proxy
+					);
+			}
+
+			// only the first "Next" entry is considered
+			break;
+		}
+
+		$mainline = $json["render"]["presenter"]["regions"]["mainline"] ?? [];
+
+		if(!is_array($mainline)){
+
+			$mainline = [];
+		}
+
+		foreach($mainline as $category){
+
+			if(!isset($category["display_type"])){
+
+				continue;
+			}
+
+			switch($category["display_type"]){
+
+				case "web-google":
+					foreach(($category["results"] ?? []) as $result){
+
+						$sublinks = [];
+
+						// siteLinks is not guaranteed to be present
+						foreach(($result["siteLinks"] ?? []) as $sublink){
+
+							$sublinks[] = [
+								"title" => $sublink["title"],
+								"description" => null,
+								"url" => $sublink["clickUrl"]
+							];
+						}
+
+						// a leading "<date> ..." prefix is split off below
+						$description =
+							explode(
+								"...",
+								$this->titledots(
+									html_entity_decode(
+										$this->fuckhtml
+										->getTextContent(
+											$result["description"]
+										)
+									)
+								),
+								2
+							);
+
+						$date = strtotime(trim($description[0]));
+
+						if(
+							$date === false ||
+							count($description) !== 2 ||
+							strlen($description[0]) > 14
+						){
+
+							// no date found
+							$description =
+								implode(
+									" ... ",
+									$description
+								);
+
+							$date = null;
+						}else{
+
+							// date found
+							$description = ltrim($description[1]);
+						}
+
+						$out["web"][] = [
+							"title" =>
+								$this->titledots(
+									html_entity_decode(
+										$this->fuckhtml
+										->getTextContent(
+											$result["title"]
+										)
+									)
+								),
+							"description" => $description,
+							"url" => $result["clickUrl"],
+							"date" => $date,
+							"type" => "web",
+							"thumb" => [
+								"url" => null,
+								"ratio" => null
+							],
+							"sublink" => $sublinks,
+							"table" => []
+						];
+					}
+					break;
+
+				case "images-qi-top":
+					foreach(($category["results"] ?? []) as $result){
+
+						$out["image"][] = [
+							"title" =>
+								$this->titledots(
+									html_entity_decode(
+										$this->fuckhtml
+										->getTextContent(
+											$result["title"]
+										)
+									)
+								),
+							"source" => [
+								[
+									"url" => $result["rawImageUrl"],
+									"width" => (int)$result["width"],
+									"height" => (int)$result["height"]
+								],
+								[
+									"url" => $this->unshitimage($result["mdThumbnailUrl"]),
+									"width" => (int)$result["mdThumbnailWidth"],
+									"height" => (int)$result["mdThumbnailHeight"]
+								]
+							],
+							"url" =>
+								$result["altClickUrl"]
+						];
+					}
+					break;
+			}
+		}
+
+		// parse instant answers
+		if(
+			$get["extendedsearch"] == "yes" &&
+			$get_instant_answer === true
+		){
+
+			// https://www.startpage.com/sp/qi?qimsn=ex&sxap=%2Fv1%2Fquery&sc=BqZ3inqrAgF701&sr=1
+			try{
+				$post = [
+					"se" => "n0vze2y9dqwy",
+					"q" => $json["render"]["query"],
+					"results" => [], // populate
+					"enableKnowledgePanel" => true,
+					"enableMediaThumbBar" => false,
+					"enableSearchSuggestions" => false,
+					"enableTripadvisorProperties" => [],
+					"enableTripadvisorPlaces" => [],
+					"enableTripadvisorPlacesForLocations" => [],
+					"enableWebProducts" => false,
+					"tripadvisorPartnerId" => null,
+					"tripadvisorMapColorMode" => "light",
+					"tripadvisorDisablesKnowledgePanel" => false,
+					"instantAnswers" => [
+						"smartAnswers",
+						"youtube",
+						"tripadvisor"
+					],
+					"iaType" => null,
+					"forceEnhancedKnowledgePanel" => false,
+					"shoppingOnly" => false,
+					"allowAdultProducts" => true,
+					"lang" => "en",
+					"browserLang" => "en-US",
+					"browserTimezone" => "America/New_York",
+					"market" => null,
+					"userLocation" => null,
+					"userDate" => date("Y-m-d"),
+					"userAgentType" => "unknown"
+				];
+
+				// the endpoint is given the web results collected above
+				foreach($out["web"] as $result){
+
+					$post["results"][] = [
+						"url" => $result["url"],
+						"title" => $result["title"]
+					];
+				}
+
+				$post = json_encode($post, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE);
+
+				$additional_data =
+					$this->get(
+						$proxy,
+						"https://www.startpage.com/sp/qi?qimsn=ex&sxap=%2Fv1%2Fquery&sc=" . $json["render"]["callback_sc"] . "&sr=1",
+						$post,
+						true,
+						true
+					);
+
+				$additional_data = json_decode($additional_data, true);
+
+				if($additional_data === null){
+
+					throw new Exception("Failed to decode JSON"); // just break out, dont fail completely
+				}
+
+				if(!isset($additional_data["knowledgePanel"])){
+
+					throw new Exception("Response has missing data (knowledgePanel)");
+				}
+
+				$additional_data = $additional_data["knowledgePanel"];
+
+				$answer = [
+					"title" => $additional_data["meta"]["title"],
+					"description" => [
+						[
+							"type" => "quote",
+							"value" => $additional_data["meta"]["description"]
+						]
+					],
+					"url" => $additional_data["meta"]["origWikiUrl"],
+					"thumb" => $additional_data["meta"]["image"],
+					"table" => [],
+					"sublink" => []
+				];
+
+				// parse html for instant answer
+				$this->fuckhtml->load($additional_data["html"]);
+
+				$div =
+					$this->fuckhtml
+					->getElementsByTagName(
+						"div"
+					);
+
+				// get description
+				$description =
+					$this->fuckhtml
+					->getElementsByClassName(
+						"sx-kp-short-extract sx-kp-short-extract-complete",
+						$div
+					);
+
+				if(count($description) !== 0){
+
+					$answer["description"][] = [
+						"type" => "text",
+						"value" =>
+							$this->fuckhtml
+							->getTextContent(
+								$description[0]
+							)
+					];
+				}
+
+				// get socials
+				$socials =
+					$this->fuckhtml
+					->getElementsByClassName(
+						"sx-wiki-social-link",
+						"a"
+					);
+
+				foreach($socials as $social){
+
+					$title =
+						$this->fuckhtml
+						->getTextContent(
+							$social["attributes"]["title"]
+						);
+
+					$url =
+						$this->fuckhtml
+						->getTextContent(
+							$social["attributes"]["href"]
+						);
+
+					switch($title){
+
+						case "Official Website":
+							$title = "Website";
+							break;
+					}
+
+					$answer["sublink"][$title] = $url;
+				}
+
+				// get videos
+				$videos =
+					$this->fuckhtml
+					->getElementsByClassName(
+						"sx-kp-video-grid-item",
+						$div
+					);
+
+				foreach($videos as $video){
+
+					// scope the parser to this grid item
+					$this->fuckhtml->load($video);
+
+					$as =
+						$this->fuckhtml
+						->getElementsByTagName(
+							"a"
+						);
+
+					if(count($as) === 0){
+
+						// ?? invalid
+						continue;
+					}
+
+					$image =
+						$this->fuckhtml
+						->getElementsByAttributeName(
+							"data-sx-src",
+							"img"
+						);
+
+					if(count($image) !== 0){
+
+						$thumb = [
+							"ratio" => "16:9",
+							"url" =>
+								$this->fuckhtml
+								->getTextContent(
+									$image[0]["attributes"]["data-sx-src"]
+								)
+						];
+					}else{
+
+						$thumb = [
+							"ratio" => null,
+							"url" => null
+						];
+					}
+
+					$out["video"][] = [
+						"title" =>
+							$this->fuckhtml
+							->getTextContent(
+								$as[0]["attributes"]["title"]
+							),
+						"description" => null,
+						"date" => null,
+						"duration" => null,
+						"views" => null,
+						"thumb" => $thumb,
+						"url" =>
+							$this->fuckhtml
+							->getTextContent(
+								$as[0]["attributes"]["href"]
+							)
+					];
+				}
+
+				// reset
+				$this->fuckhtml->load($additional_data["html"]);
+
+				// get table elements
+				$table =
+					$this->fuckhtml
+					->getElementsByClassName(
+						"sx-infobox",
+						"table"
+					);
+
+				if(count($table) !== 0){
+
+					// rows are collected before the parser is re-scoped
+					$trs =
+						$this->fuckhtml
+						->getElementsByTagName(
+							"tr"
+						);
+
+					foreach($trs as $tr){
+
+						$this->fuckhtml->load($tr);
+
+						// ok so startpage devs cant fucking code a table
+						// td = content
+						// th (AAAHH) = title
+						$tds =
+							$this->fuckhtml
+							->getElementsByTagName(
+								"td"
+							);
+
+						$ths =
+							$this->fuckhtml
+							->getElementsByTagName(
+								"th"
+							);
+
+						if(
+							count($ths) === 1 &&
+							count($tds) === 1
+						){
+
+							$title =
+								$this->fuckhtml
+								->getTextContent(
+									$ths[0]
+								);
+
+							$description = [];
+
+							$this->fuckhtml->load($tds[0]);
+
+							$lis =
+								$this->fuckhtml
+								->getElementsByTagName(
+									"li"
+								);
+
+							if(count($lis) !== 0){
+
+								// list cell: join the items
+								foreach($lis as $li){
+
+									$description[] =
+										$this->fuckhtml
+										->getTextContent(
+											$li
+										);
+								}
+
+								$description = implode(", ", $description);
+							}else{
+
+								$description =
+									$this->fuckhtml
+									->getTextContent(
+										$tds[0]
+									);
+							}
+
+							$answer["table"][$title] = $description;
+						}
+					}
+				}
+
+				$out["answer"][] = $answer;
+
+			}catch(Exception $error){
+
+				// deliberate best-effort: the instant answer is optional,
+				// so a failure here must not discard the web results
+				//echo "error!";
+			}
+		}
+
+		return $out;
+	}
+
+	/**
+	 * Recover the underlying image URL from a URL that carries it in a
+	 * "piurl" query parameter (presumably Startpage's image proxy).
+	 *
+	 * For gstatic.com targets, everything from the first "&" onwards is
+	 * dropped from the recovered URL.
+	 *
+	 * @param mixed $url Thumbnail URL as found in the result data.
+	 * @return mixed The decoded "piurl" value when present, otherwise $url
+	 *               unchanged.
+	 */
+	private function unshitimage($url){
+
+		if(!is_string($url)){
+
+			// nothing to parse
+			return $url;
+		}
+
+		$query = parse_url($url, PHP_URL_QUERY);
+
+		if(!is_string($query)){
+
+			// null: no query string, false: unparsable URL
+			return $url;
+		}
+
+		parse_str($query, $params);
+
+		if(
+			!isset($params["piurl"]) ||
+			!is_string($params["piurl"])
+		){
+
+			return $url;
+		}
+
+		// strpos() returns offset 0 for a match at the very start, so
+		// compare against false instead of relying on truthiness
+		if(strpos($params["piurl"], "gstatic.com/") !== false){
+
+			return
+				explode(
+					"&",
+					$params["piurl"],
+					2
+				)[0];
+		}
+
+		return $params["piurl"];
+	}
+
+	/**
+	 * Strip leading and trailing spaces, dots, control whitespace, NUL
+	 * bytes and the ellipsis character from a title.
+	 *
+	 * trim() treats its character list as single bytes, so listing the
+	 * multi-byte ellipsis there also strips the bytes 0xE2 and 0x80 from
+	 * other characters (curly quotes, dashes, bullets) and leaves invalid
+	 * UTF-8 behind. A UTF-8 aware pattern is used instead.
+	 *
+	 * @param mixed $title Text to clean up.
+	 * @return string Cleaned text.
+	 */
+	private function titledots($title){
+
+		$title = (string)$title;
+
+		$trimmed =
+			preg_replace(
+				'/^[ .\t\n\r\0\x0B…]+|[ .\t\n\r\0\x0B…]+$/Du',
+				"",
+				$title
+			);
+
+		if($trimmed === null){
+
+			// subject is not valid UTF-8: fall back to a byte-wise trim
+			// of the single-byte characters only
+			return trim($title, " .\t\n\r\0\x0B");
+		}
+
+		return $trimmed;
+	}
+}