diff options
author | gesang <gesang@itinerariummentis.org> | 2024-07-15 08:45:19 +0000 |
---|---|---|
committer | gesang <gesang@itinerariummentis.org> | 2024-07-15 08:45:19 +0000 |
commit | cd8115bb1a79fcbb8853e3c5130d090b42fa3941 (patch) | |
tree | d846f74ea8e5b6490b3c069900c496b032000b13 /scraper | |
parent | edfa4b96dcdf950eb9983a3886b5a0a2bf0674b1 (diff) | |
parent | 029af216d48c56ad310f4ba82ac0ed2fb57d5e32 (diff) |
Merge branch 'master' of https://git.lolcat.ca/lolcat/4get
Diffstat (limited to 'scraper')
-rw-r--r-- | scraper/googlealt.php | 182 | ||||
-rw-r--r-- | scraper/startpage.php | 802 |
2 files changed, 903 insertions, 81 deletions
diff --git a/scraper/googlealt.php b/scraper/googlealt.php index aa523db..d7878cf 100644 --- a/scraper/googlealt.php +++ b/scraper/googlealt.php @@ -535,6 +535,8 @@ class googlealt{ curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); curl_setopt($curlproc, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V6); + + // use http2 curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); @@ -704,6 +706,43 @@ class googlealt{ // reset $this->fuckhtml->load($result_div); + }else{ + + // get the "Did you mean?" prompt + $taw = + $this->fuckhtml + ->getElementById( + "taw" + ); + + if($taw){ + + $this->fuckhtml->load($taw); + + $as = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($as) !== 0){ + + $text = + $this->fuckhtml + ->getTextContent( + $as[0] + ); + + // @TODO implement did_you_mean + $out["spelling"] = [ + "type" => "including", + "using" => $search, + "correction" => $text + ]; + } + } + + $this->fuckhtml->load($result_div); } // @@ -895,36 +934,10 @@ class googlealt{ // get "Related Searches" and "People also search for" // $relateds = - array_merge( - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "align-items" => "center", - "background-color" => "#28292a", - "border-radius" => "100px", - "box-sizing" => "border-box", - "display" => "flex", - "max-height" => "none", - "min-height" => "48px", - "padding-left" => "17px", - "padding-right" => "17px", - "position" => "relative" - ] - ) . " " . - $this->getstyle( - [ - "margin-left" => "8px", - "margin-right" => "8px" - ] - ), - "a" - ), - $this->fuckhtml - ->getElementsByClassName( - "wyccme", - "div" - ) + $this->fuckhtml + ->getElementsByClassName( + "wyccme", + "div" ); foreach($relateds as $related){ @@ -1354,7 +1367,7 @@ class googlealt{ "font-size" => "12px", "line-height" => "1.34", "display" => "inline-block", - "font-family" => "Google Sans,arial,sans-serif", + "font-family" => "google sans,arial,sans-serif", "padding-right" => "0", "white-space" => "nowrap" ] @@ -1401,7 +1414,7 @@ class googlealt{ "line-height" => "22px", "overflow" => "hidden", "word-break" => "break-word", - "color" => "#bdc1c6" + "color" => "#4d5156" ] ), "div" @@ -1415,12 +1428,9 @@ class googlealt{ ->getElementsByClassName( $this->getstyle( [ - "border-radius" => "10px", - "font-family" => "arial,sans-serif-medium,sans-serif", - "font-size" => "12px", - "line-height" => "16px", - "padding-block" => "2px", - "padding-inline" => "8px" + "background-color" => "rgba(0,0,0,0.6)", + "color" => "#fff", + "fill" => "#fff" ] ), "div" @@ -1433,14 +1443,6 @@ class googlealt{ ->getTextContent( $duration[0] ); - - // remove duration from description - $description[0]["innerHTML"] = - str_replace( - $duration[0]["outerHTML"], - "", - $description[0]["innerHTML"] - ); } $web["description"] = @@ -1979,7 +1981,7 @@ class googlealt{ "font-size" => "12px", "line-height" => "1.34", "display" => "inline-block", - "font-family" => "Google Sans,arial,sans-serif", + "font-family" => "google sans,arial,sans-serif", "padding-right" => "0", "white-space" => "nowrap" ] @@ -2211,7 +2213,7 @@ class googlealt{ ->getElementsByClassName( $this->getstyle( [ - "font-family" => "Google Sans,arial,sans-serif", + "font-family" => "google sans,arial,sans-serif", "font-size" => "28px", "line-height" => "36px" ] @@ -2801,7 +2803,22 @@ class googlealt{ } } - // get thumbnail + // get heading element + $heading = + $this->fuckhtml + ->getElementsByAttributeValue( + "role", + "heading", + "div" + ); + + if(count($heading) === 0){ + + // no heading, fuck this. + continue; + } + + // get thumbnail before loading heading object $image = $this->fuckhtml ->getElementsByAttributeName( @@ -2823,35 +2840,6 @@ class googlealt{ ]; } - // get title - $title = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "font-family" => "arial,sans-serif", - "font-size" => "16px", - "font-weight" => "400", - "line-height" => "24px" - ] - ), - "div" - ); - - if(count($title) === 0){ - - // ?? no title - continue; - } - - $title = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $title[0] - ) - ); - // get duration $duration_div = $this->fuckhtml @@ -2908,6 +2896,38 @@ class googlealt{ } } + // load heading + $this->fuckhtml->load($heading[0]); + + // get title + $title = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "font-family" => "arial,sans-serif", + "font-size" => "16px", + "font-weight" => "400", + "line-height" => "24px" + ] + ), + "div" + ); + + if(count($title) === 0){ + + // ?? no title + continue; + } + + $title = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0] + ) + ); + // get date $date_div = $this->fuckhtml @@ -3940,7 +3960,7 @@ class googlealt{ for($k=0; $k<count($values_regex[1]); $k++){ $values[trim($values_regex[1][$k])] = - trim($values_regex[2][$k]); + strtolower(trim($values_regex[2][$k])); } $names = explode(",", $matches[1][$i]); @@ -3971,7 +3991,7 @@ class googlealt{ foreach($this->styles[":root"] as $key => $value){ - $this->css_colors[$value] = $key; + $this->css_colors[$value] = strtolower($key); } } } @@ -4206,7 +4226,7 @@ class googlealt{ throw new Exception("Failed to get HTML"); } - //$html = file_get_contents("scraper/google-video.html"); + //$html = file_get_contents("scraper/google.html"); $response = $this->parsepage($html, "videos", $search, $proxy, $params); $out = [ diff --git a/scraper/startpage.php b/scraper/startpage.php new file mode 100644 index 0000000..fe63dfd --- /dev/null +++ b/scraper/startpage.php @@ -0,0 +1,802 @@ +<?php + +class startpage{ + + public function __construct(){ + + include "lib/backend.php"; + $this->backend = new backend("startpage"); + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); + } + + public function getfilters($page){ + + switch($page){ + case "web": + return [ + "country" => [ + "display" => "Country", + "option" => [ + "any" => "All Regions", + "es_AR" => "Argentina", + "en_AU" => "Australia", + "de_AT" => "Austria", + "ru_BY" => "Belarus", + "fr_BE" => "Belgium (FR)", + "nl_BE" => "Belgium (NL)", + "bg_BG" => "Bulgaria", + "en_CA" => "Canada (EN)", + "fr_CA" => "Canada (FR)", + "es_CL" => "Chile", + "es_CO" => "Colombia", + "cs_CZ" => "Czech Republic", + "da_DK" => "Denmark", + "ar_EG" => "Egypt", + "et_EE" => "Estonia", + "fi_FI" => "Finland", + "fr_FR" => "France", + "de_DE" => "Germany", + "el_GR" => "Greece", + "hu_HU" => "Hungary", + "hi_IN" => "India (HI)", + "en_IN" => "India (EN)", + "id_ID" => "Indonesia (ID)", + "en_ID" => "Indonesia (EN)", + "en_IE" => "Ireland", + "it_IT" => "Italy", + "ja_JP" => "Japan", + "ko_KR" => "Korea", + "ms_MY" => "Malaysia (MS)", + "en_MY" => "Malaysia (EN)", + "es_MX" => "Mexico", + "nl_NL" => "Netherlands", + "en_NZ" => "New Zealand", + "no_NO" => "Norway", + "es_PE" => "Peru", + "fil_PH" => "Philippines (FIL)", + "en_PH" => "Philippines (EN)", + "pl_PL" => "Poland", + "pt_PT" => "Portugal", + "ro_RO" => "Romania", + "ru_RU" => "Russia", + "ms_SG" => "Singapore (MS)", + "en_SG" => "Singapore (EN)", + "es_ES" => "Spain (ES)", + "ca_ES" => "Spain (CA)", + "sv_SE" => "Sweden", + "de_CH" => "Switzerland (DE)", + "fr_CH" => "Switzerland (FR)", + "it_CH" => "Switzerland (IT)", + "tr_TR" => "Turkey", + "uk_UA" => "Ukraine", + "en_US" => "US (EN)", + "es_US" => "US (ES)", + "es_UY" => "Uruguay", + "es_VE" => "Venezuela", + "vi_VN" => "Vietnam (VI)", + "en_VN" => "Vietnam (EN)", + "en_ZA" => "South Africa" + ] + ], + "nsfw" => [ // qadf + "display" => "NSFW", + "option" => [ + "yes" => "Yes", // qadf=none + "no" => "No" // qadf=heavy + ] + ], + "time" => [ // with_date + "display" => "Time fetched", + "option" => [ + "any" => "Any time", + "d" => "Past 24 hours", + "w" => "Past week", + "m" => "Past month", + "y" => "Past year", + ] + ], + "extendedsearch" => [ + // undefined display, so it wont show in frontend + "option" => [ + "yes" => "Yes", + "no" => "No" + ] + ] + ]; + break; + } + } + + private function get($proxy, $url, $get = [], $post = false, $is_xhr = false){ + + $curlproc = curl_init(); + + if($post === true){ + + curl_setopt($curlproc, CURLOPT_POST, true); + curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get); + + }elseif($get !== []){ + + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + // http2 bypass + curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + + if($is_xhr === true){ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: application/json", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "Referer: https://www.startpage.com/", + "Content-Type: application/json", + "Content-Length: " . strlen($get), + "Origin: https://www.startpage.com/", + "DNT: 1", + "Connection: keep-alive", + "Cookie: preferences=date_timeEEEworldN1Ndisable_family_filterEEE1N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE0N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fdevice%2FenN1NlanguageEEEenglishN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE20N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius", + "Sec-Fetch-Dest: empty", + "Sec-Fetch-Mode: cors", + "Sec-Fetch-Site: same-origin", + "TE: trailers"] + ); + + }elseif($post === true){ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "Referer: https://www.startpage.com/", + "Content-Type: application/x-www-form-urlencoded", + "Content-Length: " . strlen($get), + "DNT: 1", + "Connection: keep-alive", + "Cookie: preferences=date_timeEEEworldN1Ndisable_family_filterEEE1N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE0N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fdevice%2FenN1NlanguageEEEenglishN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE20N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1", + "Priority: u=0, i", + "TE: trailers"] + ); + }else{ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + "Connection: keep-alive", + "Cookie: preferences=date_timeEEEworldN1Ndisable_family_filterEEE1N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE0N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fdevice%2FenN1NlanguageEEEenglishN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE20N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1", + "Priority: u=0, i", + "TE: trailers"] + ); + } + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + return $data; + } + + public function web($get){ + + if($get["npt"]){ + + [$post, $proxy] = $this->backend->get($get["npt"], "web"); + + try{ + $html = $this->get( + $proxy, + "https://www.startpage.com/sp/search", + $post, + true + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + + $get_instant_answer = false; + + }else{ + + $proxy = $this->backend->get_ip(); + + $params = [ + "query" => $get["s"], + "cat" => "web", + "pl" => "opensearch" + ]; + + if($get["nsfw"] == "no"){ + + $params["qadf"] = "heavy"; + $get_instant_answer = false; + }else{ + + $get_instant_answer = true; + } + + if($get["country"] !== "any"){ + + $params["qsr"] = $get["country"]; + } + + if($get["time"] !== "any"){ + + $params["with_date"] = $get["time"]; + } + + try{ + $html = $this->get( + $proxy, + "https://www.startpage.com/sp/search", + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + + //$html = file_get_contents("scraper/startpage.html"); + } + + if( + preg_match( + '/React\.createElement\(UIStartpage\.AppSerpWeb, ?(.+)\),$/m', + $html, + $matches + ) === 0 + ){ + + throw new Exception("Failed to grep JSON object"); + } + + $json = json_decode($matches[1], true); + + if($json === null){ + + throw new Exception("Failed to decode JSON"); + } + + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + // get npt + foreach($json["render"]["presenter"]["pagination"]["pages"] as $page){ + + if($page["name"] == "Next"){ + + parse_str( + explode( + "?", + $page["url"], + 2 + )[1], + $str + ); + + $out["npt"] = + $this->backend->store( + http_build_query( + [ + "lui" => "english", + "language" => "english", + "query" => $str["q"], + "cat" => "web", + "sc" => $str["sc"], + "t" => "device", + "segment" => "startpage.udog", + "page" => $str["page"] + ] + ), + "web", + $proxy + ); + + break; + } + } + + foreach($json["render"]["presenter"]["regions"]["mainline"] as $category){ + + if(!isset($category["display_type"])){ + + continue; + } + + switch($category["display_type"]){ + + case "web-google": + foreach($category["results"] as $result){ + + $sublinks = []; + + foreach($result["siteLinks"] as $sublink){ + + $sublinks[] = [ + "title" => $sublink["title"], + "description" => null, + "url" => $sublink["clickUrl"] + ]; + } + + $description = + explode( + "...", + $this->titledots( + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $result["description"] + ) + ) + ), + 2 + ); + + $date = strtotime(trim($description[0])); + + if( + $date === false || + count($description) !== 2 || + strlen($description[0]) > 14 + ){ + + // no date found + $description = + implode( + " ... ", + $description + ); + + $date = null; + }else{ + + // date found + $description = ltrim($description[1]); + } + + $out["web"][] = [ + "title" => + $this->titledots( + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $result["title"] + ) + ) + ), + "description" => $description, + "url" => $result["clickUrl"], + "date" => $date, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => $sublinks, + "table" => [] + ]; + } + break; + + case "images-qi-top": + foreach($category["results"] as $result){ + + $out["image"][] = [ + "title" => + $this->titledots( + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $result["title"] + ) + ) + ), + "source" => [ + [ + "url" => $result["rawImageUrl"], + "width" => (int)$result["width"], + "height" => (int)$result["height"] + ], + [ + "url" => $this->unshitimage($result["mdThumbnailUrl"]), + "width" => (int)$result["mdThumbnailWidth"], + "height" => (int)$result["mdThumbnailHeight"] + ] + ], + "url" => + $result["altClickUrl"] + ]; + } + break; + } + } + + // parse instant answers + if( + $get["extendedsearch"] == "yes" && + $get_instant_answer === true + ){ + + // https://www.startpage.com/sp/qi?qimsn=ex&sxap=%2Fv1%2Fquery&sc=BqZ3inqrAgF701&sr=1 + try{ + $post = [ + "se" => "n0vze2y9dqwy", + "q" => $json["render"]["query"], + "results" => [], // populate + "enableKnowledgePanel" => true, + "enableMediaThumbBar" => false, + "enableSearchSuggestions" => false, + "enableTripadvisorProperties" => [], + "enableTripadvisorPlaces" => [], + "enableTripadvisorPlacesForLocations" => [], + "enableWebProducts" => false, + "tripadvisorPartnerId" => null, + "tripadvisorMapColorMode" => "light", + "tripadvisorDisablesKnowledgePanel" => false, + "instantAnswers" => [ + "smartAnswers", + "youtube", + "tripadvisor" + ], + "iaType" => null, + "forceEnhancedKnowledgePanel" => false, + "shoppingOnly" => false, + "allowAdultProducts" => true, + "lang" => "en", + "browserLang" => "en-US", + "browserTimezone" => "America/New_York", + "market" => null, + "userLocation" => null, + "userDate" => date("Y-m-d"), + "userAgentType" => "unknown" + ]; + + foreach($out["web"] as $result){ + + $post["results"][] = [ + "url" => $result["url"], + "title" => $result["title"] + ]; + } + + $post = json_encode($post, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE); + + $additional_data = + $this->get( + $proxy, + "https://www.startpage.com/sp/qi?qimsn=ex&sxap=%2Fv1%2Fquery&sc=" . $json["render"]["callback_sc"] . "&sr=1", + $post, + true, + true + ); + + $additional_data = json_decode($additional_data, true); + + if($additional_data === null){ + + throw new Exception("Failed to decode JSON"); // just break out, dont fail completely + } + + if(!isset($additional_data["knowledgePanel"])){ + + throw new Exception("Response has missing data (knowledgePanel)"); + } + + $additional_data = $additional_data["knowledgePanel"]; + + $answer = [ + "title" => $additional_data["meta"]["title"], + "description" => [ + [ + "type" => "quote", + "value" => $additional_data["meta"]["description"] + ] + ], + "url" => $additional_data["meta"]["origWikiUrl"], + "thumb" => $additional_data["meta"]["image"], + "table" => [], + "sublink" => [] + ]; + + // parse html for instant answer + $this->fuckhtml->load($additional_data["html"]); + + $div = + $this->fuckhtml + ->getElementsByTagName( + "div" + ); + + // get description + $description = + $this->fuckhtml + ->getElementsByClassName( + "sx-kp-short-extract sx-kp-short-extract-complete", + $div + ); + + if(count($description) !== 0){ + + $answer["description"][] = [ + "type" => "text", + "value" => + $this->fuckhtml + ->getTextContent( + $description[0] + ) + ]; + } + + // get socials + $socials = + $this->fuckhtml + ->getElementsByClassName( + "sx-wiki-social-link", + "a" + ); + + foreach($socials as $social){ + + $title = + $this->fuckhtml + ->getTextContent( + $social["attributes"]["title"] + ); + + $url = + $this->fuckhtml + ->getTextContent( + $social["attributes"]["href"] + ); + + switch($title){ + + case "Official Website": + $title = "Website"; + break; + } + + $answer["sublink"][$title] = $url; + } + + // get videos + $videos = + $this->fuckhtml + ->getElementsByClassName( + "sx-kp-video-grid-item", + $div + ); + + foreach($videos as $video){ + + $this->fuckhtml->load($video); + + $as = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($as) === 0){ + + // ?? invalid + continue; + } + + $image = + $this->fuckhtml + ->getElementsByAttributeName( + "data-sx-src", + "img" + ); + + if(count($image) !== 0){ + + $thumb = [ + "ratio" => "16:9", + "url" => + $this->fuckhtml + ->getTextContent( + $image[0]["attributes"]["data-sx-src"] + ) + ]; + }else{ + + $thumb = [ + "ratio" => null, + "url" => null + ]; + } + + $out["video"][] = [ + "title" => + $this->fuckhtml + ->getTextContent( + $as[0]["attributes"]["title"] + ), + "description" => null, + "date" => null, + "duration" => null, + "views" => null, + "thumb" => $thumb, + "url" => + $this->fuckhtml + ->getTextContent( + $as[0]["attributes"]["href"] + ) + ]; + } + + // reset + $this->fuckhtml->load($additional_data["html"]); + + // get table elements + $table = + $this->fuckhtml + ->getElementsByClassName( + "sx-infobox", + "table" + ); + + if(count($table) !== 0){ + + $trs = + $this->fuckhtml + ->getElementsByTagName( + "tr" + ); + + foreach($trs as $tr){ + + $this->fuckhtml->load($tr); + + // ok so startpage devs cant fucking code a table + // td = content + // th (AAAHH) = title + $tds = + $this->fuckhtml + ->getElementsByTagName( + "td" + ); + + $ths = + $this->fuckhtml + ->getElementsByTagName( + "th" + ); + + if( + count($ths) === 1 && + count($tds) === 1 + ){ + + $title = + $this->fuckhtml + ->getTextContent( + $ths[0] + ); + + $description = []; + + $this->fuckhtml->load($tds[0]); + + $lis = + $this->fuckhtml + ->getElementsByTagName( + "li" + ); + + if(count($lis) !== 0){ + + foreach($lis as $li){ + + $description[] = + $this->fuckhtml + ->getTextContent( + $li + ); + } + + $description = implode(", ", $description); + }else{ + + $description = + $this->fuckhtml + ->getTextContent( + $tds[0] + ); + } + + $answer["table"][$title] = $description; + } + } + } + + $out["answer"][] = $answer; + + }catch(Exception $error){ + + // do nothing + //echo "error!"; + } + } + + return $out; + } + + private function unshitimage($url){ + + $query = parse_url($url, PHP_URL_QUERY); + parse_str($query, $query); + + if(isset($query["piurl"])){ + + if(strpos($query["piurl"], "gstatic.com/")){ + + return + explode( + "&", + $query["piurl"], + 2 + )[0]; + } + + return $query["piurl"]; + } + + return $url; + } + + private function titledots($title){ + + return trim($title, " .\t\n\r\0\x0B…"); + } +} |