From bca265aea67ec62499aaa113a6490ce9ec7fe730 Mon Sep 17 00:00:00 2001 From: lolcat Date: Sat, 22 Jul 2023 14:41:14 -0400 Subject: still missing things on google scraper --- scraper/brave.php | 2287 ++++++++++++++++++++++++++++++++++++++++ scraper/ddg.php | 2722 ++++++++++++++++++++++++++++++++++++++++++++++++ scraper/google.php | 1562 +++++++++++++++++++++++++++ scraper/marginalia.php | 242 +++++ scraper/mojeek.php | 1182 +++++++++++++++++++++ scraper/wiby.php | 244 +++++ scraper/yandex.php | 530 ++++++++++ scraper/youtube.php | 1723 ++++++++++++++++++++++++++++++ 8 files changed, 10492 insertions(+) create mode 100644 scraper/brave.php create mode 100644 scraper/ddg.php create mode 100644 scraper/google.php create mode 100644 scraper/marginalia.php create mode 100644 scraper/mojeek.php create mode 100644 scraper/wiby.php create mode 100644 scraper/yandex.php create mode 100644 scraper/youtube.php (limited to 'scraper') diff --git a/scraper/brave.php b/scraper/brave.php new file mode 100644 index 0000000..4d48c33 --- /dev/null +++ b/scraper/brave.php @@ -0,0 +1,2287 @@ +bypasscaptcha($html, "yes", "ca");*/ + +class brave{ + + public function __construct(){ + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); + + include "lib/nextpage.php"; + $this->nextpage = new nextpage("brave"); + } + + public function getfilters($page){ + + switch($page){ + + case "web": + return [ + "country" => [ + "display" => "Country", + "option" => [ + "all" => "All Regions", + "ar" => "Argentina", + "au" => "Australia", + "at" => "Austria", + "be" => "Belgium", + "br" => "Brazil", + "ca" => "Canada", + "cl" => "Chile", + "cn" => "China", + "dk" => "Denmark", + "fi" => "Finland", + "fr" => "France", + "de" => "Germany", + "hk" => "Hong Kong", + "in" => "India", + "id" => "Indonesia", + "it" => "Italy", + "jp" => "Japan", + "kr" => "Korea", + "my" => "Malaysia", + "mx" => "Mexico", + "nl" => "Netherlands", + "nz" => "New Zealand", + "no" => "Norway", + "pl" => "Poland", + "pt" => "Portugal", + "ph" => "Philippines", + "ru" => "Russia", + "sa" => "Saudi Arabia", + "za" => "South Africa", + "es" => "Spain", + "se" => "Sweden", + "ch" => "Switzerland", + "tw" => "Taiwan", + "tr" => "Turkey", + "gb" => "United Kingdom", + "us" => "United States" + ] + ], + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", + "maybe" => "Maybe", + "no" => "No" + ] + ], + "newer" => [ + "display" => "Newer than", + "option" => "_DATE" + ], + "older" => [ + "display" => "Older than", + "option" => "_DATE" + ] + ]; + break; + + case "news": + return [ + "country" => [ + "display" => "Country", + "option" => [ + "all" => "All regions", + "ar" => "Argentina", + "au" => "Australia", + "at" => "Austria", + "be" => "Belgium", + "br" => "Brazil", + "ca" => "Canada", + "cl" => "Chile", + "cn" => "China", + "dk" => "Denmark", + "fi" => "Finland", + "fr" => "France", + "de" => "Germany", + "hk" => "Hong Kong", + "in" => "India", + "id" => "Indonesia", + "it" => "Italy", + "jp" => "Japan", + "kr" => "Korea", + "my" => "Malaysia", + "mx" => "Mexico", + "nl" => "Netherlands", + "nz" => "New Zealand", + "no" => "Norway", + "pl" => "Poland", + "pt" => "Portugal", + "ph" => "Philippines", + "ru" => "Russia", + "sa" => "Saudi Arabia", + "za" => "South Africa", + "es" => "Spain", + "se" => "Sweden", + "ch" => "Switzerland", + "tw" => "Taiwan", + "tr" => "Turkey", + "gb" => "United Kingdom", + "us" => "United States" + ] + ], + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", + "maybe" => "Maybe", + "no" => "No" + ] + ] + ]; + break; + } + } + + private function get($url, $get = [], $nsfw, $country/*, $is_post = false, $additional_cookies = null*/){ + + switch($nsfw){ + + case "yes": $nsfw = "off"; break; + case "maybe": $nsfw = "moderate"; break; + case "no": $nsfw = "strict"; break; + } + + //$cookie = "safesearch={$nsfw}; country={$country}; useLocation=0"; + /* + if($additional_cookies !== null){ + + $cookie = $additional_cookies . "; " . $cookie; + }*/ + + $headers = [ + "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "Cookie: safesearch={$nsfw}; country={$country}; useLocation=0; summarizer=0", + "DNT: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1"//, + //"Content-Type: application/json" + ]; + + if($country == "any"){ + + $country = "all"; + } + + $curlproc = curl_init(); + + /*if($is_post){ + + curl_setopt($curlproc, CURLOPT_POST, true); + curl_setopt( + $curlproc, + CURLOPT_POSTFIELDS, + json_encode($get) + ); + + }else{ + */ + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + //} + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + return $data; + } + + public function web($get){ + + if($get["npt"]){ + + // get next page data + $q = json_decode($this->nextpage->get($get["npt"], "web"), true); + + $search = $q["q"]; + $q["spellcheck"] = 0; + + $nsfw = $q["nsfw"]; + unset($q["nsfw"]); + + $country = $q["country"]; + unset($q["country"]); + + }else{ + + // get _GET data instead + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + if(strlen($search) > 2048){ + + throw new Exception("Search query is too long!"); + } + + $nsfw = $get["nsfw"]; + $country = $get["country"]; + $older = $get["older"]; + $newer = $get["newer"]; + + $q = [ + "q" => $search + ]; + + /* + Pass older/newer filters to brave + */ + if($newer !== false){ + + $newer = date("Y-m-d", $newer); + + if($older === false){ + + $older = date("Y-m-d", time()); + } + } + + if( + is_string($older) === false && + $older !== false + ){ + + $older = date("Y-m-d", $older); + + if($newer === false){ + + $newer = "1970-01-02"; + } + } + + if($older !== false){ + + $q["tf"] = "{$newer}to{$older}"; + } + } + /* + $handle = fopen("scraper/brave.html", "r"); + $html = fread($handle, filesize("scraper/brave.html")); + fclose($handle); + */ + try{ + $html = + $this->get( + "https://search.brave.com/search", + $q, + $nsfw, + $country + ); + + }catch(Exception $error){ + + throw new Exception("Could not fetch search page"); + } + + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + // load html + $this->fuckhtml->load($html); + + /* + Get next page "token" + */ + $nextpage = + $this->fuckhtml + ->getElementsByClassName( + "btn ml-15", + "a" + ); + + if(count($nextpage) !== 0){ + + preg_match( + '/offset=([0-9]+)/', + $this->fuckhtml->getTextContent($nextpage[0]["attributes"]["href"]), + $nextpage + ); + + $q["offset"] = (int)$nextpage[1]; + $q["nsfw"] = $nsfw; + $q["country"] = $country; + + $out["npt"] = + $this->nextpage->store( + json_encode($q), + "web" + ); + } + + /* + Get discussions (and append them to web results) + */ + + // they're loaded using javascript!! + $discussion = + $this->fuckhtml + ->getElementById( + "js-discussions", + "script" + ); + + if( + $discussion && + isset($discussion["attributes"]["data"]) + ){ + + $discussion = + json_decode( + $this->fuckhtml + ->getTextContent( + $discussion["attributes"]["data"] + ), + true + ); + + foreach($discussion["results"] as $result){ + + $data = [ + "title" => $this->titledots($result["title"]), + "description" => null, + "url" => $result["url"], + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + + // description + $data["description"] = + $this->limitstrlen( + $this->limitwhitespace( + $this->titledots( + $this->fuckhtml->getTextContent( + $result["description"] + ) + ) + ) + ); + + if($result["age"] != ""){ + $data["date"] = strtotime($result["age"]); + } + + // populate table + + if($result["data"]["num_answers"] != ""){ + $data["table"]["Replies"] = (int)$result["data"]["num_answers"]; + } + + if($result["data"]["score"] != ""){ + + $score = explode("|", $result["data"]["score"]); + + if(count($score) === 2){ + + $score = ((int)$score[1]) . " (" . trim($score[0]) . ")"; + }else{ + + $score = (int)$score[0]; + } + + $data["table"]["Votes"] = $score; + } + + if($result["thumbnail"] != ""){ + + $data["thumb"]["url"] = $result["thumbnail"]; + $data["thumb"]["ratio"] = "16:9"; + } + + $out["web"][] = $data; + } + } + + /* + Get related searches + */ + $faq = + $this->fuckhtml + ->getElementById("js-faq", "script"); + + if( + $faq && + isset($faq["attributes"]["data"]) + ){ + + $faq = + json_decode( + $this->fuckhtml + ->getTextContent( + $faq["attributes"]["data"] + ), + true + ); + + foreach($faq["items"] as $related){ + + $out["related"][] = $related["question"]; + } + } + + /* + Get spelling autocorrect + */ + $altered = + $this->fuckhtml + ->getElementById("altered-query", "div"); + + if($altered){ + + $this->fuckhtml->load($altered); + + $altered = + $this->fuckhtml + ->getElementsByTagName("a"); + + if(count($altered) === 2){ + + $out["spelling"] = [ + "type" => "including", + "using" => + $this->fuckhtml + ->getTextContent($altered[0]), + "correction" => + $this->fuckhtml + ->getTextContent($altered[1]) + ]; + } + + $this->fuckhtml->load($html); + } + + /* + Get web results + */ + $resulthtml = + $this->fuckhtml + ->getElementById( + "results", + "div" + ); + + $this->fuckhtml->load($resulthtml); + $items = 0; + foreach( + $this->fuckhtml + ->getElementsByClassName("snippet fdb") + as $result + ){ + + $data = [ + "title" => null, + "description" => null, + "url" => null, + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + + if( + isset($result["attributes"]["data-type"]) && + $result["attributes"]["data-type"] == "ad" + ){ + + // is an ad, skip + continue; + } + + $this->fuckhtml->load($result); + + /* + Get title + */ + $title = + $this->fuckhtml + ->getElementsByClassName( + "snippet-title", + "span" + ); + + if(count($title) === 0){ + + // encountered AI summarizer + // or misspelling indicator @TODO + continue; + } + + if(isset($title[0]["attributes"]["title"])){ + + $data["title"] = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0]["attributes"]["title"] + ) + ); + }else{ + + $data["title"] = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0] + ) + ); + } + + /* + Get description + */ + $description = + $this->fuckhtml + ->getElementsByClassName( + "snippet-description", + "p" + ); + + if(count($description) !== 0){ + $data["description"] = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $description[0] + ) + ); + + // also check for thumbnail in here + $img = + $this->fuckhtml + ->getElementsByClassName( + "thumb", + "img" + ); + + if(count($img) !== 0){ + + $data["thumb"] = [ + "url" => $this->unshiturl($img[0]["attributes"]["src"]), + "ratio" => "16:9" + ]; + }else{ + + // might be a video thumbnail wrapper? + $wrapper = + $this->fuckhtml + ->getElementsByClassName( + "video-thumb", + "a" + ); + + if(count($wrapper) !== 0){ + + // we found a video + $this->fuckhtml->load($wrapper[0]); + + $img = + $this->fuckhtml + ->getElementsByTagName("img"); + + $data["thumb"] = [ + "url" => $this->unshiturl($img[0]["attributes"]["src"]), + "ratio" => "16:9" + ]; + + // get the video length, if its there + $duration = + $this->fuckhtml + ->getElementsByClassName( + "duration", + "div" + ); + + if(count($duration) !== 0){ + + $data["table"]["Duration"] = $duration[0]["innerHTML"]; + } + + // reset html load + $this->fuckhtml->load($result); + } + } + + }else{ + + // is a steam/shop listing + $description_alt = + $this->fuckhtml + ->getElementsByClassName( + "text-sm", + "div" + ); + + if(count($description_alt) !== 0){ + + switch($description_alt[0]["attributes"]["class"]){ + + case "text-sm text-gray": + case "description text-sm": + + $data["description"] = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $description_alt[0] + ) + ); + break; + } + + // get table sublink + $sublink = + $this->fuckhtml + ->getElementsByClassName( + "r-attr text-sm", + "div" + ); + + if(count($sublink) !== 0){ + + $this->tablesublink($sublink, $data); + } + + // check for thumb element + $data["thumb"] = $this->getimagelinkfromstyle("thumb"); + }else{ + + // ok... finally... + // maybe its the instant answer thingy + $answer = + $this->fuckhtml + ->getElementsByClassName("answer"); + + if(count($answer) !== 0){ + + $data["description"] = + $this->titledots( + $this->fuckhtml + ->getTextContent($answer[0]) + ); + } + } + } + + // finally, fix brave's date format sucking balls + $data["description"] = explode(" - ", $data["description"], 2); + + if(count($data["description"]) === 0){ + + // nothing to do + $data["description"] = $data["description"][0]; + }else{ + + // attempt to parse + $time = strtotime($data["description"][0]); + + if($time !== false){ + + // got response + $data["date"] = $time; + + array_shift($data["description"]); + } + + // merge back + $data["description"] = + implode(" - ", $data["description"]); + } + + /* + Check content type + */ + $content_type = + $this->fuckhtml + ->getElementsByClassName( + "content-type", + "span" + ); + + if(count($content_type) !== 0){ + + $data["type"] = + strtolower($this->fuckhtml->getTextContent($content_type[0])); + } + + /* + Check subtext table thingy + */ + $table_items = + array_merge( + $this->fuckhtml + ->getElementsByClassName( + "item-attributes", + "div" + ), + $this->fuckhtml + ->getElementsByClassName( + "r", + "div" + ) + ); + + /* + DIV: item-attributes + */ + if(count($table_items) !== 0){ + + foreach($table_items as $table){ + + $this->fuckhtml->load($table); + + $span = + $this->fuckhtml + ->getElementsByClassName( + "text-sm", + "*" + ); + + foreach($span as $item){ + + $item = + explode( + ":", + $this->fuckhtml->getTextContent(preg_replace('/\n/', " ", $item["innerHTML"])), + 2 + ); + + if(count($item) === 2){ + + $data["table"][trim($item[0])] = trim($this->limitwhitespace($item[1])); + } + } + } + + $this->fuckhtml->load($result); + } + + // get video sublinks + $table_items = + $this->fuckhtml + ->getElementsByClassName( + "snippet-description published-time", + "p" + ); + + if(count($table_items) !== 0){ + + $table_items = + explode( + '', + $table_items[0]["innerHTML"], + 2 + ); + if(count($table_items) === 2){ + + $item2 = []; + + $item2[] = explode(":", $this->fuckhtml->getTextContent($table_items[0])); + + if(trim($table_items[1]) != ""){ + $item2[] = explode(":", $this->fuckhtml->getTextContent($table_items[1])); + } + + foreach($item2 as $it){ + + $data["table"][trim($it[0])] = trim($it[1]); + } + } + } + + /* + Get URL + */ + $data["url"] = + $this->fuckhtml->getTextContent( + $this->fuckhtml + ->getElementsByTagName("a") + [0] + ["attributes"] + ["href"] + ); + + /* + Get sublinks + */ + $sublinks_elems = + $this->fuckhtml + ->getElementsByClassName( + "snippet", + "div" + ); + + $sublinks = []; + + foreach($sublinks_elems as $sublink){ + + $this->fuckhtml->load($sublink); + + $a = + $this->fuckhtml + ->getElementsByTagName("a")[0]; + + $title = + $this->fuckhtml + ->getTextContent($a); + + $url = $a["attributes"]["href"]; + + $description = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByTagName("p")[0] + ) + ); + + $sublinks[] = [ + "title" => $title, + "date" => null, + "description" => $description, + "url" => $url + ]; + } + + /* + Get smaller sublinks + */ + $sublinks_elems = + $this->fuckhtml + ->getElementsByClassName( + "deep-link", + "a" + ); + + foreach($sublinks_elems as $sublink){ + + $sublinks[] = [ + "title" => $this->fuckhtml->getTextContent($sublink), + "date" => null, + "description" => null, + "url" => $sublink["attributes"]["href"] + ]; + } + + // append sublinks to $data !! + $data["sublink"] = $sublinks; + + // append first result to start of $out["web"] + // other results are after + if($items === 0){ + + $out["web"] = [$data, ...$out["web"]]; + }else{ + + $out["web"][] = $data; + } + $items++; + } + + /* + Get news + */ + $this->fuckhtml->load($resulthtml); + $news_carousel = $this->fuckhtml->getElementById("news-carousel"); + + $this->fuckhtml->load($news_carousel); + + if($news_carousel){ + + $a = + $this->fuckhtml + ->getElementsByClassName( + "card fdb", + "a" + ); + + foreach($a as $news){ + + $this->fuckhtml->load($news); + + $out["news"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "title", + "div" + )[0] + ) + ), + "description" => null, + "date" => + strtotime( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "card-footer__timestamp", + "span" + )[0] + ) + ), + "thumb" => $this->getimagelinkfromstyle("img-bg"), + "url" => $this->fuckhtml->getTextContent($news["attributes"]["href"]) + ]; + } + } + + + + /* + Get videos + */ + $this->fuckhtml->load($resulthtml); + $news_carousel = $this->fuckhtml->getElementById("video-carousel"); + + $this->fuckhtml->load($news_carousel); + + if($news_carousel){ + + $a = + $this->fuckhtml + ->getElementsByClassName( + "card fdb", + "a" + ); + + foreach($a as $video){ + + $this->fuckhtml->load($video); + + $date = null; + + $date_o = + $this->fuckhtml + ->getElementsByClassName( + "text-gray text-xs", + "span" + ); + + if(count($date_o) !== 0){ + + $date = + strtotime( + $this->fuckhtml + ->getTextContent( + $date_o[0] + ) + ); + } + + $out["video"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "title", + "div" + )[0] + ) + ), + "description" => null, + "date" => $date, + "duration" => null, + "views" => null, + "thumb" => $this->getimagelinkfromstyle("img-bg"), + "url" => $this->fuckhtml->getTextContent($video["attributes"]["href"]) + ]; + } + } + + + /* + Get DEFINITION snippet + */ + $this->fuckhtml->load($html); + $infobox = $this->fuckhtml->getElementById("rh-definitions", "div"); + + if($infobox !== false){ + + $answer = [ + "title" => null, + "description" => [], + "url" => null, + "thumb" => null, + "table" => [], + "sublink" => [] + ]; + + $this->fuckhtml->load($infobox); + + $answer["title"] = + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "header", + "h5" + )[0] + ); + + $sections = + $this->fuckhtml + ->getElementsByTagName("section"); + + $i = -1; + foreach($sections as $section){ + + $this->fuckhtml->load($section); + $items = + $this->fuckhtml + ->getElementsByTagName("*"); + + $li = 1; + $pronounce = false; + foreach($items as $item){ + + switch($item["tagName"]){ + + case "h6": + + if( + isset($item["attributes"]["class"]) && + $item["attributes"]["class"] == "h6 pronunciation" + ){ + + if($pronounce){ + + break; + } + + $answer["description"][] = [ + "type" => "quote", + "value" => + $this->fuckhtml + ->getTextContent( + $item + ) + ]; + + $answer["description"][] = + [ + "type" => "audio", + "url" => "https://search.brave.com/api/rhfetch?rhtype=definitions&word={$answer["title"]}&source=ahd-5" + ]; + + $pronounce = true; + $i = $i + 2; + break; + } + + $answer["description"][] = [ + "type" => "title", + "value" => + $this->fuckhtml + ->getTextContent( + $item + ) + ]; + $i++; + break; + + case "li": + + if( + $i !== -1 && + $answer["description"][$i]["type"] == "text" + ){ + + $answer["description"][$i]["value"] .= + "\n" . $li . ". " . + $this->fuckhtml + ->getTextContent( + $item + ); + + }else{ + $answer["description"][] = [ + "type" => "text", + "value" => + $li . ". " . + $this->fuckhtml + ->getTextContent( + $item + ) + ]; + $i++; + } + $li++; + break; + + case "a": + $answer["url"] = + $this->fuckhtml + ->getTextContent( + $item["attributes"]["href"] + ); + break; + } + } + } + + $out["answer"][] = $answer; + } + + + /* + Get instant answer + */ + $this->fuckhtml->load($html); + $infobox = $this->fuckhtml->getElementById("infobox", "div"); + + if($infobox !== false){ + + $answer = [ + "title" => null, + "description" => [], + "url" => null, + "thumb" => null, + "table" => [], + "sublink" => [] + ]; + + $this->fuckhtml->load($infobox); + $div = $this->fuckhtml->getElementsByTagName("div"); + + /* + Get title + url + */ + $title = + $this->fuckhtml + ->getElementsByClassName("infobox-title", "a"); + + if(count($title) !== 0){ + + $answer["title"] = + $this->fuckhtml + ->getTextContent( + $title[0] + ); + + $answer["url"] = + $this->fuckhtml + ->getTextContent( + $title[0]["attributes"]["href"] + ); + } + + /* + Get thumbnail + */ + $thumb = $this->getimagelinkfromstyle("thumb"); + + if($thumb["url"] !== null){ + + $answer["thumb"] = $thumb["url"]; + } + + /* + Get table + */ + $title = + $this->fuckhtml + ->getElementsByClassName( + "infobox-attr-header", + "div" + ); + + $rowhtml = $infobox; + + if(count($title) >= 2){ + + $rowhtml = + explode( + $title[1]["outerHTML"], + $infobox["innerHTML"], + 2 + )[0]; + } + + $this->fuckhtml->load($rowhtml); + + $rows = + $this->fuckhtml + ->getElementsByClassName("infobox-attr", "div"); + + foreach($rows as $row){ + + if(!isset($row["innerHTML"])){ + + continue; + } + + $this->fuckhtml->load($row); + $span = + $this->fuckhtml + ->getElementsByTagName("span"); + + if(count($span) === 2){ + + $answer["table"][ + $this->fuckhtml->getTextContent($span[0]) + ] = str_replace("\n", ", ", $this->fuckhtml->getTextContent($span[1], true)); + } + } + + $this->fuckhtml->load($infobox); + + /* + Parse stackoverflow answers + */ + $code = + $this->fuckhtml + ->getElementById("codebox-answer", $div); + + if($code){ + + // this might be standalone text with no paragraphs, check for that + $author = + $this->fuckhtml + ->getElementById("author"); + + $desc_tmp = + str_replace( + $author["outerHTML"], + "", + $code["innerHTML"] + ); + + $this->fuckhtml->load($desc_tmp); + $code = + $this->fuckhtml + ->getElementsByTagName("*"); + + if(count($code) === 0){ + + $answer["description"] = + [ + [ + "type" => "text", + "value" => + $this->fuckhtml + ->getTextContent( + $desc_tmp + ) + ], + [ + "type" => "quote", + "value" => + $this->fuckhtml + ->getTextContent( + $author + ) + ] + ]; + }else{ + + $text = []; + $i = 0; + + foreach($code as $snippet){ + + switch($snippet["tagName"]){ + + case "p": + $this->fuckhtml->load($snippet["innerHTML"]); + + $codetags = + $this->fuckhtml + ->getElementsByTagName("*"); + + $tmphtml = $snippet["innerHTML"]; + + foreach($codetags as $tag){ + + if(!isset($tag["outerHTML"])){ + + continue; + } + + $tmphtml = + explode( + $tag["outerHTML"], + $tmphtml, + 2 + ); + + $value = $this->fuckhtml->getTextContent($tmphtml[0], false, false); + $this->appendtext($value, $text, $i); + + $type = null; + switch($tag["tagName"]){ + + case "code": $type = "inline_code"; break; + case "em": $type = "italic"; break; + case "blockquote": $type = "quote"; break; + default: $type = "text"; + } + + if($type !== null){ + $value = $this->fuckhtml->getTextContent($tag, false, true); + + if(trim($value) != ""){ + + if( + $i !== 0 && + $type == "title" + ){ + + $text[$i - 1]["value"] = rtrim($text[$i - 1]["value"]); + } + + $text[] = [ + "type" => $type, + "value" => $value + ]; + $i++; + } + } + + if(count($tmphtml) === 2){ + + $tmphtml = $tmphtml[1]; + }else{ + + break; + } + } + + if(is_array($tmphtml)){ + + $tmphtml = $tmphtml[0]; + } + + if(strlen($tmphtml) !== 0){ + + $value = $this->fuckhtml->getTextContent($tmphtml, false, false); + $this->appendtext($value, $text, $i); + } + break; + + case "pre": + + switch($text[$i - 1]["type"]){ + + case "text": + case "italic": + $text[$i - 1]["value"] = rtrim($text[$i - 1]["value"]); + break; + } + + $text[] = + [ + "type" => "code", + "value" => + rtrim( + $this->fuckhtml + ->getTextContent( + $snippet, + true, + false + ) + ) + ]; + $i++; + + break; + + case "ol": + $o = 0; + + $this->fuckhtml->load($snippet); + $li = + $this->fuckhtml + ->getElementsByTagName("li"); + + foreach($li as $elem){ + $o++; + + $this->appendtext( + $o . ". " . + $this->fuckhtml + ->getTextContent( + $elem + ), + $text, + $i + ); + } + break; + } + } + + if( + $i !== 0 && + $text[$i - 1]["type"] == "text" + ){ + + $text[$i - 1]["value"] = rtrim($text[$i - 1]["value"]); + } + + if($author){ + + $text[] = [ + "type" => "quote", + "value" => $this->fuckhtml->getTextContent($author) + ]; + } + + $answer["description"] = $text; + } + }else{ + + /* + Get normal description + */ + $description = + $this->fuckhtml + ->getElementsByClassName( + "mb-6", + "div" + ); + + if(count($description) !== 0){ + + $description = + [ + [ + "type" => "text", + "value" => + $this->titledots( + preg_replace( + '/ Wikipedia$/', + "", + $this->fuckhtml + ->getTextContent( + $description[0] + ) + ) + ) + ] + ]; + + $ratings = + $this->fuckhtml + ->getElementById("ratings"); + + if($ratings){ + + $this->fuckhtml->load($ratings); + + $ratings = + $this->fuckhtml + ->getElementsByClassName( + "flex-hcenter mb-10", + "div" + ); + + $description[] = [ + "type" => "title", + "value" => "Ratings" + ]; + + foreach($ratings as $rating){ + + $this->fuckhtml->load($rating); + + $num = + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "r-num", + "div" + )[0] + ); + + $href = + $this->fuckhtml + ->getElementsByClassName( + "mr-10", + "a" + )[0]; + + $votes = + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "text-sm", + "span" + )[0] + ); + + $c = count($description) - 1; + + if( + $c !== -1 && + $description[$c]["type"] == "text" + ){ + + $description[$c]["value"] .= $num . " "; + }else{ + + $description[] = [ + "type" => "text", + "value" => $num . " " + ]; + } + + $description[] = [ + "type" => "link", + "value" => $this->fuckhtml->getTextContent($href), + "url" => $this->fuckhtml->getTextContent($href["attributes"]["href"]) + ]; + + $description[] = [ + "type" => "text", + "value" => " (" . $votes . ")\n" + ]; + } + } + + $answer["description"] = $description; + } + } + + /* + Get sublinks + */ + $this->fuckhtml->load($infobox); + + $profiles = + $this->fuckhtml + ->getElementById("profiles"); + + if($profiles){ + $profiles = + $this->fuckhtml + ->getElementsByClassName( + "chip", + "a" + ); + + foreach($profiles as $profile){ + + $name = $this->fuckhtml->getTextContent($profile["attributes"]["title"]); + + if(strtolower($name) == "steampowered"){ + + $name = "Steam"; + } + + $answer["sublink"][$name] = + $this->fuckhtml->getTextContent($profile["attributes"]["href"]); + } + } + + $actors = + $this->fuckhtml + ->getElementById("panel-movie-cast"); + + if($actors){ + + $this->fuckhtml->load($actors); + + $actors = + $this->fuckhtml + ->getElementsByClassName("card"); + + $answer["description"][] = [ + "type" => "title", + "value" => "Cast" + ]; + + foreach($actors as $actor){ + + $this->fuckhtml->load($actor); + + $answer["description"][] = [ + "type" => "text", + "value" => + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName("card-body") + [0] + ) + ]; + + $answer["description"][] = [ + "type" => "image", + "url" => $this->getimagelinkfromstyle("person-thumb")["url"] + ]; + } + } + + $out["answer"][] = $answer; + } + + /* + Get actor standalone thingy + */ + $this->fuckhtml->load($resulthtml); + $actors = + $this->fuckhtml + ->getElementById("predicate-entity"); + + if($actors){ + + $this->fuckhtml->load($actors); + + $cards = + $this->fuckhtml + ->getElementsByClassName("card"); + + $url = + $this->fuckhtml + ->getElementsByClassName( + "disclaimer", + "div" + )[0]; + + $this->fuckhtml->load($url); + + $url = + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByTagName("a") + [0] + ["attributes"] + ["href"] + ); + + $this->fuckhtml->load($actors); + + $answer = [ + "title" => + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "entity", + "span" + )[0] + ) . " (Cast)", + "description" => [], + "url" => $url, + "sublink" => [], + "thumb" => null, + "table" => [] + ]; + + foreach($cards as $card){ + + $this->fuckhtml->load($card); + + $answer["description"][] = [ + "type" => "title", + "value" => + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "title" + )[0] + ) + ]; + + $answer["description"][] = [ + "type" => "text", + "value" => + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "text-xs desc" + )[0] + ) + ]; + + $answer["description"][] = [ + "type" => "image", + "url" => $this->getimagelinkfromstyle("img-bg")["url"] + ]; + } + + $out["answer"][] = $answer; + } + + return $out; + } + + public function news($get){ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $nsfw = $get["nsfw"]; + $country = $get["country"]; + + if(strlen($search) > 2048){ + + throw new Exception("Search query is too long!"); + } + /* + $handle = fopen("scraper/brave-news.html", "r"); + $html = fread($handle, filesize("scraper/brave-news.html")); + fclose($handle);*/ + try{ + $html = + $this->get( + "https://search.brave.com/news", + [ + "q" => $search + ], + $nsfw, + $country + ); + + }catch(Exception $error){ + + throw new Exception("Could not fetch search page"); + } + + $out = [ + "status" => "ok", + "npt" => null, + "news" => [] + ]; + + // load html + $this->fuckhtml->load($html); + + $news = + $this->fuckhtml + ->getElementsByClassName( + "snippet inline gap-standard", + "div" + ); + + foreach($news as $article){ + + $data = [ + "title" => null, + "author" => null, + "description" => null, + "date" => null, + "thumb" => + [ + "url" => null, + "ratio" => null + ], + "url" => null + ]; + + $this->fuckhtml->load($article); + $elems = + $this->fuckhtml + ->getElementsByTagName("*"); + + // get title + $data["title"] = + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "snippet-title", + $elems + ) + [0] + ["innerHTML"] + ); + + // get description + $data["description"] = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "snippet-description", + $elems + ) + [0] + ["innerHTML"] + ) + ); + + // get date + $date = + explode( + "•", + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "snippet-url", + $elems + )[0] + ) + ); + + if( + count($date) !== 1 && + trim($date[1]) != "" + ){ + + $data["date"] = + strtotime( + $date[1] + ); + } + + // get URL + $data["url"] = + $this->fuckhtml->getTextContent( + $this->unshiturl( + $this->fuckhtml + ->getElementsByClassName( + "result-header", + $elems + ) + [0] + ["attributes"] + ["href"] + ) + ); + + // get thumbnail + $thumb = + $this->fuckhtml + ->getElementsByTagName( + "img" + ); + + if( + count($thumb) === 2 && + trim( + $thumb[1] + ["attributes"] + ["src"] + ) != "" + ){ + + $data["thumb"] = [ + "url" => + $this->fuckhtml->getTextContent( + $this->unshiturl( + $thumb[1] + ["attributes"] + ["src"] + ) + ), + "ratio" => "16:9" + ]; + } + + $out["news"][] = $data; + } + + return $out; + } + + /* + public function bypasscaptcha($html, $nsfw, $country){ + + // @TODO figure out why I still cant go trough + // the captcha wall even after breaking it + + try{ + $html = + $this->get( + "https://search.brave.com/goggles", + [ + "q" => "site:dailymotion.com my bloody valentine" + ], + $nsfw, + $country + ); + + }catch(Exception $error){ + + throw new Exception("Could not fetch html"); + } + + // Bypass brave search captcha + // this captcha only appears on the goggles page + preg_match( + '/this\.img\.src = "(.*)"/', + $html, + $image + ); + + $image = + base64_decode( + explode( + "data:image/png;base64,", + $image[1] + )[1] + ); + + $im = new Imagick(); + $im->readImageBlob($image); + + $im->blurImage(20, 20); + $im->posterizeImage(2, imagick::IMGTYPE_COLORSEPARATION); + + // if we encounter a white line thats longer than 45px + // we found the circle position + $iterator = $im->getPixelRegionIterator(0, 77, 310, 1); + + $found = null; + foreach( + $iterator as $row + ){ + + $whitecount = 0; + $count = 0; + + foreach($row as $pixel){ + + if($pixel->getColor()["r"] === 255){ + + $whitecount++; + $pixel->setColor("rgba(255,0,0,0)"); + + if($whitecount === 45){ + + $found = $count - 45; + break 2; + } + }else{ + + $whitecount = 0; + } + + $count++; + $iterator->syncIterator(); + } + } + + $found = $found + 10; + + //header("Content-Type: image/png"); + //echo $im; + //die(); + + if($found === null){ + + throw new Exception("Could not bypass captcha"); + } + + preg_match( + '/data="{"captcha_id":"([0-9A-z-]+)"}"/', + $html, + $key + ); + + $key = $key[1]; + // we bypassed captcha, send POST data + $order = + $this->get( + "https://search.brave.com/api/captcha?brave=0&captcha_id={$key}", + [ + "solution" => (string)$found + ], + $nsfw, + $country, + true + ); + + $order = json_decode($order, true)["orderId"]; + + $orderpayload = + $this->get( + "https://search.brave.com/api/rewards/v1/orders/{$order}", + [], + $nsfw, + $country + ); + + $orderpayload = json_decode($orderpayload, true); + + $creds = + $this->get( + "https://search.brave.com/api/rewards/v1/orders/{$order}/credentials", + [ + "itemId" => $orderpayload["items"][0]["id"], + "blindedCreds" => [ + "fuYAVcB/m7BU66vf3wkNGxJCSaRhshB9o+8km3F1h2c=", + "uswvcWJuPK/1qFlVdzBP3eQd0+V1EQgfAtnEoMIK+Uk=", + "fJWKGLBxl3Gyn4n9FjTLq1PjupfABT7Ni8MeB+iGzUs=", + "Aq9enJ/VZP9GxQIza3n65ZK7xQhY4VwDxv53BCb/Txg=", + "FMJA9eSLHq71K+Pcwgm4gIQOmdR/6KMy5cMgXhpd5Ro=", + "2NVhIAbvI317SP9/xXbVe/U57eWgvHyqVbHL/5+Gdmw=", + "6mpjsjSCmYEzK2xlbL8DI2P4LuhWUOxjTLvsTAL9l24=", + "kAn4wuHvIlKWhfuFfPTSfD4tZ5le9t7/61YbdEc/L3k=", + "BjjUyG16aTfd1c0h4oBzgQQOekrH1f+a5CmcXqMPTR4=", + "SBNgpCt4/V44yaQTfh+D027Yv1GJFHkjUEpPw6rAwRI=", + "XDENAtdQ7PyYx+Qx1wQGQtDWgg8WpIMgWGmd4RDOVWE=", + "tF7rB4sqamsiUk3K7fojdQSI0Q6iip72yKyhnvg/bC0=", + "VsAqflirAd/u4VsLdfRS2UvnH24ZNkFh6YN3DctLjzQ=", + "MntLbXkoI0LdcisCbNazmooiHXJyX91L1KERDAu1JRU=", + "TH6Zs8JBvFDbTDWgKbfGE4M5/cSwCtHD8ms5Y/U8zHQ=", + "jsZg0Z+qDPHymrbhdnesodhLNJ26QdunyMko1aVe4So=", + "rpKsyj6/vdnuMgLI2BApeijtGq9g5USRDL0w6X2bnlQ=", + "vCzliGT8A9vcLXj2sFf2kavOuYw69d70NpfgA22B4lI=", + "7OWoxSCtYXWcaBSifF7AXNBif/sjcuO0IelzXG/3PFk=", + "iiXtByNlT6nDMN9De5B58Jl8J0p6LCjnZ9aS3w2FEQU=", + "zDhd7gsJ4h4JkDeGK0Y0mfFd8IBdkLhMOANzwO+4Dig=", + "qANZ+AikwFReEA61JF009d/c3IHM/aSfIYwljckhJWE=", + "nNC30pDLxtXvUr+WDwfDSrAInNBpfSZkPsV2JlpheWI=", + "kGXE1pkt25P71kdJzmKIg4+yMR1VA5wNmbpBb/FhJQ8=", + "aLqPsY1Qiz2UCa2Jx3YNNt8r4JINMphks/43EiyZfXU=", + "bHGYZoQARZEM5LdFF6B74PkRqNd9EKxzuTvGYxjq+hk=", + "JOsYQjfE/9Y1u29hR+GvEkNyxUI8blgLhX1iJI/aGRQ=", + "yKjHjH5j600TJD/3WPsA1N3OmItDLifdjlysq4H6NV0=", + "9lTnUbsPp7BJ7XVN5/T4yGfzD9DJdqWB7xk72s19MAA=", + "5KHG8iY45em7zDhO/HlI0ydcZ0Ubn+XSyjifMmy7qXM=" + ] + ], + $nsfw, + $country, + true + ); + + var_dump($creds); + + sleep(2); + $test = + $this->get( + "https://search.brave.com/api/rewards/v1/orders/{$order}/credentials", + [], + $nsfw, + $country + ); + + var_dump($test); + + $html = + $this->get( + "https://search.brave.com/goggles", + [ + "q" => "site:dailymotion.com my bloody valentine" + ], + $nsfw, + $country, + false, + "__Secure-sku#brave-search-captcha=eyJ0eXBlIjoic2luZ2xlLXVzZSIsInZlcnNpb24iOjEsInNrdSI6ImJyYXZlLXNlYXJjaC1jYXB0Y2hhIiwicHJlc2VudGF0aW9uIjoiZXlKcGMzTjFaWElpT2lKaWNtRjJaUzVqYjIwL2MydDFQV0p5WVhabExYTmxZWEpqYUMxallYQjBZMmhoSWl3aWMybG5ibUYwZFhKbElqb2lNRzl0VDBneWQxZ3dTazkzU0VFMVJ6QTJaR1V5WjFOQ1dDdGhSM3B2Y2xsTVQwVTJZVVJtTUc5a1IweG1Wa3RhZEd0cU4xbHdia3BPT0VOVGNGbE5lVWR2YmpGRlNTOUhhMlZYU1RWNGQxTjJPWGxJTTNjOVBTSXNJblFpT2lKWlJWWldaVzR5TTJwQ01tSnZkakJ2U1hGNGJtSndUMGxEUW5Kd1drRjBRbWQxVnpoRlNURTNVREY2UVRaQlpUTXJSVGRFYm5NeVFqUmhka0pGYTFWM2FGY3JWRVZJVjNWcE9TdFllRU1yYlVSTVkyMTBRVDA5SW4wPSJ9" + ); + + var_dump($html); + }*/ + + private function appendtext($payload, &$text, &$index){ + + if(trim($payload) == ""){ + + return; + } + + if( + $index !== 0 && + $text[$index - 1]["type"] == "text" + ){ + + $text[$index - 1]["value"] .= "\n\n" . preg_replace('/ $/', " ", $payload); + }else{ + + $text[] = [ + "type" => "text", + "value" => preg_replace('/ $/', " ", $payload) + ]; + $index++; + } + } + + private function tablesublink($html_collection, &$data){ + + foreach($html_collection as $html){ + + $html["innerHTML"] = preg_replace( + '/