diff options
Diffstat (limited to 'scraper')
-rw-r--r-- | scraper/brave.php | 2287 | ||||
-rw-r--r-- | scraper/ddg.php | 2722 | ||||
-rw-r--r-- | scraper/google.php | 1562 | ||||
-rw-r--r-- | scraper/marginalia.php | 242 | ||||
-rw-r--r-- | scraper/mojeek.php | 1182 | ||||
-rw-r--r-- | scraper/wiby.php | 244 | ||||
-rw-r--r-- | scraper/yandex.php | 530 | ||||
-rw-r--r-- | scraper/youtube.php | 1723 |
8 files changed, 10492 insertions, 0 deletions
diff --git a/scraper/brave.php b/scraper/brave.php new file mode 100644 index 0000000..4d48c33 --- /dev/null +++ b/scraper/brave.php @@ -0,0 +1,2287 @@ +<?php +/* +$brave = new brave(); + +$handle = fopen("captcha.html", "r"); +$html = fread($handle, filesize("captcha.html")); +fclose($handle); + +$brave->bypasscaptcha($html, "yes", "ca");*/ + +class brave{ + + public function __construct(){ + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); + + include "lib/nextpage.php"; + $this->nextpage = new nextpage("brave"); + } + + public function getfilters($page){ + + switch($page){ + + case "web": + return [ + "country" => [ + "display" => "Country", + "option" => [ + "all" => "All Regions", + "ar" => "Argentina", + "au" => "Australia", + "at" => "Austria", + "be" => "Belgium", + "br" => "Brazil", + "ca" => "Canada", + "cl" => "Chile", + "cn" => "China", + "dk" => "Denmark", + "fi" => "Finland", + "fr" => "France", + "de" => "Germany", + "hk" => "Hong Kong", + "in" => "India", + "id" => "Indonesia", + "it" => "Italy", + "jp" => "Japan", + "kr" => "Korea", + "my" => "Malaysia", + "mx" => "Mexico", + "nl" => "Netherlands", + "nz" => "New Zealand", + "no" => "Norway", + "pl" => "Poland", + "pt" => "Portugal", + "ph" => "Philippines", + "ru" => "Russia", + "sa" => "Saudi Arabia", + "za" => "South Africa", + "es" => "Spain", + "se" => "Sweden", + "ch" => "Switzerland", + "tw" => "Taiwan", + "tr" => "Turkey", + "gb" => "United Kingdom", + "us" => "United States" + ] + ], + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", + "maybe" => "Maybe", + "no" => "No" + ] + ], + "newer" => [ + "display" => "Newer than", + "option" => "_DATE" + ], + "older" => [ + "display" => "Older than", + "option" => "_DATE" + ] + ]; + break; + + case "news": + return [ + "country" => [ + "display" => "Country", + "option" => [ + "all" => "All regions", + "ar" => "Argentina", + "au" => "Australia", + "at" => "Austria", + "be" => "Belgium", + "br" => "Brazil", + 
"ca" => "Canada",
							"cl" => "Chile",
							"cn" => "China",
							"dk" => "Denmark",
							"fi" => "Finland",
							"fr" => "France",
							"de" => "Germany",
							"hk" => "Hong Kong",
							"in" => "India",
							"id" => "Indonesia",
							"it" => "Italy",
							"jp" => "Japan",
							"kr" => "Korea",
							"my" => "Malaysia",
							"mx" => "Mexico",
							"nl" => "Netherlands",
							"nz" => "New Zealand",
							"no" => "Norway",
							"pl" => "Poland",
							"pt" => "Portugal",
							"ph" => "Philippines",
							"ru" => "Russia",
							"sa" => "Saudi Arabia",
							"za" => "South Africa",
							"es" => "Spain",
							"se" => "Sweden",
							"ch" => "Switzerland",
							"tw" => "Taiwan",
							"tr" => "Turkey",
							"gb" => "United Kingdom",
							"us" => "United States"
						]
					],
					"nsfw" => [
						"display" => "NSFW",
						"option" => [
							"yes" => "Yes",
							"maybe" => "Maybe",
							"no" => "No"
						]
					]
				];
				break;
		}
	}
	
	/**
	 * Fetch a Brave Search page over HTTPS and return the raw response body.
	 *
	 * The NSFW and country preferences are sent to Brave through the
	 * Cookie header, not through the query string.
	 *
	 * $get used to be declared as `$get = []` in front of two required
	 * parameters. PHP treats such a parameter as required anyway (and has
	 * emitted a deprecation notice for it since 8.0), so the unusable
	 * default was dropped; call sites are unaffected.
	 *
	 * @param string $url     Endpoint URL without a query string.
	 * @param array  $get     Query parameters; appended with http_build_query() when non-empty.
	 * @param string $nsfw    Frontend NSFW setting ("yes", "maybe" or "no"); any other value is sent as-is.
	 * @param string $country Country code for the cookie; "any" is sent as "all".
	 * @return string Response body as returned by curl_exec().
	 * @throws Exception Carrying the cURL error message when the transfer fails.
	 */
	private function get($url, $get, $nsfw, $country/*, $is_post = false, $additional_cookies = null*/){
		
		// translate the frontend NSFW setting to brave's safesearch value
		switch($nsfw){
			
			case "yes": $nsfw = "off"; break;
			case "maybe": $nsfw = "moderate"; break;
			case "no": $nsfw = "strict"; break;
		}
		
		// normalise the country BEFORE the Cookie header is built below:
		// this check used to run after $headers was assembled, so the
		// cookie still carried "any" and the rewrite had no effect
		if($country == "any"){
			
			$country = "all";
		}
		
		//$cookie = "safesearch={$nsfw}; country={$country}; useLocation=0";
		/*
		if($additional_cookies !== null){
			
			$cookie = $additional_cookies . "; " . $cookie;
		}*/
		
		$headers = [
			"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0",
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"Cookie: safesearch={$nsfw}; country={$country}; useLocation=0; summarizer=0",
			"DNT: 1",
			"Connection: keep-alive",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: none",
			"Sec-Fetch-User: ?1"//,
			//"Content-Type: application/json"
		];
		
		$curlproc = curl_init();
		
		/*if($is_post){
			
			curl_setopt($curlproc, CURLOPT_POST, true);
			curl_setopt(
				$curlproc,
				CURLOPT_POSTFIELDS,
				json_encode($get)
			);
			
		}else{
		*/
			if($get !== []){
				$get = http_build_query($get);
				$url .= "?" . $get;
			}
		//}
		
		curl_setopt($curlproc, CURLOPT_URL, $url);
		
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		
		$data = curl_exec($curlproc);
		
		if(curl_errno($curlproc)){
			
			// read the message first, then release the handle, so the
			// error path no longer leaves the handle open
			$error = curl_error($curlproc);
			curl_close($curlproc);
			
			throw new Exception($error);
		}
		
		curl_close($curlproc);
		return $data;
	}
	
	public function web($get){
		
		if($get["npt"]){
			
			// get next page data
			$q = json_decode($this->nextpage->get($get["npt"], "web"), true);
			
			$search = $q["q"];
			$q["spellcheck"] = 0;
			
			$nsfw = $q["nsfw"];
			unset($q["nsfw"]);
			
			$country = $q["country"];
			unset($q["country"]);
			
		}else{
			
			// get _GET data instead
			$search = $get["s"];
			if(strlen($search) === 0){
				
				throw new Exception("Search term is empty!");
			}
			
			if(strlen($search) >
2048){ + + throw new Exception("Search query is too long!"); + } + + $nsfw = $get["nsfw"]; + $country = $get["country"]; + $older = $get["older"]; + $newer = $get["newer"]; + + $q = [ + "q" => $search + ]; + + /* + Pass older/newer filters to brave + */ + if($newer !== false){ + + $newer = date("Y-m-d", $newer); + + if($older === false){ + + $older = date("Y-m-d", time()); + } + } + + if( + is_string($older) === false && + $older !== false + ){ + + $older = date("Y-m-d", $older); + + if($newer === false){ + + $newer = "1970-01-02"; + } + } + + if($older !== false){ + + $q["tf"] = "{$newer}to{$older}"; + } + } + /* + $handle = fopen("scraper/brave.html", "r"); + $html = fread($handle, filesize("scraper/brave.html")); + fclose($handle); + */ + try{ + $html = + $this->get( + "https://search.brave.com/search", + $q, + $nsfw, + $country + ); + + }catch(Exception $error){ + + throw new Exception("Could not fetch search page"); + } + + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + // load html + $this->fuckhtml->load($html); + + /* + Get next page "token" + */ + $nextpage = + $this->fuckhtml + ->getElementsByClassName( + "btn ml-15", + "a" + ); + + if(count($nextpage) !== 0){ + + preg_match( + '/offset=([0-9]+)/', + $this->fuckhtml->getTextContent($nextpage[0]["attributes"]["href"]), + $nextpage + ); + + $q["offset"] = (int)$nextpage[1]; + $q["nsfw"] = $nsfw; + $q["country"] = $country; + + $out["npt"] = + $this->nextpage->store( + json_encode($q), + "web" + ); + } + + /* + Get discussions (and append them to web results) + */ + + // they're loaded using javascript!! 
+ $discussion = + $this->fuckhtml + ->getElementById( + "js-discussions", + "script" + ); + + if( + $discussion && + isset($discussion["attributes"]["data"]) + ){ + + $discussion = + json_decode( + $this->fuckhtml + ->getTextContent( + $discussion["attributes"]["data"] + ), + true + ); + + foreach($discussion["results"] as $result){ + + $data = [ + "title" => $this->titledots($result["title"]), + "description" => null, + "url" => $result["url"], + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + + // description + $data["description"] = + $this->limitstrlen( + $this->limitwhitespace( + $this->titledots( + $this->fuckhtml->getTextContent( + $result["description"] + ) + ) + ) + ); + + if($result["age"] != ""){ + $data["date"] = strtotime($result["age"]); + } + + // populate table + + if($result["data"]["num_answers"] != ""){ + $data["table"]["Replies"] = (int)$result["data"]["num_answers"]; + } + + if($result["data"]["score"] != ""){ + + $score = explode("|", $result["data"]["score"]); + + if(count($score) === 2){ + + $score = ((int)$score[1]) . " (" . trim($score[0]) . 
")"; + }else{ + + $score = (int)$score[0]; + } + + $data["table"]["Votes"] = $score; + } + + if($result["thumbnail"] != ""){ + + $data["thumb"]["url"] = $result["thumbnail"]; + $data["thumb"]["ratio"] = "16:9"; + } + + $out["web"][] = $data; + } + } + + /* + Get related searches + */ + $faq = + $this->fuckhtml + ->getElementById("js-faq", "script"); + + if( + $faq && + isset($faq["attributes"]["data"]) + ){ + + $faq = + json_decode( + $this->fuckhtml + ->getTextContent( + $faq["attributes"]["data"] + ), + true + ); + + foreach($faq["items"] as $related){ + + $out["related"][] = $related["question"]; + } + } + + /* + Get spelling autocorrect + */ + $altered = + $this->fuckhtml + ->getElementById("altered-query", "div"); + + if($altered){ + + $this->fuckhtml->load($altered); + + $altered = + $this->fuckhtml + ->getElementsByTagName("a"); + + if(count($altered) === 2){ + + $out["spelling"] = [ + "type" => "including", + "using" => + $this->fuckhtml + ->getTextContent($altered[0]), + "correction" => + $this->fuckhtml + ->getTextContent($altered[1]) + ]; + } + + $this->fuckhtml->load($html); + } + + /* + Get web results + */ + $resulthtml = + $this->fuckhtml + ->getElementById( + "results", + "div" + ); + + $this->fuckhtml->load($resulthtml); + $items = 0; + foreach( + $this->fuckhtml + ->getElementsByClassName("snippet fdb") + as $result + ){ + + $data = [ + "title" => null, + "description" => null, + "url" => null, + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + + if( + isset($result["attributes"]["data-type"]) && + $result["attributes"]["data-type"] == "ad" + ){ + + // is an ad, skip + continue; + } + + $this->fuckhtml->load($result); + + /* + Get title + */ + $title = + $this->fuckhtml + ->getElementsByClassName( + "snippet-title", + "span" + ); + + if(count($title) === 0){ + + // encountered AI summarizer + // or misspelling indicator @TODO + continue; + } + + 
if(isset($title[0]["attributes"]["title"])){ + + $data["title"] = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0]["attributes"]["title"] + ) + ); + }else{ + + $data["title"] = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0] + ) + ); + } + + /* + Get description + */ + $description = + $this->fuckhtml + ->getElementsByClassName( + "snippet-description", + "p" + ); + + if(count($description) !== 0){ + $data["description"] = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $description[0] + ) + ); + + // also check for thumbnail in here + $img = + $this->fuckhtml + ->getElementsByClassName( + "thumb", + "img" + ); + + if(count($img) !== 0){ + + $data["thumb"] = [ + "url" => $this->unshiturl($img[0]["attributes"]["src"]), + "ratio" => "16:9" + ]; + }else{ + + // might be a video thumbnail wrapper? + $wrapper = + $this->fuckhtml + ->getElementsByClassName( + "video-thumb", + "a" + ); + + if(count($wrapper) !== 0){ + + // we found a video + $this->fuckhtml->load($wrapper[0]); + + $img = + $this->fuckhtml + ->getElementsByTagName("img"); + + $data["thumb"] = [ + "url" => $this->unshiturl($img[0]["attributes"]["src"]), + "ratio" => "16:9" + ]; + + // get the video length, if its there + $duration = + $this->fuckhtml + ->getElementsByClassName( + "duration", + "div" + ); + + if(count($duration) !== 0){ + + $data["table"]["Duration"] = $duration[0]["innerHTML"]; + } + + // reset html load + $this->fuckhtml->load($result); + } + } + + }else{ + + // is a steam/shop listing + $description_alt = + $this->fuckhtml + ->getElementsByClassName( + "text-sm", + "div" + ); + + if(count($description_alt) !== 0){ + + switch($description_alt[0]["attributes"]["class"]){ + + case "text-sm text-gray": + case "description text-sm": + + $data["description"] = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $description_alt[0] + ) + ); + break; + } + + // get table sublink + $sublink = + $this->fuckhtml + 
->getElementsByClassName(
								"r-attr text-sm",
								"div"
							);
					
					if(count($sublink) !== 0){
						
						$this->tablesublink($sublink, $data);
					}
					
					// check for thumb element
					$data["thumb"] = $this->getimagelinkfromstyle("thumb");
				}else{
					
					// ok... finally...
					// maybe its the instant answer thingy
					$answer =
						$this->fuckhtml
						->getElementsByClassName("answer");
					
					if(count($answer) !== 0){
						
						$data["description"] =
							$this->titledots(
								$this->fuckhtml
								->getTextContent($answer[0])
							);
					}
				}
			}
			
			/*
				Split a leading "<date> - " prefix off the description.
				
				The description is cut at the first " - ". When the part in
				front of it parses with strtotime() it becomes the result's
				date and is removed from the description.
			*/
			// cast: the description is still null when no description
			// element was found, and explode() must be given a string
			$data["description"] = explode(" - ", (string)$data["description"], 2);
			
			// explode() returns at least one element, so the old
			// "=== 0" test never matched and a description without any
			// separator was handed to strtotime() as a whole
			if(count($data["description"]) === 1){
				
				// no separator: nothing to do
				$data["description"] = $data["description"][0];
			}else{
				
				// attempt to parse
				$time = strtotime($data["description"][0]);
				
				if($time !== false){
					
					// got response
					$data["date"] = $time;
					
					array_shift($data["description"]);
				}
				
				// merge back
				$data["description"] =
					implode(" - ", $data["description"]);
			}
			
			/*
				Check content type
			*/
			$content_type =
				$this->fuckhtml
				->getElementsByClassName(
					"content-type",
					"span"
				);
			
			if(count($content_type) !== 0){
				
				$data["type"] =
					strtolower($this->fuckhtml->getTextContent($content_type[0]));
			}
			
			/*
				Check subtext table thingy
			*/
			$table_items =
				array_merge(
					$this->fuckhtml
					->getElementsByClassName(
						"item-attributes",
						"div"
					),
					$this->fuckhtml
					->getElementsByClassName(
						"r",
						"div"
					)
				);
			
			/*
				DIV: item-attributes
			*/
			if(count($table_items) !== 0){
				
				foreach($table_items as $table){
					
					$this->fuckhtml->load($table);
					
					$span =
						$this->fuckhtml
						->getElementsByClassName(
							"text-sm",
							"*"
						);
					
					foreach($span as $item){
						
						// "Key: value" text becomes one table row
						$item =
							explode(
								":",
								$this->fuckhtml->getTextContent(preg_replace('/\n/', " ", $item["innerHTML"])),
								2
							);
						
						if(count($item) === 2){
							
							$data["table"][trim($item[0])] = trim($this->limitwhitespace($item[1]));
						}
					}
				}
				
$this->fuckhtml->load($result); + } + + // get video sublinks + $table_items = + $this->fuckhtml + ->getElementsByClassName( + "snippet-description published-time", + "p" + ); + + if(count($table_items) !== 0){ + + $table_items = + explode( + '<span class="mr-15"></span>', + $table_items[0]["innerHTML"], + 2 + ); + if(count($table_items) === 2){ + + $item2 = []; + + $item2[] = explode(":", $this->fuckhtml->getTextContent($table_items[0])); + + if(trim($table_items[1]) != ""){ + $item2[] = explode(":", $this->fuckhtml->getTextContent($table_items[1])); + } + + foreach($item2 as $it){ + + $data["table"][trim($it[0])] = trim($it[1]); + } + } + } + + /* + Get URL + */ + $data["url"] = + $this->fuckhtml->getTextContent( + $this->fuckhtml + ->getElementsByTagName("a") + [0] + ["attributes"] + ["href"] + ); + + /* + Get sublinks + */ + $sublinks_elems = + $this->fuckhtml + ->getElementsByClassName( + "snippet", + "div" + ); + + $sublinks = []; + + foreach($sublinks_elems as $sublink){ + + $this->fuckhtml->load($sublink); + + $a = + $this->fuckhtml + ->getElementsByTagName("a")[0]; + + $title = + $this->fuckhtml + ->getTextContent($a); + + $url = $a["attributes"]["href"]; + + $description = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByTagName("p")[0] + ) + ); + + $sublinks[] = [ + "title" => $title, + "date" => null, + "description" => $description, + "url" => $url + ]; + } + + /* + Get smaller sublinks + */ + $sublinks_elems = + $this->fuckhtml + ->getElementsByClassName( + "deep-link", + "a" + ); + + foreach($sublinks_elems as $sublink){ + + $sublinks[] = [ + "title" => $this->fuckhtml->getTextContent($sublink), + "date" => null, + "description" => null, + "url" => $sublink["attributes"]["href"] + ]; + } + + // append sublinks to $data !! 
+ $data["sublink"] = $sublinks; + + // append first result to start of $out["web"] + // other results are after + if($items === 0){ + + $out["web"] = [$data, ...$out["web"]]; + }else{ + + $out["web"][] = $data; + } + $items++; + } + + /* + Get news + */ + $this->fuckhtml->load($resulthtml); + $news_carousel = $this->fuckhtml->getElementById("news-carousel"); + + $this->fuckhtml->load($news_carousel); + + if($news_carousel){ + + $a = + $this->fuckhtml + ->getElementsByClassName( + "card fdb", + "a" + ); + + foreach($a as $news){ + + $this->fuckhtml->load($news); + + $out["news"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "title", + "div" + )[0] + ) + ), + "description" => null, + "date" => + strtotime( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "card-footer__timestamp", + "span" + )[0] + ) + ), + "thumb" => $this->getimagelinkfromstyle("img-bg"), + "url" => $this->fuckhtml->getTextContent($news["attributes"]["href"]) + ]; + } + } + + + + /* + Get videos + */ + $this->fuckhtml->load($resulthtml); + $news_carousel = $this->fuckhtml->getElementById("video-carousel"); + + $this->fuckhtml->load($news_carousel); + + if($news_carousel){ + + $a = + $this->fuckhtml + ->getElementsByClassName( + "card fdb", + "a" + ); + + foreach($a as $video){ + + $this->fuckhtml->load($video); + + $date = null; + + $date_o = + $this->fuckhtml + ->getElementsByClassName( + "text-gray text-xs", + "span" + ); + + if(count($date_o) !== 0){ + + $date = + strtotime( + $this->fuckhtml + ->getTextContent( + $date_o[0] + ) + ); + } + + $out["video"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "title", + "div" + )[0] + ) + ), + "description" => null, + "date" => $date, + "duration" => null, + "views" => null, + "thumb" => $this->getimagelinkfromstyle("img-bg"), + "url" => 
$this->fuckhtml->getTextContent($video["attributes"]["href"]) + ]; + } + } + + + /* + Get DEFINITION snippet + */ + $this->fuckhtml->load($html); + $infobox = $this->fuckhtml->getElementById("rh-definitions", "div"); + + if($infobox !== false){ + + $answer = [ + "title" => null, + "description" => [], + "url" => null, + "thumb" => null, + "table" => [], + "sublink" => [] + ]; + + $this->fuckhtml->load($infobox); + + $answer["title"] = + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "header", + "h5" + )[0] + ); + + $sections = + $this->fuckhtml + ->getElementsByTagName("section"); + + $i = -1; + foreach($sections as $section){ + + $this->fuckhtml->load($section); + $items = + $this->fuckhtml + ->getElementsByTagName("*"); + + $li = 1; + $pronounce = false; + foreach($items as $item){ + + switch($item["tagName"]){ + + case "h6": + + if( + isset($item["attributes"]["class"]) && + $item["attributes"]["class"] == "h6 pronunciation" + ){ + + if($pronounce){ + + break; + } + + $answer["description"][] = [ + "type" => "quote", + "value" => + $this->fuckhtml + ->getTextContent( + $item + ) + ]; + + $answer["description"][] = + [ + "type" => "audio", + "url" => "https://search.brave.com/api/rhfetch?rhtype=definitions&word={$answer["title"]}&source=ahd-5" + ]; + + $pronounce = true; + $i = $i + 2; + break; + } + + $answer["description"][] = [ + "type" => "title", + "value" => + $this->fuckhtml + ->getTextContent( + $item + ) + ]; + $i++; + break; + + case "li": + + if( + $i !== -1 && + $answer["description"][$i]["type"] == "text" + ){ + + $answer["description"][$i]["value"] .= + "\n" . $li . ". " . + $this->fuckhtml + ->getTextContent( + $item + ); + + }else{ + $answer["description"][] = [ + "type" => "text", + "value" => + $li . ". " . 
+ $this->fuckhtml + ->getTextContent( + $item + ) + ]; + $i++; + } + $li++; + break; + + case "a": + $answer["url"] = + $this->fuckhtml + ->getTextContent( + $item["attributes"]["href"] + ); + break; + } + } + } + + $out["answer"][] = $answer; + } + + + /* + Get instant answer + */ + $this->fuckhtml->load($html); + $infobox = $this->fuckhtml->getElementById("infobox", "div"); + + if($infobox !== false){ + + $answer = [ + "title" => null, + "description" => [], + "url" => null, + "thumb" => null, + "table" => [], + "sublink" => [] + ]; + + $this->fuckhtml->load($infobox); + $div = $this->fuckhtml->getElementsByTagName("div"); + + /* + Get title + url + */ + $title = + $this->fuckhtml + ->getElementsByClassName("infobox-title", "a"); + + if(count($title) !== 0){ + + $answer["title"] = + $this->fuckhtml + ->getTextContent( + $title[0] + ); + + $answer["url"] = + $this->fuckhtml + ->getTextContent( + $title[0]["attributes"]["href"] + ); + } + + /* + Get thumbnail + */ + $thumb = $this->getimagelinkfromstyle("thumb"); + + if($thumb["url"] !== null){ + + $answer["thumb"] = $thumb["url"]; + } + + /* + Get table + */ + $title = + $this->fuckhtml + ->getElementsByClassName( + "infobox-attr-header", + "div" + ); + + $rowhtml = $infobox; + + if(count($title) >= 2){ + + $rowhtml = + explode( + $title[1]["outerHTML"], + $infobox["innerHTML"], + 2 + )[0]; + } + + $this->fuckhtml->load($rowhtml); + + $rows = + $this->fuckhtml + ->getElementsByClassName("infobox-attr", "div"); + + foreach($rows as $row){ + + if(!isset($row["innerHTML"])){ + + continue; + } + + $this->fuckhtml->load($row); + $span = + $this->fuckhtml + ->getElementsByTagName("span"); + + if(count($span) === 2){ + + $answer["table"][ + $this->fuckhtml->getTextContent($span[0]) + ] = str_replace("\n", ", ", $this->fuckhtml->getTextContent($span[1], true)); + } + } + + $this->fuckhtml->load($infobox); + + /* + Parse stackoverflow answers + */ + $code = + $this->fuckhtml + ->getElementById("codebox-answer", $div); + + 
if($code){ + + // this might be standalone text with no paragraphs, check for that + $author = + $this->fuckhtml + ->getElementById("author"); + + $desc_tmp = + str_replace( + $author["outerHTML"], + "", + $code["innerHTML"] + ); + + $this->fuckhtml->load($desc_tmp); + $code = + $this->fuckhtml + ->getElementsByTagName("*"); + + if(count($code) === 0){ + + $answer["description"] = + [ + [ + "type" => "text", + "value" => + $this->fuckhtml + ->getTextContent( + $desc_tmp + ) + ], + [ + "type" => "quote", + "value" => + $this->fuckhtml + ->getTextContent( + $author + ) + ] + ]; + }else{ + + $text = []; + $i = 0; + + foreach($code as $snippet){ + + switch($snippet["tagName"]){ + + case "p": + $this->fuckhtml->load($snippet["innerHTML"]); + + $codetags = + $this->fuckhtml + ->getElementsByTagName("*"); + + $tmphtml = $snippet["innerHTML"]; + + foreach($codetags as $tag){ + + if(!isset($tag["outerHTML"])){ + + continue; + } + + $tmphtml = + explode( + $tag["outerHTML"], + $tmphtml, + 2 + ); + + $value = $this->fuckhtml->getTextContent($tmphtml[0], false, false); + $this->appendtext($value, $text, $i); + + $type = null; + switch($tag["tagName"]){ + + case "code": $type = "inline_code"; break; + case "em": $type = "italic"; break; + case "blockquote": $type = "quote"; break; + default: $type = "text"; + } + + if($type !== null){ + $value = $this->fuckhtml->getTextContent($tag, false, true); + + if(trim($value) != ""){ + + if( + $i !== 0 && + $type == "title" + ){ + + $text[$i - 1]["value"] = rtrim($text[$i - 1]["value"]); + } + + $text[] = [ + "type" => $type, + "value" => $value + ]; + $i++; + } + } + + if(count($tmphtml) === 2){ + + $tmphtml = $tmphtml[1]; + }else{ + + break; + } + } + + if(is_array($tmphtml)){ + + $tmphtml = $tmphtml[0]; + } + + if(strlen($tmphtml) !== 0){ + + $value = $this->fuckhtml->getTextContent($tmphtml, false, false); + $this->appendtext($value, $text, $i); + } + break; + + case "pre": + + switch($text[$i - 1]["type"]){ + + case "text": + case 
"italic": + $text[$i - 1]["value"] = rtrim($text[$i - 1]["value"]); + break; + } + + $text[] = + [ + "type" => "code", + "value" => + rtrim( + $this->fuckhtml + ->getTextContent( + $snippet, + true, + false + ) + ) + ]; + $i++; + + break; + + case "ol": + $o = 0; + + $this->fuckhtml->load($snippet); + $li = + $this->fuckhtml + ->getElementsByTagName("li"); + + foreach($li as $elem){ + $o++; + + $this->appendtext( + $o . ". " . + $this->fuckhtml + ->getTextContent( + $elem + ), + $text, + $i + ); + } + break; + } + } + + if( + $i !== 0 && + $text[$i - 1]["type"] == "text" + ){ + + $text[$i - 1]["value"] = rtrim($text[$i - 1]["value"]); + } + + if($author){ + + $text[] = [ + "type" => "quote", + "value" => $this->fuckhtml->getTextContent($author) + ]; + } + + $answer["description"] = $text; + } + }else{ + + /* + Get normal description + */ + $description = + $this->fuckhtml + ->getElementsByClassName( + "mb-6", + "div" + ); + + if(count($description) !== 0){ + + $description = + [ + [ + "type" => "text", + "value" => + $this->titledots( + preg_replace( + '/ Wikipedia$/', + "", + $this->fuckhtml + ->getTextContent( + $description[0] + ) + ) + ) + ] + ]; + + $ratings = + $this->fuckhtml + ->getElementById("ratings"); + + if($ratings){ + + $this->fuckhtml->load($ratings); + + $ratings = + $this->fuckhtml + ->getElementsByClassName( + "flex-hcenter mb-10", + "div" + ); + + $description[] = [ + "type" => "title", + "value" => "Ratings" + ]; + + foreach($ratings as $rating){ + + $this->fuckhtml->load($rating); + + $num = + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "r-num", + "div" + )[0] + ); + + $href = + $this->fuckhtml + ->getElementsByClassName( + "mr-10", + "a" + )[0]; + + $votes = + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "text-sm", + "span" + )[0] + ); + + $c = count($description) - 1; + + if( + $c !== -1 && + $description[$c]["type"] == "text" + ){ + + $description[$c]["value"] .= 
$num . " "; + }else{ + + $description[] = [ + "type" => "text", + "value" => $num . " " + ]; + } + + $description[] = [ + "type" => "link", + "value" => $this->fuckhtml->getTextContent($href), + "url" => $this->fuckhtml->getTextContent($href["attributes"]["href"]) + ]; + + $description[] = [ + "type" => "text", + "value" => " (" . $votes . ")\n" + ]; + } + } + + $answer["description"] = $description; + } + } + + /* + Get sublinks + */ + $this->fuckhtml->load($infobox); + + $profiles = + $this->fuckhtml + ->getElementById("profiles"); + + if($profiles){ + $profiles = + $this->fuckhtml + ->getElementsByClassName( + "chip", + "a" + ); + + foreach($profiles as $profile){ + + $name = $this->fuckhtml->getTextContent($profile["attributes"]["title"]); + + if(strtolower($name) == "steampowered"){ + + $name = "Steam"; + } + + $answer["sublink"][$name] = + $this->fuckhtml->getTextContent($profile["attributes"]["href"]); + } + } + + $actors = + $this->fuckhtml + ->getElementById("panel-movie-cast"); + + if($actors){ + + $this->fuckhtml->load($actors); + + $actors = + $this->fuckhtml + ->getElementsByClassName("card"); + + $answer["description"][] = [ + "type" => "title", + "value" => "Cast" + ]; + + foreach($actors as $actor){ + + $this->fuckhtml->load($actor); + + $answer["description"][] = [ + "type" => "text", + "value" => + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName("card-body") + [0] + ) + ]; + + $answer["description"][] = [ + "type" => "image", + "url" => $this->getimagelinkfromstyle("person-thumb")["url"] + ]; + } + } + + $out["answer"][] = $answer; + } + + /* + Get actor standalone thingy + */ + $this->fuckhtml->load($resulthtml); + $actors = + $this->fuckhtml + ->getElementById("predicate-entity"); + + if($actors){ + + $this->fuckhtml->load($actors); + + $cards = + $this->fuckhtml + ->getElementsByClassName("card"); + + $url = + $this->fuckhtml + ->getElementsByClassName( + "disclaimer", + "div" + )[0]; + + 
$this->fuckhtml->load($url); + + $url = + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByTagName("a") + [0] + ["attributes"] + ["href"] + ); + + $this->fuckhtml->load($actors); + + $answer = [ + "title" => + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "entity", + "span" + )[0] + ) . " (Cast)", + "description" => [], + "url" => $url, + "sublink" => [], + "thumb" => null, + "table" => [] + ]; + + foreach($cards as $card){ + + $this->fuckhtml->load($card); + + $answer["description"][] = [ + "type" => "title", + "value" => + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "title" + )[0] + ) + ]; + + $answer["description"][] = [ + "type" => "text", + "value" => + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "text-xs desc" + )[0] + ) + ]; + + $answer["description"][] = [ + "type" => "image", + "url" => $this->getimagelinkfromstyle("img-bg")["url"] + ]; + } + + $out["answer"][] = $answer; + } + + return $out; + } + + public function news($get){ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $nsfw = $get["nsfw"]; + $country = $get["country"]; + + if(strlen($search) > 2048){ + + throw new Exception("Search query is too long!"); + } + /* + $handle = fopen("scraper/brave-news.html", "r"); + $html = fread($handle, filesize("scraper/brave-news.html")); + fclose($handle);*/ + try{ + $html = + $this->get( + "https://search.brave.com/news", + [ + "q" => $search + ], + $nsfw, + $country + ); + + }catch(Exception $error){ + + throw new Exception("Could not fetch search page"); + } + + $out = [ + "status" => "ok", + "npt" => null, + "news" => [] + ]; + + // load html + $this->fuckhtml->load($html); + + $news = + $this->fuckhtml + ->getElementsByClassName( + "snippet inline gap-standard", + "div" + ); + + foreach($news as $article){ + + $data = [ + "title" => null, + "author" => null, 
+ "description" => null, + "date" => null, + "thumb" => + [ + "url" => null, + "ratio" => null + ], + "url" => null + ]; + + $this->fuckhtml->load($article); + $elems = + $this->fuckhtml + ->getElementsByTagName("*"); + + // get title + $data["title"] = + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "snippet-title", + $elems + ) + [0] + ["innerHTML"] + ); + + // get description + $data["description"] = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "snippet-description", + $elems + ) + [0] + ["innerHTML"] + ) + ); + + // get date + $date = + explode( + "•", + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "snippet-url", + $elems + )[0] + ) + ); + + if( + count($date) !== 1 && + trim($date[1]) != "" + ){ + + $data["date"] = + strtotime( + $date[1] + ); + } + + // get URL + $data["url"] = + $this->fuckhtml->getTextContent( + $this->unshiturl( + $this->fuckhtml + ->getElementsByClassName( + "result-header", + $elems + ) + [0] + ["attributes"] + ["href"] + ) + ); + + // get thumbnail + $thumb = + $this->fuckhtml + ->getElementsByTagName( + "img" + ); + + if( + count($thumb) === 2 && + trim( + $thumb[1] + ["attributes"] + ["src"] + ) != "" + ){ + + $data["thumb"] = [ + "url" => + $this->fuckhtml->getTextContent( + $this->unshiturl( + $thumb[1] + ["attributes"] + ["src"] + ) + ), + "ratio" => "16:9" + ]; + } + + $out["news"][] = $data; + } + + return $out; + } + + /* + public function bypasscaptcha($html, $nsfw, $country){ + + // @TODO figure out why I still cant go trough + // the captcha wall even after breaking it + + try{ + $html = + $this->get( + "https://search.brave.com/goggles", + [ + "q" => "site:dailymotion.com my bloody valentine" + ], + $nsfw, + $country + ); + + }catch(Exception $error){ + + throw new Exception("Could not fetch html"); + } + + // Bypass brave search captcha + // this captcha only appears on the goggles page + 
preg_match( + '/this\.img\.src = "(.*)"/', + $html, + $image + ); + + $image = + base64_decode( + explode( + "data:image/png;base64,", + $image[1] + )[1] + ); + + $im = new Imagick(); + $im->readImageBlob($image); + + $im->blurImage(20, 20); + $im->posterizeImage(2, imagick::IMGTYPE_COLORSEPARATION); + + // if we encounter a white line thats longer than 45px + // we found the circle position + $iterator = $im->getPixelRegionIterator(0, 77, 310, 1); + + $found = null; + foreach( + $iterator as $row + ){ + + $whitecount = 0; + $count = 0; + + foreach($row as $pixel){ + + if($pixel->getColor()["r"] === 255){ + + $whitecount++; + $pixel->setColor("rgba(255,0,0,0)"); + + if($whitecount === 45){ + + $found = $count - 45; + break 2; + } + }else{ + + $whitecount = 0; + } + + $count++; + $iterator->syncIterator(); + } + } + + $found = $found + 10; + + //header("Content-Type: image/png"); + //echo $im; + //die(); + + if($found === null){ + + throw new Exception("Could not bypass captcha"); + } + + preg_match( + '/data="{"captcha_id":"([0-9A-z-]+)"}"/', + $html, + $key + ); + + $key = $key[1]; + // we bypassed captcha, send POST data + $order = + $this->get( + "https://search.brave.com/api/captcha?brave=0&captcha_id={$key}", + [ + "solution" => (string)$found + ], + $nsfw, + $country, + true + ); + + $order = json_decode($order, true)["orderId"]; + + $orderpayload = + $this->get( + "https://search.brave.com/api/rewards/v1/orders/{$order}", + [], + $nsfw, + $country + ); + + $orderpayload = json_decode($orderpayload, true); + + $creds = + $this->get( + "https://search.brave.com/api/rewards/v1/orders/{$order}/credentials", + [ + "itemId" => $orderpayload["items"][0]["id"], + "blindedCreds" => [ + "fuYAVcB/m7BU66vf3wkNGxJCSaRhshB9o+8km3F1h2c=", + "uswvcWJuPK/1qFlVdzBP3eQd0+V1EQgfAtnEoMIK+Uk=", + "fJWKGLBxl3Gyn4n9FjTLq1PjupfABT7Ni8MeB+iGzUs=", + "Aq9enJ/VZP9GxQIza3n65ZK7xQhY4VwDxv53BCb/Txg=", + "FMJA9eSLHq71K+Pcwgm4gIQOmdR/6KMy5cMgXhpd5Ro=", + 
"2NVhIAbvI317SP9/xXbVe/U57eWgvHyqVbHL/5+Gdmw=", + "6mpjsjSCmYEzK2xlbL8DI2P4LuhWUOxjTLvsTAL9l24=", + "kAn4wuHvIlKWhfuFfPTSfD4tZ5le9t7/61YbdEc/L3k=", + "BjjUyG16aTfd1c0h4oBzgQQOekrH1f+a5CmcXqMPTR4=", + "SBNgpCt4/V44yaQTfh+D027Yv1GJFHkjUEpPw6rAwRI=", + "XDENAtdQ7PyYx+Qx1wQGQtDWgg8WpIMgWGmd4RDOVWE=", + "tF7rB4sqamsiUk3K7fojdQSI0Q6iip72yKyhnvg/bC0=", + "VsAqflirAd/u4VsLdfRS2UvnH24ZNkFh6YN3DctLjzQ=", + "MntLbXkoI0LdcisCbNazmooiHXJyX91L1KERDAu1JRU=", + "TH6Zs8JBvFDbTDWgKbfGE4M5/cSwCtHD8ms5Y/U8zHQ=", + "jsZg0Z+qDPHymrbhdnesodhLNJ26QdunyMko1aVe4So=", + "rpKsyj6/vdnuMgLI2BApeijtGq9g5USRDL0w6X2bnlQ=", + "vCzliGT8A9vcLXj2sFf2kavOuYw69d70NpfgA22B4lI=", + "7OWoxSCtYXWcaBSifF7AXNBif/sjcuO0IelzXG/3PFk=", + "iiXtByNlT6nDMN9De5B58Jl8J0p6LCjnZ9aS3w2FEQU=", + "zDhd7gsJ4h4JkDeGK0Y0mfFd8IBdkLhMOANzwO+4Dig=", + "qANZ+AikwFReEA61JF009d/c3IHM/aSfIYwljckhJWE=", + "nNC30pDLxtXvUr+WDwfDSrAInNBpfSZkPsV2JlpheWI=", + "kGXE1pkt25P71kdJzmKIg4+yMR1VA5wNmbpBb/FhJQ8=", + "aLqPsY1Qiz2UCa2Jx3YNNt8r4JINMphks/43EiyZfXU=", + "bHGYZoQARZEM5LdFF6B74PkRqNd9EKxzuTvGYxjq+hk=", + "JOsYQjfE/9Y1u29hR+GvEkNyxUI8blgLhX1iJI/aGRQ=", + "yKjHjH5j600TJD/3WPsA1N3OmItDLifdjlysq4H6NV0=", + "9lTnUbsPp7BJ7XVN5/T4yGfzD9DJdqWB7xk72s19MAA=", + "5KHG8iY45em7zDhO/HlI0ydcZ0Ubn+XSyjifMmy7qXM=" + ] + ], + $nsfw, + $country, + true + ); + + var_dump($creds); + + sleep(2); + $test = + $this->get( + "https://search.brave.com/api/rewards/v1/orders/{$order}/credentials", + [], + $nsfw, + $country + ); + + var_dump($test); + + $html = + $this->get( + "https://search.brave.com/goggles", + [ + "q" => "site:dailymotion.com my bloody valentine" + ], + $nsfw, + $country, + false, + 
"__Secure-sku#brave-search-captcha=eyJ0eXBlIjoic2luZ2xlLXVzZSIsInZlcnNpb24iOjEsInNrdSI6ImJyYXZlLXNlYXJjaC1jYXB0Y2hhIiwicHJlc2VudGF0aW9uIjoiZXlKcGMzTjFaWElpT2lKaWNtRjJaUzVqYjIwL2MydDFQV0p5WVhabExYTmxZWEpqYUMxallYQjBZMmhoSWl3aWMybG5ibUYwZFhKbElqb2lNRzl0VDBneWQxZ3dTazkzU0VFMVJ6QTJaR1V5WjFOQ1dDdGhSM3B2Y2xsTVQwVTJZVVJtTUc5a1IweG1Wa3RhZEd0cU4xbHdia3BPT0VOVGNGbE5lVWR2YmpGRlNTOUhhMlZYU1RWNGQxTjJPWGxJTTNjOVBTSXNJblFpT2lKWlJWWldaVzR5TTJwQ01tSnZkakJ2U1hGNGJtSndUMGxEUW5Kd1drRjBRbWQxVnpoRlNURTNVREY2UVRaQlpUTXJSVGRFYm5NeVFqUmhka0pGYTFWM2FGY3JWRVZJVjNWcE9TdFllRU1yYlVSTVkyMTBRVDA5SW4wPSJ9" + ); + + var_dump($html); + }*/ + + private function appendtext($payload, &$text, &$index){ + + if(trim($payload) == ""){ + + return; + } + + if( + $index !== 0 && + $text[$index - 1]["type"] == "text" + ){ + + $text[$index - 1]["value"] .= "\n\n" . preg_replace('/ $/', " ", $payload); + }else{ + + $text[] = [ + "type" => "text", + "value" => preg_replace('/ $/', " ", $payload) + ]; + $index++; + } + } + + private function tablesublink($html_collection, &$data){ + + foreach($html_collection as $html){ + + $html["innerHTML"] = preg_replace( + '/<style>[\S\s]*<\/style>/i', + "", + $html["innerHTML"] + ); + + $html = + explode( + ":", + $this->fuckhtml->getTextContent($html), + 2 + ); + + if(count($html) === 1){ + + $html = ["Rating", $html[0]]; + } + + $data["table"][trim($html[0])] = trim($html[1]); + } + } + + private function getimagelinkfromstyle($thumb){ + + $thumb = + $this->fuckhtml + ->getElementsByClassName( + $thumb, + "div" + ); + + if(count($thumb) === 0){ + + return [ + "url" => null, + "ratio" => null + ]; + } + + $thumb = $thumb[0]["attributes"]["style"]; + + preg_match( + '/background-image: ?url\((\'[^\']+\'|"[^"]+"|[^\)]+)\)/', + $thumb, + $thumb + ); + + $url = $this->fuckhtml->getTextContent($this->unshiturl(trim($thumb[1], '"\' '))); + + if(parse_url($url, PHP_URL_HOST) == "cdn.search.brave.com"){ + + return [ + "url" => null, + "ratio" => null + ]; + } + + return [ + "url" => 
$url, + "ratio" => "16:9" + ]; + } + + private function limitstrlen($text){ + + return explode("\n", wordwrap($text, 300, "\n"))[0]; + } + + private function limitwhitespace($text){ + + return + preg_replace( + '/[\s]+/', + " ", + $text + ); + } + + private function titledots($title){ + + $substr = substr($title, -3); + + if( + $substr == "..." || + $substr == "…" + ){ + + return trim(substr($title, 0, -3)); + } + + return trim($title); + } + + private function unshiturl($url){ + + // https://imgs.search.brave.com/XFnbR8Sl7ge82MBDEH7ju0UHImRovMVmQ2qnDvgNTuA/rs:fit:844:225:1/g:ce/aHR0cHM6Ly90c2U0/Lm1tLmJpbmcubmV0/L3RoP2lkPU9JUC54/UWotQXU5N2ozVndT/RDJnNG9BNVhnSGFF/SyZwaWQ9QXBp.jpeg + + $tmp = explode("aHR0", $url); + + if(count($tmp) !== 2){ + + // nothing to do + return $url; + } + + return + base64_decode( + "aHR0" . + str_replace(["/", "_"], ["", "/"], + explode( + ".", + $tmp[1] + )[0] + ) + ); + } +} diff --git a/scraper/ddg.php b/scraper/ddg.php new file mode 100644 index 0000000..c9c28af --- /dev/null +++ b/scraper/ddg.php @@ -0,0 +1,2722 @@ +<?php + +class ddg{ + + public function __construct(){ + + include "lib/nextpage.php"; + $this->nextpage = new nextpage("ddg"); + } + + /* + curl functions + */ + private const req_web = 0; + private const req_xhr = 1; + + private function get($url, $get = [], $reqtype = self::req_web){ + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . 
$get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + switch($reqtype){ + case self::req_web: + $headers = + ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0", + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Encoding: gzip", + "Accept-Language: en-US,en;q=0.5", + "DNT: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: cross-site", + "Upgrade-Insecure-Requests: 1"]; + break; + + case self::req_xhr: + $headers = + ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0", + "Accept: */*", + "Accept-Encoding: gzip", + "Accept-Language: en-US,en;q=0.5", + "Connection: keep-alive", + "Referer: https://duckduckgo.com/", + "X-Requested-With: XMLHttpRequest", + "DNT: 1", + "Sec-Fetch-Dest: script", + "Sec-Fetch-Mode: no-cors", + "Sec-Fetch-Site: same-site"]; + break; + } + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + return $data; + } + + public function getfilters($pagetype){ + + switch($pagetype){ + + case "web": + return + [ + "country" => [ + "display" => "Country", + "option" => [ + "any" => "All Regions", + "ar-es" => "Argentina", + "au-en" => "Australia", + "at-de" => "Austria", + "be-fr" => "Belgium (fr)", + "be-nl" => "Belgium (nl)", + "br-pt" => "Brazil", + "bg-bg" => "Bulgaria", + "ca-en" => "Canada (en)", + "ca-fr" => "Canada (fr)", + 
"ct-ca" => "Catalonia", + "cl-es" => "Chile", + "cn-zh" => "China", + "co-es" => "Colombia", + "hr-hr" => "Croatia", + "cz-cs" => "Czech Republic", + "dk-da" => "Denmark", + "ee-et" => "Estonia", + "fi-fi" => "Finland", + "fr-fr" => "France", + "de-de" => "Germany", + "gr-el" => "Greece", + "hk-tzh" => "Hong Kong", + "hu-hu" => "Hungary", + "in-en" => "India (en)", + "id-en" => "Indonesia (en)", + "ie-en" => "Ireland", + "il-en" => "Israel (en)", + "it-it" => "Italy", + "jp-jp" => "Japan", + "kr-kr" => "Korea", + "lv-lv" => "Latvia", + "lt-lt" => "Lithuania", + "my-en" => "Malaysia (en)", + "mx-es" => "Mexico", + "nl-nl" => "Netherlands", + "nz-en" => "New Zealand", + "no-no" => "Norway", + "pk-en" => "Pakistan (en)", + "pe-es" => "Peru", + "ph-en" => "Philippines (en)", + "pl-pl" => "Poland", + "pt-pt" => "Portugal", + "ro-ro" => "Romania", + "ru-ru" => "Russia", + "xa-ar" => "Saudi Arabia", + "sg-en" => "Singapore", + "sk-sk" => "Slovakia", + "sl-sl" => "Slovenia", + "za-en" => "South Africa", + "es-ca" => "Spain (ca)", + "es-es" => "Spain (es)", + "se-sv" => "Sweden", + "ch-de" => "Switzerland (de)", + "ch-fr" => "Switzerland (fr)", + "tw-tzh" => "Taiwan", + "th-en" => "Thailand (en)", + "tr-tr" => "Turkey", + "us-en" => "US (English)", + "us-es" => "US (Spanish)", + "ua-uk" => "Ukraine", + "uk-en" => "United Kingdom", + "vn-en" => "Vietnam (en)" + ] + ], + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", + "maybe" => "Maybe", + "no" => "No" + ] + ], + "newer" => [ + "display" => "Newer than", + "option" => "_DATE" + ], + "older" => [ + "display" => "Older than", + "option" => "_DATE" + ], + "extendedsearch" => [ + // undefined display, so it wont show in frontend + "option" => [ + "yes" => "Yes", + "no" => "No" + ] + ] + ]; + break; + + case "images": + return + [ + "country" => [ + "display" => "Country", + "option" => [ + "us-en" => "US (English)", + "ar-es" => "Argentina", + "au-en" => "Australia", + "at-de" => "Austria", + "be-fr" => 
"Belgium (fr)", + "be-nl" => "Belgium (nl)", + "br-pt" => "Brazil", + "bg-bg" => "Bulgaria", + "ca-en" => "Canada (en)", + "ca-fr" => "Canada (fr)", + "ct-ca" => "Catalonia", + "cl-es" => "Chile", + "cn-zh" => "China", + "co-es" => "Colombia", + "hr-hr" => "Croatia", + "cz-cs" => "Czech Republic", + "dk-da" => "Denmark", + "ee-et" => "Estonia", + "fi-fi" => "Finland", + "fr-fr" => "France", + "de-de" => "Germany", + "gr-el" => "Greece", + "hk-tzh" => "Hong Kong", + "hu-hu" => "Hungary", + "in-en" => "India (en)", + "id-en" => "Indonesia (en)", + "ie-en" => "Ireland", + "il-en" => "Israel (en)", + "it-it" => "Italy", + "jp-jp" => "Japan", + "kr-kr" => "Korea", + "lv-lv" => "Latvia", + "lt-lt" => "Lithuania", + "my-en" => "Malaysia (en)", + "mx-es" => "Mexico", + "nl-nl" => "Netherlands", + "nz-en" => "New Zealand", + "no-no" => "Norway", + "pk-en" => "Pakistan (en)", + "pe-es" => "Peru", + "ph-en" => "Philippines (en)", + "pl-pl" => "Poland", + "pt-pt" => "Portugal", + "ro-ro" => "Romania", + "ru-ru" => "Russia", + "xa-ar" => "Saudi Arabia", + "sg-en" => "Singapore", + "sk-sk" => "Slovakia", + "sl-sl" => "Slovenia", + "za-en" => "South Africa", + "es-ca" => "Spain (ca)", + "es-es" => "Spain (es)", + "se-sv" => "Sweden", + "ch-de" => "Switzerland (de)", + "ch-fr" => "Switzerland (fr)", + "tw-tzh" => "Taiwan", + "th-en" => "Thailand (en)", + "tr-tr" => "Turkey", + "us-es" => "US (Spanish)", + "ua-uk" => "Ukraine", + "uk-en" => "United Kingdom", + "vn-en" => "Vietnam (en)" + ] + ], + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", + "no" => "No" + ] + ], + "date" => [ + "display" => "Time posted", + "option" => [ + "any" => "Any time", + "Day" => "Past day", + "Week" => "Past week", + "Month" => "Past month" + ] + ], + "size" => [ + "display" => "Size", + "option" => [ + "any" => "Any size", + "Small" => "Small", + "Medium" => "Medium", + "Large" => "Large", + "Wallpaper" => "Wallpaper" + ] + ], + "color" => [ + "display" => "Colors", + "option" => 
[ + "any" => "All colors", + "Monochrome" => "Black and white", + "Red" => "Red", + "Orange" => "Orange", + "Yellow" => "Yellow", + "Green" => "Green", + "Blue" => "Blue", + "Purple" => "Purple", + "Pink" => "Pink", + "Brown" => "Brown", + "Black" => "Black", + "Gray" => "Gray", + "Teal" => "Teal", + "White" => "White" + ] + ], + "type" => [ + "display" => "Type", + "option" => [ + "any" => "All types", + "photo" => "Photograph", + "clipart" => "Clipart", + "gif" => "Animated GIF", + "transparent" => "Transparent" + ] + ], + "layout" => [ + "display" => "Layout", + "option" => [ + "any" => "All layouts", + "Square" => "Square", + "Tall" => "Tall", + "Wide" => "Wide" + ] + ], + "license" => [ + "display" => "License", + "option" => [ + "any" => "All licenses", // blame ddg for this + "Any" => "All Creative Commons", + "Public" => "Public domain", + "Share" => "Free to Share and Use", + "ShareCommercially" => "Free to Share and Use Commercially", + "Modify" => "Free to Modify, Share, and Use", + "ModifyCommercially" => "Free to Modify, Share, and Use Commercially" + ] + ] + ]; + break; + + case "videos": + return + [ + "country" => [ + "display" => "Country", + "option" => [ + "us-en" => "US (English)", + "ar-es" => "Argentina", + "au-en" => "Australia", + "at-de" => "Austria", + "be-fr" => "Belgium (fr)", + "be-nl" => "Belgium (nl)", + "br-pt" => "Brazil", + "bg-bg" => "Bulgaria", + "ca-en" => "Canada (en)", + "ca-fr" => "Canada (fr)", + "ct-ca" => "Catalonia", + "cl-es" => "Chile", + "cn-zh" => "China", + "co-es" => "Colombia", + "hr-hr" => "Croatia", + "cz-cs" => "Czech Republic", + "dk-da" => "Denmark", + "ee-et" => "Estonia", + "fi-fi" => "Finland", + "fr-fr" => "France", + "de-de" => "Germany", + "gr-el" => "Greece", + "hk-tzh" => "Hong Kong", + "hu-hu" => "Hungary", + "in-en" => "India (en)", + "id-en" => "Indonesia (en)", + "ie-en" => "Ireland", + "il-en" => "Israel (en)", + "it-it" => "Italy", + "jp-jp" => "Japan", + "kr-kr" => "Korea", + "lv-lv" => 
"Latvia", + "lt-lt" => "Lithuania", + "my-en" => "Malaysia (en)", + "mx-es" => "Mexico", + "nl-nl" => "Netherlands", + "nz-en" => "New Zealand", + "no-no" => "Norway", + "pk-en" => "Pakistan (en)", + "pe-es" => "Peru", + "ph-en" => "Philippines (en)", + "pl-pl" => "Poland", + "pt-pt" => "Portugal", + "ro-ro" => "Romania", + "ru-ru" => "Russia", + "xa-ar" => "Saudi Arabia", + "sg-en" => "Singapore", + "sk-sk" => "Slovakia", + "sl-sl" => "Slovenia", + "za-en" => "South Africa", + "es-ca" => "Spain (ca)", + "es-es" => "Spain (es)", + "se-sv" => "Sweden", + "ch-de" => "Switzerland (de)", + "ch-fr" => "Switzerland (fr)", + "tw-tzh" => "Taiwan", + "th-en" => "Thailand (en)", + "tr-tr" => "Turkey", + "us-en" => "US (English)", + "us-es" => "US (Spanish)", + "ua-uk" => "Ukraine", + "uk-en" => "United Kingdom", + "vn-en" => "Vietnam (en)" + ] + ], + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", + "no" => "No" + ] + ], + "date" => [ + "display" => "Time fetched", + "option" => [ + "any" => "Any time", + "d" => "Past day", + "w" => "Past week", + "m" => "Past month" + ] + ], + "resolution" => [ //videoDefinition + "display" => "Resolution", + "option" => [ + "any" => "Any resolution", + "high" => "High definition", + "standard" => "Standard definition" + ] + ], + "duration" => [ // videoDuration + "display" => "Duration", + "option" => [ + "any" => "Any duration", + "short" => "Short (>5min)", + "medium" => "Medium (5-20min)", + "long" => "Long (<20min)" + ] + ], + "license" => [ + "display" => "License", + "option" => [ + "any" => "Any license", + "creativeCommon" => "Creative Commons", + "youtube" => "YouTube Standard" + ] + ] + ]; + break; + + case "news": + return + [ + "country" => [ + "display" => "Country", + "option" => [ + "us-en" => "US (English)", + "ar-es" => "Argentina", + "au-en" => "Australia", + "at-de" => "Austria", + "be-fr" => "Belgium (fr)", + "be-nl" => "Belgium (nl)", + "br-pt" => "Brazil", + "bg-bg" => "Bulgaria", + "ca-en" => 
"Canada (en)", + "ca-fr" => "Canada (fr)", + "ct-ca" => "Catalonia", + "cl-es" => "Chile", + "cn-zh" => "China", + "co-es" => "Colombia", + "hr-hr" => "Croatia", + "cz-cs" => "Czech Republic", + "dk-da" => "Denmark", + "ee-et" => "Estonia", + "fi-fi" => "Finland", + "fr-fr" => "France", + "de-de" => "Germany", + "gr-el" => "Greece", + "hk-tzh" => "Hong Kong", + "hu-hu" => "Hungary", + "in-en" => "India (en)", + "id-en" => "Indonesia (en)", + "ie-en" => "Ireland", + "il-en" => "Israel (en)", + "it-it" => "Italy", + "jp-jp" => "Japan", + "kr-kr" => "Korea", + "lv-lv" => "Latvia", + "lt-lt" => "Lithuania", + "my-en" => "Malaysia (en)", + "mx-es" => "Mexico", + "nl-nl" => "Netherlands", + "nz-en" => "New Zealand", + "no-no" => "Norway", + "pk-en" => "Pakistan (en)", + "pe-es" => "Peru", + "ph-en" => "Philippines (en)", + "pl-pl" => "Poland", + "pt-pt" => "Portugal", + "ro-ro" => "Romania", + "ru-ru" => "Russia", + "xa-ar" => "Saudi Arabia", + "sg-en" => "Singapore", + "sk-sk" => "Slovakia", + "sl-sl" => "Slovenia", + "za-en" => "South Africa", + "es-ca" => "Spain (ca)", + "es-es" => "Spain (es)", + "se-sv" => "Sweden", + "ch-de" => "Switzerland (de)", + "ch-fr" => "Switzerland (fr)", + "tw-tzh" => "Taiwan", + "th-en" => "Thailand (en)", + "tr-tr" => "Turkey", + "us-en" => "US (English)", + "us-es" => "US (Spanish)", + "ua-uk" => "Ukraine", + "uk-en" => "United Kingdom", + "vn-en" => "Vietnam (en)" + ] + ], + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", + "maybe" => "Maybe", + "no" => "No" + ] + ], + "date" => [ + "display" => "Time posted", + "option" => [ + "any" => "Any time", + "d" => "Past day", + "w" => "Past week", + "m" => "Past month" + ] + ] + ]; + break; + + default: + return []; + break; + } + } + + public function web($get){ + + if($get["npt"]){ + + $jsgrep = $this->nextpage->get($get["npt"], "web"); + + $extendedsearch = false; + $inithtml = ""; + + }else{ + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new 
Exception("Search term is empty!"); + } + + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $older = $get["older"]; + $newer = $get["newer"]; + $extendedsearch = $get["extendedsearch"] == "yes" ? true : false; + + // generate filters + $get_filters = [ + "q" => $search, + "kz" => "1" // force instant answers + ]; + + if($country == "any"){ + + $get_filters["kl"] = "wt-wt"; + }else{ + + $get_filters["kl"] = $country; + } + + switch($nsfw){ + + case "yes": $get_filters["kp"] = "-2"; break; + case "maybe": $get_filters["kp"] = "-1"; break; + case "no": $get_filters["kp"] = "1"; break; + } + + $df = true; + + if($newer === false){ + + if($older !== false){ + + $start = 36000; + $end = $older; + }else{ + + $df = false; + } + }else{ + + $start = $newer; + + if($older !== false){ + + $end = $older; + }else{ + + $end = time(); + } + } + + if($df === true){ + $get_filters["df"] = date("Y-m-d", $start) . ".." . date("Y-m-d", $end); + } + + /* + Get html + */ + // https://duckduckgo.com/?q=minecraft&kz=1&k1=-1&kp=-2 + try{ + $inithtml = $this->get( + "https://duckduckgo.com/", + $get_filters + ); + }catch(Exception $e){ + + throw new Exception("Failed to get html"); + } + + preg_match( + '/DDG\.deep\.initialize\(\'(.*)\',/U', + $inithtml, + $jsgrep + ); + + if(!isset($jsgrep[1])){ + + throw new Exception("Failed to get d.js URL"); + } + + $jsgrep = $jsgrep[1]; + } + + // get javascript + try{ + + $js = $this->get( + "https://links.duckduckgo.com" . 
$jsgrep, + [], + ddg::req_xhr + ); + }catch(Exception $e){ + + throw new Exception("Failed to fetch d.js"); + } + + // initialize api response array + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + /* + Additional requests + */ + + if($extendedsearch){ + + /* + Check for worknik results + */ + preg_match( + '/nrj\(\'\/js\/spice\/dictionary\/definition\/([^\']+)\'\)/', + $js, + $wordnik + ); + + if(isset($wordnik[1])){ + + try{ + + $wordnik = $wordnik[1]; + + // get definition + $wordnikjs = $this->get( + "https://duckduckgo.com/js/spice/dictionary/definition/" . $wordnik, + [], + ddg::req_xhr + ); + + preg_match( + '/ddg_spice_dictionary_definition\(\n?(\[{[\S\s]*}])/', + $wordnikjs, + $wordnikjson + ); + + if(isset($wordnikjson[1])){ + + $wordnikjson = json_decode($wordnikjson[1], true); + + $out["answer"][0] = [ + "title" => urldecode($wordnik), + "description" => [], + "url" => "https://www.wordnik.com/words/" . $wordnik, + "thumb" => null, + "table" => [], + "sublink" => [] + ]; + + $partofspeech = false; + $wastext = false; + $textindent = 1; + + // get audio + + $wordnikaudio_json = + json_decode( + $this->get( + "https://duckduckgo.com/js/spice/dictionary/audio/" . 
$wordnik, + [], + ddg::req_xhr + ), + true + ); + + if(isset($wordnikaudio_json[0]["id"])){ + + usort($wordnikaudio_json, function($a, $b){ + + return $a["id"] < $b["id"]; + }); + + $out["answer"][0]["description"][] = [ + "type" => "audio", + "url" => $wordnikaudio_json[0]["fileUrl"] + ]; + } + + $collection = []; + $e[] = []; + + foreach($wordnikjson as $data){ + + if(!isset($data["partOfSpeech"])){ + + continue; + } + + if(isset($data["text"])){ + + if(!isset($collection[$data["partOfSpeech"]])){ + + $collection[$data["partOfSpeech"]] = []; + $c = 0; + }else{ + $c = count($collection[$data["partOfSpeech"]]); + } + + if(!isset($e[$data["partOfSpeech"]])){ + + $e[$data["partOfSpeech"]] = 0; + } + + $e[$data["partOfSpeech"]]++; + $text = $e[$data["partOfSpeech"]] . ". " . $this->unescapehtml(strip_tags($data["text"])); + + $syn = false; + if( + isset($data["relatedWords"]) && + count($data["relatedWords"]) !== 0 + ){ + + $syn = " ("; + + $u = 0; + foreach($data["relatedWords"] as $related){ + + $syn .= ucfirst($related["relationshipType"]) . ": "; + + $c = count($related["words"]); + $b = 0; + foreach($related["words"] as $word){ + + $syn .= trim($this->unescapehtml(strip_tags($word))); + + $b++; + if($b !== $c){ + + $syn .= ", "; + } + } + + $u++; + if($u !== count($data["relatedWords"])){ + + $syn .= ". "; + } + } + + $syn .= ")"; + } + + if( + $c !== 0 && + $collection[$data["partOfSpeech"]][$c - 1]["type"] == "text" + ){ + $collection[$data["partOfSpeech"]][$c - 1]["value"] .= + "\n" . $text; + + }else{ + + if( + $c !== 0 && + ( + $collection[$data["partOfSpeech"]][$c - 1]["type"] == "text" || + $collection[$data["partOfSpeech"]][$c - 1]["type"] == "italic" + ) + ){ + + $text = "\n" . 
$text; + } + + $collection[$data["partOfSpeech"]][] = + [ + "type" => "text", + "value" => $text + ]; + } + + if($syn){ + + $collection[$data["partOfSpeech"]][] = [ + "type" => "italic", + "value" => $syn + ]; + } + + if(isset($data["exampleUses"])){ + + foreach($data["exampleUses"] as $use){ + + $collection[$data["partOfSpeech"]][] = [ + "type" => "quote", + "value" => $this->unescapehtml(strip_tags($use["text"])) + ]; + } + } + + if(isset($data["citations"])){ + + foreach($data["citations"] as $citation){ + + if(!isset($citation["cite"])){ + + continue; + } + + $value = $this->unescapehtml(strip_tags($citation["cite"])); + + if( + isset($citation["source"]) && + trim($citation["source"]) != "" + ){ + $value .= " - " . $this->unescapehtml(strip_tags($citation["source"])); + } + + $collection[$data["partOfSpeech"]][] = [ + "type" => "quote", + "value" => $value + ]; + } + } + } + } + + foreach($collection as $key => $items){ + + $out["answer"][0]["description"][] = + [ + "type" => "title", + "value" => $key + ]; + + $out["answer"][0]["description"] = + array_merge($out["answer"][0]["description"], $items); + } + } + + }catch(Exception $e){ + + // do nothing + } + } + + unset($wordnik); + + /* + Check for stackoverflow answers + */ + + // /a.js?p=1&src_id=stack_overflow&from=nlp_qa&id=3390396,2559318&q=how%20can%20i%20check%20for%20undefined%20in%20javascript&s=stackoverflow.com&tl=How%20can%20I%20check%20for%20%22undefined%22%20in%20JavaScript%3F%20%2D%20Stack%20Overflow + // /a.js?p=1&src_id=arqade&from=nlp_qa&id=370293,375682&q=what%20is%20the%20difference%20between%20at%20and%20positioned%20in%20execute&s=gaming.stackexchange.com&tl=minecraft%20java%20edition%20minecraft%20commands%20%2D%20What%20is%20the%20difference + // /a.js?p=1&src_id=unix&from=nlp_qa&id=312754&q=how%20to%20strip%20metadata%20from%20image%20files&s=unix.stackexchange.com&tl=How%20to%20strip%20metadata%20from%20image%20files%20%2D%20Unix%20%26%20Linux%20Stack%20Exchange + preg_match( + 
'/nrj\(\'(\/a\.js\?.*from=nlp_qa.*)\'\)/U', + $js, + $stack + ); + + if(isset($stack[1])){ + + $stack = $stack[1]; + + try{ + $stackjs = $this->get( + "https://duckduckgo.com" . $stack, + [], + ddg::req_xhr + ); + + if( + !preg_match( + '/^DDG\.duckbar\.failed/', + $stackjs + ) + ){ + + preg_match( + '/DDG\.duckbar\.add_array\((\[\{[\S\s]*}])\)/U', + $stackjs, + $stackjson + ); + + $stackjson = json_decode($stackjson[1], true)[0]["data"][0]; + + $out["answer"][] = [ + "title" => $stackjson["Heading"], + "description" => $this->htmltoarray($stackjson["Abstract"]), + "url" => str_replace(["http://", "ddg"], ["https://", ""], $stackjson["AbstractURL"]), + "thumb" => null, + "table" => [], + "sublink" => [] + ]; + } + + }catch(Exception $e){ + + // do nothing + } + } + + /* + Check for musicmatch (lyrics) + */ + preg_match( + '/nrj\(\'(\/a\.js\?.*&s=lyrics.*)\'\)/U', + $js, + $lyrics + ); + + if(isset($lyrics[1])){ + + $lyrics = $lyrics[1]; + + try{ + $lyricsjs = $this->get( + "https://duckduckgo.com" . $lyrics, + [], + ddg::req_xhr + ); + + if( + !preg_match( + '/^DDG\.duckbar\.failed/', + $lyricsjs + ) + ){ + + preg_match( + '/DDG\.duckbar\.add_array\((\[\{[\S\s]*}])\)/U', + $lyricsjs, + $lyricsjson + ); + + $lyricsjson = json_decode($lyricsjson[1], true)[0]["data"][0]; + + $title = null; + + if(isset($lyricsjson["Heading"])){ + + $title = $lyricsjson["Heading"]; + }elseif(isset($lyricsjson["data"][1]["urlTitle"])){ + + $title = $lyricsjson["data"][1]["urlTitle"]; + }else{ + + $title = $lyricsjson["data"][0]["song_title"]; + } + + $description = [ + [ + "type" => "text", + "value" => null + ] + ]; + $parts = + explode( + "<br>", + str_ireplace( + ["<br>", "</br>", "<br/>"], + "<br>", + $lyricsjson["Abstract"] + ), + ); + + for($i=0; $i<count($parts); $i++){ + + $description[0]["value"] .= trim($parts[$i]) . "\n"; + } + + $description[0]["value"] = trim($description[0]["value"]); + + $description[] = + [ + "type" => "quote", + "value" => + "Written by " . 
implode(", ", $lyricsjson["data"][0]["writers"]) . + "\nFrom the album " . $lyricsjson["data"][0]["albums"][0]["title"] . + "\nReleased on the " . date("jS \of F Y", strtotime($lyricsjson["data"][0]["albums"][0]["release_date"])) + ]; + + $out["answer"][] = [ + "title" => $title, + "description" => $description, + "url" => $lyricsjson["AbstractURL"], + "thumb" => null, + "table" => [], + "sublink" => [] + ]; + } + + }catch(Exception $e){ + + // do nothing + } + } + } + + /* + Get related searches + */ + preg_match( + '/DDG\.duckbar\.loadModule\(\'related_searches\', ?{[\s\S]*"results":(\[{[\s\S]*}]),"vqd"/U', + $js, + $related + ); + + if(isset($related[1])){ + + try{ + $related = json_decode($related[1], true); + + for($i=0; $i<count($related); $i++){ + + if(isset($related[$i]["text"])){ + + array_push($out["related"], $related[$i]["text"]); + } + } + + }catch(Exception $e){ + + // do nothing + } + } + + unset($related); + + /* + Get answers + */ + $answer_count = preg_match_all( + '/DDG\.duckbar\.add\(({.*[\S\s]*})(?:\);|,null,"index"\))/U', + $js . 
$inithtml, + $answers + ); + + try{ + + if(isset($answers[1])){ + + $answers = $answers[1]; + + for($i=0; $i<$answer_count; $i++){ + + $answers[$i] = json_decode($answers[$i], true); + + // remove dupes + for($k=0; $k<count($out["answer"]); $k++){ + + if( + !isset($answers[$i]["data"]["AbstractURL"]) || + str_replace("_", "%20", $out["answer"][$k]["url"]) == str_replace("_", "%20", $this->sanitizeurl($answers[$i]["data"]["AbstractURL"])) + ){ + + continue 2; + } + } + + // get more related queries + if( + isset($answers[$i]["data"]["RelatedTopics"]) && + $answers[$i]["data"]["RelatedTopics"] != 0 + ){ + + for($k=0; $k<count($answers[$i]["data"]["RelatedTopics"]); $k++){ + + if(isset($answers[$i]["data"]["RelatedTopics"][$k]["Result"])){ + + preg_match( + '/">(.*)<\//', + $answers[$i]["data"]["RelatedTopics"][$k]["Result"], + $label + ); + + array_push($out["related"], htmlspecialchars_decode(strip_tags($label[1]))); + } + } + } + + $image = null; + + // get image + if( + isset($answers[$i]["data"]["Image"]) && + !empty($answers[$i]["data"]["Image"]) && + $answers[$i]["data"]["Image"] != "https://duckduckgo.com/i/" + ){ + if(strpos($answers[$i]["data"]["Image"], "https://duckduckgo.com/i/") === true){ + + $image = $answers[$i]["data"]["Image"]; + }else{ + + if( + strlen($answers[$i]["data"]["Image"]) > 0 && + $answers[$i]["data"]["Image"][0] == "/" + ){ + + $answers[$i]["data"]["Image"] = substr($answers[$i]["data"]["Image"], 1); + } + + $image = "https://duckduckgo.com/" . 
$answers[$i]["data"]["Image"]; + } + } + + $count = count($out["answer"]); + + if(isset($answers[$i]["data"]["AbstractText"]) && !empty($answers[$i]["data"]["AbstractText"])){ + + $description = $this->htmltoarray($answers[$i]["data"]["AbstractText"]); + }elseif(isset($answers[$i]["data"]["Abstract"]) && !empty($answers[$i]["data"]["Abstract"])){ + + $description = $this->htmltoarray($answers[$i]["data"]["Abstract"]); + }elseif(isset($answers[$i]["data"]["Answer"]) && !empty($answers[$i]["data"]["Answer"])){ + + $description = $this->htmltoarray($answers[$i]["data"]["Answer"]); + }else{ + + $description = []; + } + + if(isset($answers[$i]["data"]["Heading"]) && !empty($answers[$i]["data"]["Heading"])){ + + $title = $this->unescapehtml($answers[$i]["data"]["Heading"]); + }else{ + + // no title, ignore bs + continue; + //$title = null; + } + + if(isset($answers[$i]["data"]["AbstractURL"]) && !empty($answers[$i]["data"]["AbstractURL"])){ + + $url = $answers[$i]["data"]["AbstractURL"]; + }else{ + + $url = null; + } + + $out["answer"][$count] = [ + "title" => $title, + "description" => $description, + "url" => $this->sanitizeurl($url), + "thumb" => $image, + "table" => [], + "sublink" => [] + ]; + + if(isset($answers[$i]["data"]["Infobox"]["content"])){ + + for($k=0; $k<count($answers[$i]["data"]["Infobox"]["content"]); $k++){ + + // populate table + if($answers[$i]["data"]["Infobox"]["content"][$k]["data_type"] == "string"){ + + $out["answer"][$count]["table"][$answers[$i]["data"]["Infobox"]["content"][$k]["label"]] = + $answers[$i]["data"]["Infobox"]["content"][$k]["value"]; + continue; + } + + $url = ""; + $type = "Website"; + + switch($answers[$i]["data"]["Infobox"]["content"][$k]["data_type"]){ + case "official_site": + case "official_website": + $type = "Website"; + break; + + case "wikipedia": $type = "Wikipedia"; break; + case "itunes": $type = "iTunes"; break; + case "amazon": $type = "Amazon"; break; + + case "imdb_title_id": + case "imdb_id": + case 
"imdb_name_id": + $type = "IMDb"; + $delim = substr($answers[$i]["data"]["Infobox"]["content"][$k]["value"], 0, 2); + + if($delim == "nm"){ + + $url = "https://www.imdb.com/name/"; + }elseif($delim == "tt"){ + + $url = "https://www.imdb.com/title/"; + }elseif($delim == "co"){ + + $url = "https://www.imdb.com/search/title/?companies="; + }else{ + + $url = "https://www.imdb.com/title/"; + } + break; + + case "imdb_name_id": $url = "https://www.imdb.com/name/"; $type = "IMDb"; break; + case "twitter_profile": $url = "https://twitter.com/"; $type = "Twitter"; break; + case "instagram_profile": $url = "https://instagram.com/"; $type = "Instagram"; break; + case "facebook_profile": $url = "https://facebook.com/"; $type = "Facebook"; break; + case "spotify_artist_id": $url = "https://open.spotify.com/artist/"; $type = "Spotify"; break; + case "rotten_tomatoes": $url = "https://rottentomatoes.com/"; $type = "Rotten Tomatoes"; break; + case "youtube_channel": $url = "https://youtube.com/channel/"; $type = "YouTube"; break; + case "soundcloud_id": $url = "https://soundcloud.com/"; $type = "SoundCloud"; break; + + default: + continue 2; + } + + // populate sublinks + $out["answer"][$count]["sublink"][$type] = + $url . 
$answers[$i]["data"]["Infobox"]["content"][$k]["value"]; + } + } + } + } + + }catch(Exception $e){ + + // do nothing + } + + /* + Get shitcoin conversions + */ + if($extendedsearch){ + if( + preg_match( + '/"https?:\/\/(?:www\.coinbase\.com\/converter\/([a-z0-9]+)\/([a-z0-9]+)|changelly\.com\/exchange\/([a-z0-9]+)\/([a-z0-9]+)|coinmarketcap\.com\/currencies\/[a-z0-9]+\/([a-z0-9]+)\/([a-z0-9]+))\/?"/', + $js, + $shitcoins + ) + ){ + + $shitcoins = array_values(array_filter($shitcoins)); + + preg_match( + '/(?:[\s,.]*[0-9]+)+/', + $search, + $amount + ); + + if(count($amount) === 1){ + + $amount = (float)str_replace([" ", ","], ["", "."], $amount[0]); + }else{ + + $amount = 1; + } + + try{ + + $description = []; + + $shitcoinjs = $this->get( + "https://duckduckgo.com/js/spice/cryptocurrency/{$shitcoins[1]}/{$shitcoins[2]}/1", + [], + ddg::req_xhr + ); + + preg_match( + '/ddg_spice_cryptocurrency\(\s*({[\S\s]*})\s*\);/', + $shitcoinjs, + $shitcoinjson + ); + + $shitcoinjson = json_decode($shitcoinjson[1], true); + + if( + !isset($shitcoinjson["error"]) && + $shitcoinjson["status"]["error_code"] == 0 + ){ + + $shitcoinjson = $shitcoinjson["data"]; + $array_values = array_values($shitcoinjson["quote"])[0]; + + if($amount != 1){ + + // show conversion + $description[] = [ + "type" => "title", + "value" => "Conversion" + ]; + + $description[] = [ + "type" => "text", + "value" => + "{$amount} {$shitcoinjson["name"]} ({$shitcoinjson["symbol"]}) = " . $this->number_format($array_values["price"] * $amount) . " " . strtoupper($shitcoins[2]) . "\n" . + "{$amount} " . strtoupper($shitcoins[2]) . " = " . $this->number_format((1 / $array_values["price"]) * $amount) . " {$shitcoinjson["symbol"]}" + ]; + } + + $description[] = [ + "type" => "title", + "value" => "Current rates" + ]; + + // rates + $description[] = [ + "type" => "text", + "value" => + "1 {$shitcoinjson["name"]} ({$shitcoinjson["symbol"]}) = " . $this->number_format($array_values["price"]) . " " . 
strtoupper($shitcoins[2]) . "\n" . + "1 " . strtoupper($shitcoins[2]) . " = " . $this->number_format(1 / $array_values["price"]) . " {$shitcoinjson["symbol"]}" + ]; + + $description[] = [ + "type" => "quote", + "value" => "Last fetched: " . date("jS \of F Y @ g:ia", strtotime($shitcoinjson["last_updated"])) + ]; + + $out["answer"][] = [ + "title" => $shitcoinjson["name"] . " (" . strtoupper($shitcoins[1]) . ") & " . strtoupper($shitcoins[2]) . " market", + "description" => $description, + "url" => "https://coinmarketcap.com/converter/" . strtoupper($shitcoins[1]) . "/" . strtoupper($shitcoins[2]) . "/?amt={$amount}", + "thumb" => null, + "table" => [], + "sublink" => [] + ]; + } + + }catch(Exception $e){ + + // do nothing + } + }else{ + + /* + Get currency conversion + */ + if( + preg_match( + '/"https:\/\/www\.xe\.com\/currencyconverter\/convert\/\?From=([A-Z0-9]+)&To=([A-Z0-9]+)"/', + $js, + $currencies + ) + ){ + + preg_match( + '/(?:[\s,.]*[0-9]+)+/', + $search, + $amount + ); + + if(count($amount) === 1){ + + $amount = (float)str_replace([" ", ","], ["", "."], $amount[0]); + }else{ + + $amount = 1; + } + + try{ + $currencyjs = $this->get( + "https://duckduckgo.com/js/spice/currency/{$amount}/" . strtolower($currencies[1]) . "/" . strtolower($currencies[2]), + [], + ddg::req_xhr + ); + + preg_match( + '/ddg_spice_currency\(\s*({[\S\s]*})\s*\);/', + $currencyjs, + $currencyjson + ); + + $currencyjson = json_decode($currencyjson[1], true); + + if(empty($currencyjson["headers"]["description"])){ + + $currencyjson = $currencyjson["conversion"]; + $description = []; + + if($amount != 1){ + + $description[] = + [ + "type" => "title", + "value" => "Conversion" + ]; + + $description[] = + [ + "type" => "text", + "value" => + $this->number_format($currencyjson["from-amount"]) . " {$currencyjson["from-currency-symbol"]} = " . + $this->number_format($currencyjson["converted-amount"]) . 
" {$currencyjson["to-currency-symbol"]}" + ]; + } + + $description[] = + [ + "type" => "title", + "value" => "Current rates" + ]; + + $description[] = + [ + "type" => "text", + "value" => + "{$currencyjson["conversion-rate"]}\n" . + "{$currencyjson["conversion-inverse"]}" + ]; + + $description[] = + [ + "type" => "quote", + "value" => "Last fetched: " . date("jS \of F Y @ g:ia", strtotime($currencyjson["rate-utc-timestamp"])) + ]; + + $out["answer"][] = [ + "title" => + "{$currencyjson["from-currency-name"]} ({$currencyjson["from-currency-symbol"]}) to " . + "{$currencyjson["to-currency-name"]} ({$currencyjson["to-currency-symbol"]})", + "description" => $description, + "url" => "https://www.xe.com/currencyconverter/convert/?Amount={$amount}&From={$currencies[1]}&To={$currencies[2]}", + "thumb" => null, + "table" => [], + "sublink" => [] + ]; + } + + }catch(Exception $e){ + + // do nothing + } + } + } + } + + /* + Get small answer + */ + preg_match( + '/DDG\.ready\(function ?\(\) ?{DDH\.add\(({[\S\s]+}),"index"\)}\)/U', + $inithtml, + $smallanswer + ); + + if(isset($smallanswer[1])){ + + $smallanswer = json_decode($smallanswer[1], true); + + if( + !isset($smallanswer["require"]) && + isset($smallanswer["data"]["title"]) + ){ + + if(isset($smallanswer["data"]["url"])){ + + $url = $this->unescapehtml($smallanswer["data"]["url"]); + }elseif(isset($smallanswer["meta"]["sourceUrl"])){ + + $url = $this->unescapehtml($smallanswer["meta"]["sourceUrl"]); + }else{ + + $url = null; + } + + $out["answer"] = [ + [ + "title" => $this->unescapehtml($smallanswer["data"]["title"]), + "description" => [], + "url" => $this->sanitizeurl($url), + "thumb" => null, + "table" => [], + "sublink" => [] + ], + ...$out["answer"] + ]; + + if(isset($smallanswer["data"]["subtitle"])){ + + $out["answer"][0]["description"][] = + [ + "type" => "text", + "value" => isset($smallanswer["data"]["subtitle"]) ? 
$this->unescapehtml($smallanswer["data"]["subtitle"]) : null + ]; + } + } + } + + unset($inithtml); + unset($answers); + unset($answer_count); + + /* + Get spelling autocorrect + */ + + preg_match( + '/DDG\.page\.showMessage\(\'spelling\',({[\S\s]+})\)/U', + $js, + $spelling + ); + + if(isset($spelling[1])){ + + $spelling = json_decode($spelling[1], true); + + switch((int)$spelling["qc"]){ + + case 1: + case 3: + case 5: + $type = "including"; + break; + + default: + $type = "not_many"; + break; + } + + $out["spelling"] = [ + "type" => $type, + "using" => $this->unescapehtml(strip_tags($spelling["suggestion"])), + "correction" => $this->unescapehtml(strip_tags($spelling["recourseText"])) + ]; + } + + unset($spelling); + + /* + Get web results + */ + preg_match( + '/DDG\.pageLayout\.load\(\'d\', ?(\[{"[\S\s]*"}])\)/U', + $js, + $web + ); + + if(isset($web[1])){ + + try{ + $web = json_decode($web[1], true); + + for($i=0; $i<count($web); $i++){ + + // ignore google placeholder + fake next page + if( + isset($web[$i]["t"]) && + ( + $web[$i]["t"] == "EOP" || + $web[$i]["t"] == "EOF" + ) && + strpos($web[$i]["c"], "://www.google.") !== false + ){ + + break; + } + + // store next page token + if(isset($web[$i]["n"])){ + + $out["npt"] = $this->nextpage->store($web[$i]["n"] . "&biaexp=b&eslexp=a&litexp=c&msvrtexp=b&wrap=1", "web"); + continue; + } + + // ignore malformed data + if(!isset($web[$i]["t"])){ + + continue; + } + + $sublinks = []; + + if(isset($web[$i]["l"])){ + + for($k=0; $k<count($web[$i]["l"]); $k++){ + + if( + !isset($web[$i]["l"][$k]["targetUrl"]) || + !isset($web[$i]["l"][$k]["text"]) + ){ + + continue; + } + + array_push( + $sublinks, + [ + "title" => $this->titledots($this->unescapehtml($web[$i]["l"][$k]["text"])), + "date" => null, + "description" => isset($web[$i]["l"][$k]["snippet"]) ? 
$this->titledots($this->unescapehtml($web[$i]["l"][$k]["snippet"])) : null, + "url" => $this->sanitizeurl($web[$i]["l"][$k]["targetUrl"]) + ] + ); + } + } + + if( + preg_match( + '/^<span class="result__type">PDF<\/span>/', + $web[$i]["t"] + ) + ){ + + $type = "pdf"; + $web[$i]["t"] = + str_replace( + '<span class="result__type">PDF</span>', + "", + $web[$i]["t"] + ); + }else{ + + $type = "web"; + } + + if(isset($web[$i]["e"])){ + + $date = strtotime($web[$i]["e"]); + }else{ + + $date = null; + } + + array_push( + $out["web"], + [ + "title" => $this->titledots($this->unescapehtml(strip_tags($web[$i]["t"]))), + "description" => $this->titledots($this->unescapehtml(strip_tags($web[$i]["a"]))), + "url" => isset($web[$i]["u"]) ? $this->sanitizeurl($web[$i]["u"]) : $this->sanitizeurl($web[$i]["c"]), + "date" => $date, + "type" => $type, + "thumb" => + [ + "url" => null, + "ratio" => null + ], + "sublink" => $sublinks, + "table" => [] + ] + ); + } + + }catch(Exception $e){ + + // do nothing + } + } + + unset($web); + + /* + Get images + */ + preg_match( + '/DDG\.duckbar\.load\(\'images\', ?{[\s\S]*"results":(\[{"[\s\S]*}]),"vqd"/U', + $js, + $images + ); + + if(isset($images[1])){ + + try{ + $images = json_decode($images[1], true); + + for($i=0; $i<count($images); $i++){ + + if( + !isset($images[$i]["title"]) || + !isset($images[$i]["image"]) || + !isset($images[$i]["thumbnail"]) || + !isset($images[$i]["width"]) || + !isset($images[$i]["height"]) + ){ + + continue; + } + + $ratio = + $this->bingratio( + (int)$images[$i]["width"], + (int)$images[$i]["height"] + ); + + array_push( + $out["image"], + [ + "title" => $this->titledots($this->unescapehtml($images[$i]["title"])), + "source" => [ + [ + "url" => $images[$i]["image"], + "width" => (int)$images[$i]["width"], + "height" => (int)$images[$i]["height"] + ], + [ + "url" => $this->bingimg($images[$i]["thumbnail"]), + "width" => $ratio[0], + "height" => $ratio[1] + ] + ], + "url" => $this->sanitizeurl($images[$i]["url"]) 
+ ] + ); + } + + }catch(Exception $e){ + + // do nothing + } + } + + unset($images); + + /* + Get videos + */ + preg_match( + '/DDG\.duckbar\.load\(\'videos\', ?{[\s\S]*"results":(\[{"[\s\S]*}]),"vqd"/U', + $js, + $videos + ); + + if(isset($videos[1])){ + try{ + $videos = json_decode($videos[1], true); + + for($i=0; $i<count($videos); $i++){ + + $cachekey = false; + + foreach(["large", "medium", "small"] as &$key){ + + if(isset($videos[$i]["images"][$key])){ + + $cachekey = $key; + break; + } + } + + if( + !isset($videos[$i]["title"]) || + !isset($videos[$i]["description"]) || + $cachekey === false || + !isset($videos[$i]["content"]) + ){ + + continue; + } + + array_push( + $out["video"], + [ + "title" => $this->titledots($this->unescapehtml($videos[$i]["title"])), + "description" => $videos[$i]["description"] == "" ? null : $this->titledots($this->unescapehtml($videos[$i]["description"])), + "date" => $videos[$i]["published"] == "" ? null : strtotime($videos[$i]["published"]), + "duration" => $videos[$i]["duration"] == 0 ? null : $this->hmstoseconds($videos[$i]["duration"]), + "views" => $videos[$i]["statistics"]["viewCount"] == 0 ? 
null : $videos[$i]["statistics"]["viewCount"], + "thumb" => + [ + "url" => $this->bingimg($videos[$i]["images"][$cachekey]), + "ratio" => "16:9" + ], + "url" => $this->sanitizeurl($videos[$i]["content"]) + ] + ); + } + + }catch(Exception $e){ + + // do nothing + } + } + + unset($videos); + + /* + Get news + */ + preg_match( + '/DDG\.duckbar\.load\(\'news\', ?{[\s\S]*"results":(\[{"[\s\S]*}]),"vqd"/U', + $js, + $news + ); + + if(isset($news[1])){ + try{ + $news = json_decode($news[1], true); + + for($i=0; $i<count($news); $i++){ + + if( + !isset($news[$i]["title"]) || + !isset($news[$i]["excerpt"]) || + !isset($news[$i]["url"]) + ){ + + continue; + } + + array_push( + $out["news"], + [ + "title" => $this->titledots($this->unescapehtml($news[$i]["title"])), + "description" => $this->titledots($this->unescapehtml(strip_tags($news[$i]["excerpt"]))), + "date" => isset($news[$i]["date"]) ? (int)$news[$i]["date"] : null, + "thumb" => + [ + "url" => isset($news[$i]["image"]) ? $news[$i]["image"] : null, + "ratio" => "16:9" + ], + "url" => $this->sanitizeurl($news[$i]["url"]) + ] + ); + } + + }catch(Exception $e){ + + // do nothing + } + } + + return $out; + } + + public function image($get){ + + if($get["npt"]){ + + $npt = $this->nextpage->get($get["npt"], "images"); + + try{ + $json = json_decode($this->get( + "https://duckduckgo.com/i.js?" . 
$npt, + [], + ddg::req_xhr + ), true); + + }catch(Exception $err){ + + throw new Exception("Failed to get i.js"); + } + + }else{ + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $date = $get["date"]; + $size = $get["size"]; + $color = $get["color"]; + $type = $get["type"]; + $layout = $get["layout"]; + $license = $get["license"]; + + $filter = []; + $get_filters = [ + "q" => $search, + "iax" => "images", + "ia" => "images" + ]; + + if($date != "any"){ $filter[] = "time:$date"; } + if($size != "any"){ $filter[] = "size:$size"; } + if($color != "any"){ $filter[] = "color:$color"; } + if($type != "any"){ $filter[] = "type:$type"; } + if($layout != "any"){ $filter[] = "layout:$layout"; } + if($license != "any"){ $filter[] = "license:$license"; } + + $filter = implode(",", $filter); + + if($filter != ""){ + + $get_filters["iaf"] = $filter; + } + + switch($nsfw){ + + case "yes": $get_filters["kp"] = "-2"; break; + case "no": $get_filters["kp"] = "-1"; break; + } + + try{ + + $html = $this->get( + "https://duckduckgo.com", + $get_filters, + ddg::req_web + ); + }catch(Exception $err){ + + throw new Exception("Failed to get html"); + } + + preg_match( + '/vqd=([0-9-]+)/', + $html, + $vqd + ); + + if(!isset($vqd[1])){ + + throw new Exception("Failed to get vqd token"); + } + + $vqd = $vqd[1]; + + // @TODO: s param = image offset + $js_params = [ + "l" => $country, + "o" => "json", + "q" => $search, + "vqd" => $vqd + ]; + + switch($nsfw){ + + case "yes": $js_params["p"] = "-1"; break; + case "no": $js_params["p"] = "1"; break; + } + + if(empty($filter)){ + + $js_params["f"] = "1"; + }else{ + + $js_params["f"] = $filter; + } + + try{ + $json = json_decode($this->get( + "https://duckduckgo.com/i.js", + $js_params, + ddg::req_xhr + ), true); + + }catch(Exception $err){ + + throw new Exception("Failed to get i.js"); + } + } + + $out = [ + "status" => "ok", + "npt" => 
null, + "image" => [] + ]; + + if(isset($json["next"])){ + + if(!isset($vqd)){ + + $vqd = array_values($json["vqd"])[0]; + } + + $out["npt"] = + $this->nextpage->store( + explode("?", $json["next"])[1] . "&vqd=" . + $vqd, + "images" + ); + } + + for($i=0; $i<count($json["results"]); $i++){ + + $bingimg = $this->bingimg($json["results"][$i]["thumbnail"]); + $ratio = + $this->bingratio( + (int)$json["results"][$i]["width"], + (int)$json["results"][$i]["height"] + ); + + $out["image"][] = [ + "title" => $this->titledots($this->unescapehtml($json["results"][$i]["title"])), + "source" => [ + [ + "url" => $json["results"][$i]["image"], + "width" => (int)$json["results"][$i]["width"], + "height" => (int)$json["results"][$i]["height"] + ], + [ + "url" => $bingimg, + "width" => $ratio[0], + "height" => $ratio[1], + ] + ], + "url" => $this->sanitizeurl($json["results"][$i]["url"]) + ]; + } + + return $out; + } + + public function video($get){ + + if($get["npt"]){ + + $npt = $this->nextpage->get($get["npt"], "videos"); + + try{ + $json = json_decode($this->get( + "https://duckduckgo.com/v.js?" . 
+ $npt, + [], + ddg::req_xhr + ), true); + + }catch(Exception $err){ + + throw new Exception("Failed to get v.js"); + } + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $date = $get["date"]; + $resolution = $get["resolution"]; + $duration = $get["duration"]; + $license = $get["license"]; + + $filter = []; + + $get_filters = [ + "q" => $search, + "iax" => "videos", + "ia" => "videos" + ]; + + switch($nsfw){ + + case "yes": $get_filters["kp"] = "-2"; break; + case "no": $get_filters["kp"] = "-1"; break; + } + + if($date != "any"){ $filter[] = "publishedAfter:{$date}"; } + if($resolution != "any"){ $filter[] = "videoDefinition:{$resolution}"; } + if($duration != "any"){ $filter[] = "videoDuration:{$duration}"; } + if($license != "any"){ $filter[] = "videoLicense:{$license}"; } + + $filter = implode(",", $filter); + + try{ + + $html = $this->get( + "https://duckduckgo.com", + $get_filters, + ddg::req_web + ); + }catch(Exception $err){ + + throw new Exception("Failed to get html"); + } + + preg_match( + '/vqd=([0-9-]+)/', + $html, + $vqd + ); + + if(!isset($vqd[1])){ + + throw new Exception("Failed to get vqd token"); + } + + $vqd = $vqd[1]; + + try{ + $json = json_decode($this->get( + "https://duckduckgo.com/v.js", + [ + "l" => "us-en", + "o" => "json", + "sr" => 1, + "q" => $search, + "vqd" => $vqd, + "f" => $filter, + "p" => $get_filters["kp"] + ], + ddg::req_xhr + ), true); + + }catch(Exception $err){ + + throw new Exception("Failed to get v.js"); + } + } + + $out = [ + "status" => "ok", + "npt" => null, + "video" => [], + "author" => [], + "livestream" => [], + "playlist" => [], + "reel" => [] + ]; + + if(isset($json["next"])){ + + $out["npt"] = + $this->nextpage->store( + explode("?", $json["next"])[1], + "videos" + ); + } + + for($i=0; $i<count($json["results"]); $i++){ + + $cachekey = false; + + foreach(["large", "medium", "small"] as 
&$key){ + + if(isset($json["results"][$i]["images"][$key])){ + + $cachekey = $key; + break; + } + } + + if( + !isset($json["results"][$i]["title"]) || + !isset($json["results"][$i]["description"]) || + $cachekey === false || + !isset($json["results"][$i]["content"]) + ){ + + continue; + } + + array_push( + $out["video"], + [ + "title" => $this->titledots($this->unescapehtml($json["results"][$i]["title"])), + "description" => $json["results"][$i]["description"] == "" ? null : $this->titledots($this->unescapehtml($json["results"][$i]["description"])), + "author" => [ + "name" => empty($json["results"][$i]["uploader"]) ? null : $this->unescapehtml($json["results"][$i]["uploader"]), + "url" => null, + "avatar" => null + ], + "date" => $json["results"][$i]["published"] == "" ? null : strtotime($json["results"][$i]["published"]), + "duration" => $json["results"][$i]["duration"] == 0 ? null : $this->hmstoseconds($json["results"][$i]["duration"]), + "views" => $json["results"][$i]["statistics"]["viewCount"] == 0 ? null : $json["results"][$i]["statistics"]["viewCount"], + "thumb" => [ + "url" => $this->bingimg($json["results"][$i]["images"][$cachekey]), + "ratio" => "16:9" + ], + "url" => $this->sanitizeurl($json["results"][$i]["content"]) + ] + ); + } + + return $out; + } + + public function news($get){ + + if($get["npt"]){ + + $req = $this->nextpage->get($get["npt"], "news"); + + try{ + + $json = json_decode($this->get( + "https://duckduckgo.com/news.js?" . 
+ $req, + [], + ddg::req_xhr + ), true); + + }catch(Exception $err){ + + throw new Exception("Failed to get news.js"); + } + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $date = $get["date"]; + + $get_params = [ + "q" => $search, + "iar" => "news", + "ia" => "news" + ]; + + switch($nsfw){ + + case "yes": $get_filters["kp"] = "-2"; break; + case "maybe": $get_filters["kp"] = "-1"; break; + case "no": $get_filters["kp"] = "1"; break; + } + + if($date != "any"){ + + $get_params["df"] = $date; + } + + try{ + + $html = $this->get( + "https://duckduckgo.com", + $get_params, + ddg::req_web + ); + }catch(Exception $err){ + + throw new Exception("Failed to get html"); + } + + preg_match( + '/vqd=([0-9-]+)/', + $html, + $vqd + ); + + if(!isset($vqd[1])){ + + throw new Exception("Failed to get vqd token"); + } + + $vqd = $vqd[1]; + + try{ + + $js_params = [ + "l" => $country, + "o" => "json", + "noamp" => "1", + "q" => $search, + "vqd" => $vqd, + "p" => $get_filters["kp"] + ]; + + if($date != "any"){ + + $js_params["df"] = $date; + }else{ + + $js_params["df"] = ""; + } + + $json = json_decode($this->get( + "https://duckduckgo.com/news.js", + $js_params, + ddg::req_xhr + ), true); + + }catch(Exception $err){ + + throw new Exception("Failed to get news.js"); + } + } + + $out = [ + "status" => "ok", + "npt" => null, + "news" => [] + ]; + + if(isset($json["next"])){ + + $out["npt"] = + $this->nextpage->store( + explode("?", $json["next"])[1], + "news" + ); + } + + for($i=0; $i<count($json["results"]); $i++){ + + $out["news"][] = [ + "title" => $this->titledots($this->unescapehtml($json["results"][$i]["title"])), + "author" => $this->unescapehtml($json["results"][$i]["source"]), + "description" => $this->titledots($this->unescapehtml(strip_tags($json["results"][$i]["excerpt"]))), + "date" => $json["results"][$i]["date"], + "thumb" => + [ + "url" => 
isset($json["results"][$i]["image"]) ? $json["results"][$i]["image"] : null, + "ratio" => "16:9" + ], + "url" => $this->sanitizeurl($json["results"][$i]["url"]) + ]; + } + + return $out; + } + + private function hmstoseconds($time){ + + $parts = explode(":", $time, 3); + $time = 0; + + if(count($parts) === 3){ + + // hours + $time = $time + ((int)$parts[0] * 3600); + array_shift($parts); + } + + if(count($parts) === 2){ + + // minutes + $time = $time + ((int)$parts[0] * 60); + array_shift($parts); + } + + // seconds + $time = $time + (int)$parts[0]; + + return $time; + } + + private function titledots($title){ + + $substr = substr($title, -3); + + if( + $substr == "..." || + $substr == "…" + ){ + + return trim(substr($title, 0, -3)); + } + + return trim($title); + } + + private function unescapehtml($str){ + + return html_entity_decode( + str_replace( + [ + "<br>", + "<br/>", + "</br>", + "<BR>", + "<BR/>", + "</BR>", + ], + "\n", + $str + ), + ENT_QUOTES | ENT_XML1, 'UTF-8' + ); + } + + private function bingimg($url){ + + $parse = parse_url($url); + parse_str($parse["query"], $parts); + + return "https://" . $parse["host"] . "/th?id=" . urlencode($parts["id"]); + } + + private function htmltoarray($html){ + + $html = strip_tags($html, ["img", "pre", "code", "br", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "a"]); + + libxml_use_internal_errors(true); + $dom = new DOMDocument("1.0", "utf-8"); + $dom->loadHTML('<div>' . $html . 
'</div>'); + $xpath = new DOMXPath($dom); + $descendants = $xpath->query('//div/node()'); + + $images = $xpath->query('//div/node()/img'); + $imageiterator = 0; + + if(count($descendants) === 0){ + + return [ + "type" => "text", + "value" => $this->unescapehtml($html) + ]; + } + + $array = []; + $previoustype = null; + + foreach($descendants as $node){ + + // $node->nodeValue = iconv("UTF-8", "ISO-8859-1//TRANSLIT", $node->nodeValue); + + // get node type + switch($node->nodeName){ + case "#text": + $type = "text"; + break; + + case "pre": + $type = "code"; + break; + + case "code": + $type = "inline_code"; + break; + + case "h1": + case "h2": + case "h3": + case "h4": + case "h5": + case "h6": + $type = "title"; + break; + + case "blockquote": + $type = "quote"; + break; + + case "a": + $type = "link"; + break; + + case "img": + $type = "image"; + break; + } + + // add node to array + switch($type){ + + case "text": + $value = preg_replace( + '/ {2,}/', + " ", + $this->limitnewlines($this->unescapehtml($node->textContent)) + ); + + if( + $previoustype == "quote" || + $previoustype === null || + $previoustype == "image" || + $previoustype == "title" || + $previoustype == "code" + ){ + + $value = ltrim($value); + } + + if($value == ""){ + + $previoustype = $type; + continue 2; + } + + // merge with previous text node + if($previoustype == "text"){ + + $array[count($array) - 1]["value"] = trim($array[count($array) - 1]["value"]) . "\n" . 
$this->bstoutf8($value); + }else{ + + $array[] = [ + "type" => "text", + "value" => $this->bstoutf8($value) + ]; + } + break; + + case "inline_code": + case "bold": + $array[] = [ + "type" => "inline_code", + "value" => $this->bstoutf8(trim($this->limitnewlines($this->unescapehtml($node->textContent)))) + ]; + break; + + case "link": + // check for link nested inside of image + + if(strlen($node->childNodes->item(0)->textContent) !== 0){ + + $array[] = [ + "type" => "link", + "value" => $this->bstoutf8(trim($this->unescapehtml($node->textContent))), + "url" => $this->bstoutf8(preg_replace('/\/ddg$/', "", preg_replace('/^http:\/\//', "https://", $this->sanitizeurl($node->getAttribute("href"))))) + ]; + break; + } + + $type = "image"; + + if($previoustype == "text"){ + + $array[count($array) - 1]["value"] = rtrim($array[count($array) - 1]["value"]); + } + + $array[] = [ + "type" => "image", + "url" => $this->bstoutf8(preg_replace('/^http:\/\//', "https://", preg_replace('/^\/\/images\.duckduckgo\.com\/iu\/\?u=/', "", $images->item($imageiterator)->getAttribute("src")))) + ]; + + $imageiterator++; + + break; + + case "image": + + if($previoustype == "text"){ + + $array[count($array) - 1]["value"] = rtrim($array[count($array) - 1]["value"]); + } + + $array[] = [ + "type" => "image", + "url" => $this->bstoutf8(preg_replace('/^http:\/\//', "https://", preg_replace('/^\/\/images\.duckduckgo\.com\/iu\/\?u=/', "", $node->getAttribute("src")))) + ]; + break; + + case "quote": + case "title": + case "code": + if($previoustype == "text"){ + + $array[count($array) - 1]["value"] = rtrim($array[count($array) - 1]["value"]); + } + // no break + + default: + + $value = trim($this->limitnewlines($this->unescapehtml($node->textContent))); + if($type != "code"){ + + $value = preg_replace( + '/ {2,}/', + " ", + $value + ); + } + + $array[] = [ + "type" => $type, + "value" => $this->bstoutf8($value) + ]; + break; + } + + $previoustype = $type; + } + + return $array; + } + + private 
function bstoutf8($bs){ + + /* converts the UTF-8 input to ISO-8859-1, transliterating where possible; NOTE(review): presumably undoes double-encoding introduced by the DOM parser - confirm */ + return iconv("UTF-8", "ISO-8859-1//TRANSLIT", $bs); + } + + /* Collapses every run of two or more line breaks (each optionally followed by spaces) into exactly one blank line and returns the result. */ + private function limitnewlines($text){ + + /* preg_replace() returns the new string and leaves its subject untouched, so the result has to be returned; on a PCRE error (null) the input is returned unchanged */ + return preg_replace( + '/(?:[\n\r] *){2,}/m', + "\n\n", + $text + ) ?? $text; + } + + /* Strips eBay tracking parameters from the query string of eBay URLs; any other URL is returned unchanged. */ + private function sanitizeurl($url){ + + // check for domains w/out first short subdomain (ex: www.) + + /* parse_url() yields null when the URL has no host; cast so preg_replace() always receives a string */ + $domain = (string)parse_url($url, PHP_URL_HOST); + + $subdomain = preg_replace( + '/^[A-Za-z0-9]{1,3}\./', + "", + $domain + ); + + switch($subdomain){ + case "ebay.com.au": + case "ebay.at": + case "ebay.ca": + case "ebay.fr": + case "ebay.de": + case "ebay.com.hk": + case "ebay.ie": + case "ebay.it": + case "ebay.com.my": + case "ebay.nl": + case "ebay.ph": + case "ebay.pl": + case "ebay.com.sg": + case "ebay.es": + case "ebay.ch": + case "ebay.co.uk": + case "cafr.ebay.ca": + case "ebay.com": + case "community.ebay.com": + case "pages.ebay.com": + + // remove ebay tracking elements + /* parse_url() yields null when there is no query string; cast so parse_str() and preg_quote() always receive a string */ + $old_params = (string)parse_url($url, PHP_URL_QUERY); + parse_str($old_params, $params); + + if(isset($params["mkevt"])){ unset($params["mkevt"]); } + if(isset($params["mkcid"])){ unset($params["mkcid"]); } + if(isset($params["mkrid"])){ unset($params["mkrid"]); } + if(isset($params["campid"])){ unset($params["campid"]); } + if(isset($params["customid"])){ unset($params["customid"]); } + if(isset($params["toolid"])){ unset($params["toolid"]); } + if(isset($params["_sop"])){ unset($params["_sop"]); } + if(isset($params["_dcat"])){ unset($params["_dcat"]); } + if(isset($params["epid"])){ unset($params["epid"]); } + /* was guarded by isset($params["epid"]), which is always false after the line above, so "oid" was never removed */ + if(isset($params["oid"])){ unset($params["oid"]); } + + $params = http_build_query($params); + + /* when no parameter survives, the "?" separator is removed together with the old query string */ + if(strlen($params) === 0){ + $replace = "\?"; + }else{ + $replace = ""; + } + + $url = preg_replace( + "/" . $replace . preg_quote($old_params, "/") . 
"$/", + $params, + $url + ); + break; + } + + return $url; + } + + private function number_format($number){ + + $number = explode(".", sprintf('%f', $number)); + + if(count($number) === 1){ + + return number_format((float)$number[0], 0, ",", "."); + } + + return number_format((float)$number[0], 0, ",", "") . "." . (string)$number[1]; + } + + private function bingratio($width, $height){ + + $ratio = [ + 474 / $width, + 474 / $height + ]; + + if($ratio[0] < $ratio[1]){ + + $ratio = $ratio[0]; + }else{ + + $ratio = $ratio[1]; + } + + return [ + floor($width * $ratio), + floor($height * $ratio) + ]; + } +} diff --git a/scraper/google.php b/scraper/google.php new file mode 100644 index 0000000..6a746f7 --- /dev/null +++ b/scraper/google.php @@ -0,0 +1,1562 @@ +<?php + +class google{ + + private const is_class = "."; + private const is_id = "#"; + + public function __construct(){ + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); + + include "lib/nextpage.php"; + $this->nextpage = new nextpage("google"); + } + + public function getfilters($page){ + + switch($page){ + + case "web": return [];/* + return [ + "country" => [ + "display" => "Country", + "option" => [ + "zz" => "Instance region", + "af" => "Afghanistan", + "al" => "Albania", + "dz" => "Algeria", + "as" => "American Samoa", + "ad" => "Andorra", + "ao" => "Angola", + "ag" => "Antigua & Barbuda", + "ar" => "Argentina", + "am" => "Armenia", + "au" => "Australia", + "at" => "Austria", + "az" => "Azerbaijan", + "bs" => "Bahamas", + "bh" => "Bahrain", + "bd" => "Bangladesh", + "by" => "Belarus", + "be" => "Belgium", + "bz" => "Belize", + "bj" => "Benin", + "bt" => "Bhutan", + "bo" => "Bolivia", + "ba" => "Bosnia & Herzegovina", + "bw" => "Botswana", + "br" => "Brazil", + "bn" => "Brunei", + "bg" => "Bulgaria", + "bf" => "Burkina Faso", + "bi" => "Burundi", + "kh" => "Cambodia", + "cm" => "Cameroon", + "ca" => "Canada", + "cv" => "Cape Verde", + "cf" => "Central African Republic", + "td" => "Chad", + 
"cl" => "Chile", + "co" => "Colombia", + "cg" => "Congo - Brazzaville", + "cd" => "Congo - Kinshasa", + "ck" => "Cook Islands", + "cr" => "Costa Rica", + "ci" => "Côte d’Ivoire", + "hr" => "Croatia", + "cu" => "Cuba", + "cy" => "Cyprus", + "cz" => "Czechia", + "dk" => "Denmark", + "dj" => "Djibouti", + "dm" => "Dominica", + "do" => "Dominican Republic", + "ec" => "Ecuador", + "eg" => "Egypt", + "sv" => "El Salvador", + "ee" => "Estonia", + "et" => "Ethiopia", + "fj" => "Fiji", + "fi" => "Finland", + "fr" => "France", + "ga" => "Gabon", + "gm" => "Gambia", + "ge" => "Georgia", + "de" => "Germany", + "gh" => "Ghana", + "gi" => "Gibraltar", + "gr" => "Greece", + "gl" => "Greenland", + "gt" => "Guatemala", + "gg" => "Guernsey", + "gy" => "Guyana", + "ht" => "Haiti", + "hn" => "Honduras", + "hk" => "Hong Kong", + "hu" => "Hungary", + "is" => "Iceland", + "in" => "India", + "id" => "Indonesia", + "iq" => "Iraq", + "ie" => "Ireland", + "im" => "Isle of Man", + "il" => "Israel", + "it" => "Italy", + "jm" => "Jamaica", + "jp" => "Japan", + "je" => "Jersey", + "jo" => "Jordan", + "kz" => "Kazakhstan", + "ke" => "Kenya", + "ki" => "Kiribati", + "kw" => "Kuwait", + "kg" => "Kyrgyzstan", + "la" => "Laos", + "lv" => "Latvia", + "lb" => "Lebanon", + "ls" => "Lesotho", + "ly" => "Libya", + "li" => "Liechtenstein", + "lt" => "Lithuania", + "lu" => "Luxembourg", + "mg" => "Madagascar", + "mw" => "Malawi", + "my" => "Malaysia", + "mv" => "Maldives", + "ml" => "Mali", + "mt" => "Malta", + "mu" => "Mauritius", + "mx" => "Mexico", + "fm" => "Micronesia", + "md" => "Moldova", + "mn" => "Mongolia", + "me" => "Montenegro", + "ma" => "Morocco", + "mz" => "Mozambique", + "mm" => "Myanmar (Burma)", + "na" => "Namibia", + "nr" => "Nauru", + "np" => "Nepal", + "nl" => "Netherlands", + "nz" => "New Zealand", + "ni" => "Nicaragua", + "ne" => "Niger", + "ng" => "Nigeria", + "nu" => "Niue", + "mk" => "North Macedonia", + "no" => "Norway", + "om" => "Oman", + "pk" => "Pakistan", + "ps" => 
"Palestine", + "pa" => "Panama", + "pg" => "Papua New Guinea", + "py" => "Paraguay", + "pe" => "Peru", + "ph" => "Philippines", + "pn" => "Pitcairn Islands", + "pl" => "Poland", + "pt" => "Portugal", + "pr" => "Puerto Rico", + "qa" => "Qatar", + "ro" => "Romania", + "ru" => "Russia", + "rw" => "Rwanda", + "ws" => "Samoa", + "sm" => "San Marino", + "st" => "São Tomé & Príncipe", + "sa" => "Saudi Arabia", + "sn" => "Senegal", + "rs" => "Serbia", + "sc" => "Seychelles", + "sl" => "Sierra Leone", + "sg" => "Singapore", + "sk" => "Slovakia", + "si" => "Slovenia", + "sb" => "Solomon Islands", + "so" => "Somalia", + "za" => "South Africa", + "kr" => "South Korea", + "es" => "Spain", + "lk" => "Sri Lanka", + "sh" => "St. Helena", + "vc" => "St. Vincent & Grenadines", + "sr" => "Suriname", + "se" => "Sweden", + "ch" => "Switzerland", + "tw" => "Taiwan", + "tj" => "Tajikistan", + "tz" => "Tanzania", + "th" => "Thailand", + "tl" => "Timor-Leste", + "tg" => "Togo", + "to" => "Tonga", + "tt" => "Trinidad & Tobago", + "tn" => "Tunisia", + "tr" => "Türkiye", + "tm" => "Turkmenistan", + "vi" => "U.S. 
Virgin Islands", + "ug" => "Uganda", + "ua" => "Ukraine", + "ae" => "United Arab Emirates", + "gb" => "United Kingdom", + "us" => "United States", + "uy" => "Uruguay", + "uz" => "Uzbekistan", + "vu" => "Vanuatu", + "ve" => "Venezuela", + "vn" => "Vietnam", + "zm" => "Zambia", + "zw" => "Zimbabwe" + ] + ], + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", + "no" => "No" + ] + ], + "lang" => [ // prefix with lang_ + "display" => "Language", + "option" => [ + "any" => "Any language", + "af" => "Afrikaans", + "ca" => "català", + "cs" => "čeština", + "da" => "dansk", + "de" => "Deutsch", + "et" => "eesti", + "en" => "English", + "es" => "español", + "eo" => "esperanto", + "tl" => "Filipino", + "fr" => "français", + "hr" => "hrvatski", + "id" => "Indonesia", + "is" => "íslenska", + "it" => "italiano", + "sw" => "Kiswahili", + "lv" => "latviešu", + "lt" => "lietuvių", + "hu" => "magyar", + "nl" => "Nederlands", + "no" => "norsk", + "pl" => "polski", + "pt" => "português", + "ro" => "română", + "sk" => "slovenčina", + "sl" => "slovenščina", + "fi" => "suomi", + "sv" => "svenska", + "vi" => "Tiếng Việt", + "tr" => "Türkçe", + "el" => "Ελληνικά", + "be" => "беларуская", + "bg" => "български", + "ru" => "русский", + "sr" => "српски", + "uk" => "українська", + "hy" => "հայերեն", + "iw" => "עברית", + "ar" => "العربية", + "fa" => "فارسی", + "hi" => "हिन्दी", + "th" => "ไทย", + "ko" => "한국어", + "zh-CN" => "中文 (简体)", + "zh-TW" => "中文 (繁體)", + "ja" => "日本語" + ] + ], + "time" => [ + "display" => "Time posted", + "option" => [ + "any" => "Any time", + "h" => "Last hour", + "d" => "Last 24 hours", + "w" => "Last week", + "m" => "Last month", + "y" => "Last year" + ] + ], + "verbatim" => [ + "display" => "Verbatim", + "option" => [ + "no" => "No", + "yes" => "Yes" + ] + ] + ];*/ + break; + + case "images": + return [ + "country" => [ // gl=<country> + "display" => "Country", + "option" => [ + "any" => "Instance's country", + "af" => "Afghanistan", + "al" => 
"Albania", + "dz" => "Algeria", + "as" => "American Samoa", + "ad" => "Andorra", + "ao" => "Angola", + "ai" => "Anguilla", + "aq" => "Antarctica", + "ag" => "Antigua and Barbuda", + "ar" => "Argentina", + "am" => "Armenia", + "aw" => "Aruba", + "au" => "Australia", + "at" => "Austria", + "az" => "Azerbaijan", + "bs" => "Bahamas", + "bh" => "Bahrain", + "bd" => "Bangladesh", + "bb" => "Barbados", + "by" => "Belarus", + "be" => "Belgium", + "bz" => "Belize", + "bj" => "Benin", + "bm" => "Bermuda", + "bt" => "Bhutan", + "bo" => "Bolivia", + "ba" => "Bosnia and Herzegovina", + "bw" => "Botswana", + "bv" => "Bouvet Island", + "br" => "Brazil", + "io" => "British Indian Ocean Territory", + "bn" => "Brunei Darussalam", + "bg" => "Bulgaria", + "bf" => "Burkina Faso", + "bi" => "Burundi", + "kh" => "Cambodia", + "cm" => "Cameroon", + "ca" => "Canada", + "cv" => "Cape Verde", + "ky" => "Cayman Islands", + "cf" => "Central African Republic", + "td" => "Chad", + "cl" => "Chile", + "cn" => "China", + "cx" => "Christmas Island", + "cc" => "Cocos (Keeling) Islands", + "co" => "Colombia", + "km" => "Comoros", + "cg" => "Congo", + "cd" => "Congo, the Democratic Republic of the", + "ck" => "Cook Islands", + "cr" => "Costa Rica", + "ci" => "Cote D'ivoire", + "hr" => "Croatia", + "cu" => "Cuba", + "cy" => "Cyprus", + "cz" => "Czech Republic", + "dk" => "Denmark", + "dj" => "Djibouti", + "dm" => "Dominica", + "do" => "Dominican Republic", + "ec" => "Ecuador", + "eg" => "Egypt", + "sv" => "El Salvador", + "gq" => "Equatorial Guinea", + "er" => "Eritrea", + "ee" => "Estonia", + "et" => "Ethiopia", + "fk" => "Falkland Islands (Malvinas)", + "fo" => "Faroe Islands", + "fj" => "Fiji", + "fi" => "Finland", + "fr" => "France", + "gf" => "French Guiana", + "pf" => "French Polynesia", + "tf" => "French Southern Territories", + "ga" => "Gabon", + "gm" => "Gambia", + "ge" => "Georgia", + "de" => "Germany", + "gh" => "Ghana", + "gi" => "Gibraltar", + "gr" => "Greece", + "gl" => "Greenland", + "gd" 
=> "Grenada", + "gp" => "Guadeloupe", + "gu" => "Guam", + "gt" => "Guatemala", + "gn" => "Guinea", + "gw" => "Guinea-Bissau", + "gy" => "Guyana", + "ht" => "Haiti", + "hm" => "Heard Island and Mcdonald Islands", + "va" => "Holy See (Vatican City State)", + "hn" => "Honduras", + "hk" => "Hong Kong", + "hu" => "Hungary", + "is" => "Iceland", + "in" => "India", + "id" => "Indonesia", + "ir" => "Iran, Islamic Republic of", + "iq" => "Iraq", + "ie" => "Ireland", + "il" => "Israel", + "it" => "Italy", + "jm" => "Jamaica", + "jp" => "Japan", + "jo" => "Jordan", + "kz" => "Kazakhstan", + "ke" => "Kenya", + "ki" => "Kiribati", + "kp" => "Korea, Democratic People's Republic of", + "kr" => "Korea, Republic of", + "kw" => "Kuwait", + "kg" => "Kyrgyzstan", + "la" => "Lao People's Democratic Republic", + "lv" => "Latvia", + "lb" => "Lebanon", + "ls" => "Lesotho", + "lr" => "Liberia", + "ly" => "Libyan Arab Jamahiriya", + "li" => "Liechtenstein", + "lt" => "Lithuania", + "lu" => "Luxembourg", + "mo" => "Macao", + "mk" => "Macedonia, the Former Yugosalv Republic of", + "mg" => "Madagascar", + "mw" => "Malawi", + "my" => "Malaysia", + "mv" => "Maldives", + "ml" => "Mali", + "mt" => "Malta", + "mh" => "Marshall Islands", + "mq" => "Martinique", + "mr" => "Mauritania", + "mu" => "Mauritius", + "yt" => "Mayotte", + "mx" => "Mexico", + "fm" => "Micronesia, Federated States of", + "md" => "Moldova, Republic of", + "mc" => "Monaco", + "mn" => "Mongolia", + "ms" => "Montserrat", + "ma" => "Morocco", + "mz" => "Mozambique", + "mm" => "Myanmar", + "na" => "Namibia", + "nr" => "Nauru", + "np" => "Nepal", + "nl" => "Netherlands", + "an" => "Netherlands Antilles", + "nc" => "New Caledonia", + "nz" => "New Zealand", + "ni" => "Nicaragua", + "ne" => "Niger", + "ng" => "Nigeria", + "nu" => "Niue", + "nf" => "Norfolk Island", + "mp" => "Northern Mariana Islands", + "no" => "Norway", + "om" => "Oman", + "pk" => "Pakistan", + "pw" => "Palau", + "ps" => "Palestinian Territory, Occupied", + "pa" => 
"Panama", + "pg" => "Papua New Guinea", + "py" => "Paraguay", + "pe" => "Peru", + "ph" => "Philippines", + "pn" => "Pitcairn", + "pl" => "Poland", + "pt" => "Portugal", + "pr" => "Puerto Rico", + "qa" => "Qatar", + "re" => "Reunion", + "ro" => "Romania", + "ru" => "Russian Federation", + "rw" => "Rwanda", + "sh" => "Saint Helena", + "kn" => "Saint Kitts and Nevis", + "lc" => "Saint Lucia", + "pm" => "Saint Pierre and Miquelon", + "vc" => "Saint Vincent and the Grenadines", + "ws" => "Samoa", + "sm" => "San Marino", + "st" => "Sao Tome and Principe", + "sa" => "Saudi Arabia", + "sn" => "Senegal", + "cs" => "Serbia and Montenegro", + "sc" => "Seychelles", + "sl" => "Sierra Leone", + "sg" => "Singapore", + "sk" => "Slovakia", + "si" => "Slovenia", + "sb" => "Solomon Islands", + "so" => "Somalia", + "za" => "South Africa", + "gs" => "South Georgia and the South Sandwich Islands", + "es" => "Spain", + "lk" => "Sri Lanka", + "sd" => "Sudan", + "sr" => "Suriname", + "sj" => "Svalbard and Jan Mayen", + "sz" => "Swaziland", + "se" => "Sweden", + "ch" => "Switzerland", + "sy" => "Syrian Arab Republic", + "tw" => "Taiwan, Province of China", + "tj" => "Tajikistan", + "tz" => "Tanzania, United Republic of", + "th" => "Thailand", + "tl" => "Timor-Leste", + "tg" => "Togo", + "tk" => "Tokelau", + "to" => "Tonga", + "tt" => "Trinidad and Tobago", + "tn" => "Tunisia", + "tr" => "Turkey", + "tm" => "Turkmenistan", + "tc" => "Turks and Caicos Islands", + "tv" => "Tuvalu", + "ug" => "Uganda", + "ua" => "Ukraine", + "ae" => "United Arab Emirates", + "uk" => "United Kingdom", + "us" => "United States", + "um" => "United States Minor Outlying Islands", + "uy" => "Uruguay", + "uz" => "Uzbekistan", + "vu" => "Vanuatu", + "ve" => "Venezuela", + "vn" => "Viet Nam", + "vg" => "Virgin Islands, British", + "vi" => "Virgin Islands, U.S.", + "wf" => "Wallis and Futuna", + "eh" => "Western Sahara", + "ye" => "Yemen", + "zm" => "Zambia", + "zw" => "Zimbabwe" + ] + ], + "nsfw" => [ + "display" => 
"NSFW", + "option" => [ + "yes" => "Yes", // safe=active + "no" => "No" // safe=off + ] + ], + "lang" => [ // lr=<lang> (prefix lang with "lang_") + "display" => "Language", + "option" => [ + "any" => "Any language", + "ar" => "Arabic", + "bg" => "Bulgarian", + "ca" => "Catalan", + "cs" => "Czech", + "da" => "Danish", + "de" => "German", + "el" => "Greek", + "en" => "English", + "es" => "Spanish", + "et" => "Estonian", + "fi" => "Finnish", + "fr" => "French", + "hr" => "Croatian", + "hu" => "Hungarian", + "id" => "Indonesian", + "is" => "Icelandic", + "it" => "Italian", + "iw" => "Hebrew", + "ja" => "Japanese", + "ko" => "Korean", + "lt" => "Lithuanian", + "lv" => "Latvian", + "nl" => "Dutch", + "no" => "Norwegian", + "pl" => "Polish", + "pt" => "Portuguese", + "ro" => "Romanian", + "ru" => "Russian", + "sk" => "Slovak", + "sl" => "Slovenian", + "sr" => "Serbian", + "sv" => "Swedish", + "tr" => "Turkish", + "zh-CN" => "Chinese (Simplified)", + "zh-TW" => "Chinese (Traditional)" + ] + ], + "newer" => [ // &sort=review-date:r:20090301:20090430 + "display" => "Newer than", + "option" => "_DATE" + ], + "older" => [ + "display" => "Older than", + "option" => "_DATE" + ], + "size" => [ // tbs=isz:<size> + "display" => "Size", + "option" => [ + "any" => "Any size", + "l" => "Large", + "m" => "Medium", + "i" => "Icon" + ] + ], + "color" => [ // tbs=ic:<color> + "display" => "Color", + "option" => [ + "any" => "Any color", + "gray" => "Black and white", + "trans" => "Transparent", + // from here, format is + // tbs=specific,isc:<color> + "red" => "Red", + "orange" => "Orange", + "yellow" => "Yellow", + "green" => "Green", + "teal" => "Teal", + "blue" => "Blue", + "purple" => "Purple", + "pink" => "Pink", + "white" => "White", + "gray" => "Gray", + "black" => "Black", + "brown" => "Brown" + ] + ], + "type" => [ // tbs=itp:<type> + "display" => "Type", + "option" => [ + "any" => "Any type", + "clipart" => "Clip Art", + "lineart" => "Line Drawing", + "animated" => "GIF" + ] + 
], + "rights" => [ // tbs=il:<rights> + "display" => "Usage rights", + "option" => [ + "any" => "No license", + "cl" => "Creative Commons licenses", + "ol" => "Commercial & other licenses" + ] + ] + ]; + break; + } + } + + private function get($url, $get = []){ + + $headers = [ + "User-Agent: Mozilla/5.0 (Linux; U; Android 2.3.3; pt-pt; LG-P500h-parrot Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1 MMS/LG-Android-MMS-V1.0/1.2", + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1" + ]; + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + return $data; + } + + public function web($get){ + + $handle = fopen("scraper/google.html", "r"); + $html = fread($handle, filesize("scraper/google.html")); + fclose($handle); + + $this->fuckhtml->load($html); + + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + $styles = + 
$this->fuckhtml + ->getElementsByTagName("style"); + + $this->computedstyle = []; + + foreach($styles as $style){ + + $this->computedstyle = + array_merge( + $this->computedstyle, + $this->parsestyles($style["innerHTML"]) + ); + } + + // get images in javascript var + preg_match( + '/google\.ldi=({[^}]+})/', + $html, + $js_image + ); + + if(count($js_image) !== 0){ + + $js_image = json_decode($js_image[1], true); + }else{ + + $js_image = []; + } + + // get nodes + // fuck you google!!!!!!!!!!!!!! + + $containers = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "background-color" => "#fff", + "margin-bottom" => "10px", + "-webkit-box-shadow" => "0 1px 6px rgba(32,33,36,0.28)", + "border-radius" => "8px" + ], + self::is_class + ), + "div" + ); + + foreach($containers as $container){ + + $this->fuckhtml->load($container); + + // get link at the top + $link = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($link) !== 0){ + + $link = + $this->decodeurl( + $link + [0] + ["attributes"] + ["href"] + ); + } + + /* + Check for carousel presence + */ + $carousel = + $this->fuckhtml + ->getElementsByClassName( + "pcitem", + "div" + ); + + $title = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "color" => "#1967d2", + "font-size" => "20px", + "line-height" => "26px" + ], + self::is_class + ), + "div" + ); + + $carousel_title = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "font-size" => "16px", + "line-height" => "20px", + "font-weight" => "400" + ], + self::is_class + ), + "div" + ); + + if(count($carousel) !== 0){ + + $sublink = []; // twitter carousel sublinks + foreach($carousel as $item){ + + $this->fuckhtml->load($item); + + $url = + $this->decodeurl( + $this->fuckhtml + ->getElementsByTagName( + "a" + )[0] + ["attributes"] + ["href"] + ); + + // detect if its a twitter carousel or + // a list of news articles + + $grey_node = + $this->fuckhtml + ->getElementsByClassName( + 
$this->findstyles( + [ + "white-space" => "pre-line", + "word-wrap" => "break-word" + ], + self::is_class + ), + "div" + ); + + if(count($carousel_title) !== 0){ + + if( + $this->fuckhtml + ->getTextContent( + $carousel_title[0] + ) + == "Top stories" + ){ + + $img = + $this->fuckhtml + ->getElementsByTagName("img"); + + if( + count($img) !== 0 && + isset($img[0]["attributes"]["id"]) && + isset($js_image[$img[0]["attributes"]["id"]]) + ){ + + $img = [ + "url" => $js_image[$img[0]["attributes"]["id"]], + "ratio" => "16:9" + ]; + }else{ + + $img = [ + "url" => null, + "ratio" => null + ]; + } + + /* + Is a news node + */ + $out["news"][] = [ + "title" => + $this->fuckhtml + ->getTextContent( + $grey_node[0] + ), + "description" => null, + "date" => + strtotime( + explode( + "\n", + $grey_node[1]["innerHTML"] + )[1] + ), + "thumb" => $img, + "url" => $url + ]; + } + }else{ + + /* + Is a web node (twitter-like) + create a link -> sublink structure and + ignore images + */ + + switch(count($grey_node)){ + + case 0: + continue 2; + + case 1: + $sublink_title = $grey_node[0]; + $sublink_description = null; + break; + + case 2: + $sublink_title = $grey_node[1]; + $sublink_description = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $grey_node[0] + ) + ); + break; + } + + $sublink_url = + $this->decodeurl( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByTagName( + "a" + )[0] + ["attributes"] + ["href"] + ) + ); + + if($link == $sublink_url){ + + continue; + } + + $sublink_title = + explode( + " • ", + $this->fuckhtml + ->getTextContent( + $sublink_title["innerHTML"] + ) + ); + + if(count($sublink_title) !== 1){ + + $date = strtotime($sublink_title[1]); + }else{ + + $date = null; + } + + $sublink_title = $this->titledots($sublink_title[0]); + + $sublink[] = [ + "title" => $sublink_title, + "date" => $date, + "description" => $sublink_description, + "url" => $sublink_url + ]; + } + } + + // if it was a web node + if(count($sublink) !== 
0){ + + $out["web"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0] + ) + ), + "description" => null, + "url" => $url, + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => $sublink, + "table" => [] + ]; + } + + continue; + } + + if(count($title) !== 0){ + + /* + Get WEB search results + */ + + $thumb = + $this->fuckhtml + ->getElementsByTagName("img"); + + if( + count($thumb) !== 0 && + isset($js_image[$thumb[0]["attributes"]["id"]]) + ){ + + $thumb = [ + "url" => + $js_image[$thumb[0]["attributes"]["id"]], + "ratio" => "1:1" + ]; + }else{ + + $thumb = [ + "url" => null, + "ratio" => null + ]; + } + + // this contains description, sublinks + $inner_category = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "white-space" => "pre-line", + "word-wrap" => "break-word" + ], + self::is_class + ), + "div" + ); + + // set empty values + $description = null; + $table = []; + $sublinks = []; + $date = null; + + foreach($inner_category as $category){ + + if($category["level"] !== 6){ + + // enterring protocol 6 + // and u dont seem to understaaaaandddddd + continue; + } + + $this->fuckhtml->load($category); + + // check if its a table + preg_match( + '/^[A-z0-9 ]+: <span/', + $category["innerHTML"], + $tablematch + ); + + if(count($tablematch) !== 0){ + + $categories = explode("<br>", $category["innerHTML"]); + + foreach($categories as $cat){ + + $cat = explode(":", $cat, 2); + + $table[ + $this->fuckhtml + ->getTextContent( + $cat[0] + ) + ] = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $cat[1] + ) + ); + } + continue; + } + + $spans = + $this->fuckhtml + ->getElementsByTagName("span"); + + foreach($spans as $span){ + + // replace element with nothing + if(empty($description)){ + $category["innerHTML"] = + str_replace( + $span["outerHTML"], + "", + $category["innerHTML"] + ); + } + + // get rating + 
if(isset($span["attributes"]["aria-hidden"])){ + + $table["Rating"] = $span["innerHTML"]; + continue; + } + } + + if(empty($description)){ + + $description = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $category + ) + ); + } + } + + // check if traversed div is the description + /* + if( + count( + $this->fuckhtml + ->getElementsByTagName("*") + ) === 0 + ){ + + $description = + $this->fuckhtml + ->getTextContent($inner_category); + }else{ + + $this-> + + // we need to traverse description struct + foreach($inner_category as $category){ + + // detect description + $this->fuckhtml->load($category); + + $spans = + $this->fuckhtml + ->getElementsByTagName("span"); + + $is_desc = false; + $is_first_span = true; + + foreach($spans as $span){ + + // get rating + if(isset($span["attributes"]["aria-hidden"])){ + + $table["Rating"] = $span["innerHTML"] . "/5"; + continue; + } + + // get date posted + if( + $is_first_span && + $date_tmp = strtotime($span["innerHTML"]) + ){ + + $date = $date_tmp; + continue; + } + + $is_first_span = false; + } + } + }*/ + + // get sublinks + $this->fuckhtml->load($container["innerHTML"]); + + $as = + $this->fuckhtml->getElementsByTagName("a"); + + foreach($as as $a){ + + $this->fuckhtml->load($a); + + $detect = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "color" => "#1967d2", + "font-size" => "14px", + "line-height" => "20px" + ], + self::is_class + ), + "span" + ); + + if(count($detect) !== 0){ + + $sublinks[] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $a + ) + ), + "date" => null, + "description" => null, + "url" => + $this->decodeurl( + $a["attributes"]["href"] + ) + ]; + } + } + + $data = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0] + ) + ), + "description" => $description, + "url" => $link, + "date" => $date, + "type" => "web", + "thumb" => $thumb, + "sublink" => $sublinks, + "table" => $table + ]; + + $out["web"][] = 
$data; + + continue; + } + + /* + Check related searches node + */ + $relateds = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "display" => "block", + "position" => "relative", + "width" => "100%" + ], + self::is_class + ), + "a" + ); + + if(count($relateds) !== 0){ + + foreach($relateds as $related){ + + $out["related"][] = + $this->fuckhtml + ->getTextContent( + $related + ); + } + } + + /* + Get next page + */ + $nextpage = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "-webkit-box-flex" => "1", + "display" => "block" + ], + self::is_class + ), + "a" + ); + + if(count($nextpage) !== 0){ + + $out["npt"] = + explode( + "?", + $this->fuckhtml + ->getTextContent( + $nextpage[0] + ["attributes"] + ["href"] + ) + )[1]; + } + } + + return $out; + } + + public function image($get){ + + $handle = fopen("scraper/google-img.html", "r"); + $html = fread($handle, filesize("scraper/google-img.html")); + fclose($handle); + + $this->fuckhtml->load($html); + + $out = [ + "status" => "ok", + "npt" => null, + "image" => [] + ]; + + $images = + $this->fuckhtml + ->getElementsByClassName( + "islrtb isv-r", + "div" + ); + + // get next page + // https://www.google.com/search + // ?q=higurashi + // &tbm=isch + // &async=_id%3Aislrg_c%2C_fmt%3Ahtml + // &asearch=ichunklite + // &ved=0ahUKEwidjYXJqJSAAxWrElkFHZ07CDwQtDIIQygA + $ved = + $this->fuckhtml + ->getElementById("islrg", "div"); + + if($ved){ + + $ved = + $this->fuckhtml + ->getTextContent( + $ved["attributes"]["data-ved"] + ); + + // &vet=1{$ved}..i (10ahUKEwidjYXJqJSAAxWrElkFHZ07CDwQtDIIQygA..i) + + /* + These 2 are handled by us + start = start + number of results + ijn = current page number + */ + // &start=100 + // &ijn=1 + + // &imgvl=CAEY7gQgBSj3Aji8VTjXVUC4AUC3AUgAYNdV + preg_match( + '/var e=\'([A-z0-9]+)\';/', + $html, + $imgvl + ); + + $imgvl = $imgvl[1]; + + $out["npt"] = [ + "q" => $get["s"], + "tbm" => "isch", + "async" => "_id:islrg_c,_fmt:html", + "asearch" => 
"ichunklite", + "ved" => $ved, + "vet" => "1" . $ved . "..i", + "start" => 100, + "ijn" => 1, + "imgvl" => $imgvl + ]; + } + + foreach($images as $image){ + + $this->fuckhtml->load($image); + $img = + $this->fuckhtml + ->getElementsByTagName("img")[0]; + + $og_width = (int)$image["attributes"]["data-ow"]; + $og_height = (int)$image["attributes"]["data-oh"]; + $thumb_width = (int)$image["attributes"]["data-tw"]; + + $ratio = $og_width / $og_height; + + if(isset($img["attributes"]["data-src"])){ + + $src = &$img["attributes"]["data-src"]; + }else{ + + $src = &$img["attributes"]["src"]; + } + + $thumb_height = floor($thumb_width / $ratio); + + $out["image"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $image["attributes"]["data-pt"] + ) + ), + "source" => [ + [ + "url" => + $this->fuckhtml + ->getTextContent( + $image["attributes"]["data-ou"] + ), + "width" => $og_width, + "height" => $og_height + ], + [ + "url" => + $this->fuckhtml + ->getTextContent( + $src + ), + "width" => $thumb_width, + "height" => $thumb_height + ] + ], + "url" => + $this->fuckhtml + ->getTextContent( + $image["attributes"]["data-ru"] + ) + ]; + } + + return $out; + } + + private function findstyles($rules, $is){ + + ksort($rules); + + foreach($this->computedstyle as $stylename => $styles){ + + if($styles == $rules){ + + preg_match( + '/\\' . $is . 
'([^ .]+)/', + $stylename, + $out + ); + + if(count($out) === 2){ + + return $out[1]; + } + + return false; + } + } + + return false; + } + + /* Parses the text of a <style> element into [selector => [property => value]]; comma-separated selectors each get their own entry and every property list is sorted by property name. */ + private function parsestyles($style){ + + // get style tags + preg_match_all( + '/([^{]+){([^}]+)}/', + $style, + $tags_regex + ); + + $tags = []; + + for($i=0; $i<count($tags_regex[0]); $i++){ + + $tagnames = explode(",", trim($tags_regex[1][$i])); + + foreach($tagnames as $tagname){ + + $tagname = trim($tagname); + + if(!isset($tags[$tagname])){ + $tags[$tagname] = []; + } + + $values = explode(";", $tags_regex[2][$i]); + + foreach($values as $value){ + + $value = explode(":", $value, 2); + + if(count($value) !== 2){ + + continue; + } + + $tags[$tagname][trim($value[0])] = + trim($value[1]); + } + } + } + + foreach($tags as &$value){ + + ksort($value); + } + /* drop the reference so the last element of $tags is not left aliased to $value */ + unset($value); + + return $tags; + } + + /* Extracts and URL-decodes the target of a "/url?q=" or "/interstitial?url=" redirect link; returns null when $url matches neither form. */ + private function decodeurl($url){ + + preg_match( + '/^\/url\?q=([^&]+)|^\/interstitial\?url=([^&]+)/', + $this->fuckhtml + ->getTextContent($url), + $match + ); + + if(count($match) !== 0){ + + if(!empty($match[1])){ + + return urldecode($match[1]); + } + + if(!empty($match[2])){ + + return urldecode($match[2]); + } + } + + return null; + } + + /* Removes trailing dots, ellipsis characters and whitespace from a title. */ + private function titledots($title){ + + /* rtrim() works on bytes: listing the multi-byte ellipsis (E2 80 A6) in its character list would also strip those single bytes off other UTF-8 characters (e.g. the C3 A6 encoding of "ae" ligature), so a UTF-8 aware regex is used instead */ + $trimmed = preg_replace('/[.… \t\n\r\0\x0B]+\z/u', "", (string)$title); + + /* null means $title was not valid UTF-8: fall back to trimming the ASCII characters only */ + return $trimmed ?? rtrim((string)$title, ". \t\n\r\0\x0B"); + } +} + + diff --git a/scraper/marginalia.php b/scraper/marginalia.php new file mode 100644 index 0000000..c8ab09f --- /dev/null +++ b/scraper/marginalia.php @@ -0,0 +1,242 @@ +<?php + +class marginalia{ + public function __construct(){ + + $this->key = "public"; + } + + public function getfilters($page){ + + switch($page){ + + case "web": + return [ + "profile" => [ + "display" => "Profile", + "option" => [ + "any" => "Default", + "modern" => "Modern" + ] + ], + "format" => [ + "display" => "Format", + "option" => [ + "any" => "Any", + "html5" => "html5", + "xhtml" => "xhtml", + "html123" => "html123" + ] + ], + "file" => [ + "display" => "File", + "option" => [ + "any" => "Any", + "nomedia" => "Deny media", + "media" => 
"Contains media", + "audio" => "Contains audio", + "video" => "Contains video", + "archive" => "Contains archive", + "document" => "Contains document" + ] + ], + "javascript" => [ + "display" => "Javascript", + "option" => [ + "any" => "Allow JS", + "deny" => "Deny JS", + "require" => "Require JS" + ] + ], + "trackers" => [ + "display" => "Trackers", + "option" => [ + "any" => "Allow trackers", + "deny" => "Deny trackers", + "require" => "Require trackers" + ] + ], + "cookies" => [ + "display" => "Cookies", + "option" => [ + "any" => "Allow cookies", + "deny" => "Deny cookies", + "require" => "Require cookies" + ] + ], + "affiliate" => [ + "display" => "Affiliate links in body", + "option" => [ + "any" => "Allow affiliate links", + "deny" => "Deny affiliate links", + "require" => "Require affiliate links" + ] + ] + ]; + } + } + + private function get($url, $get = []){ + + $headers = [ + "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1" + ]; + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . 
$get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + return $data; + } + + public function web($get){ + + $search = [$get["s"]]; + $profile = $get["profile"]; + $format = $get["format"]; + $file = $get["file"]; + + foreach( + [ + "javascript" => $get["javascript"], + "trackers" => $get["trackers"], + "cookies" => $get["cookies"], + "affiliate" => $get["affiliate"] + ] + as $key => $value + ){ + + if($value == "any"){ continue; } + + switch($key){ + + case "javascript": $str = "js:true"; break; + case "trackers": $str = "special:tracking"; break; + case "cookies": $str = "special:cookies"; break; + case "affiliate": $str = "special:affiliate"; break; + } + + if($value == "deny"){ + $str = "-" . $str; + } + + $search[] = $str; + } + + if($format != "any"){ + + $search[] = "format:$format"; + } + + switch($file){ + + case "any": break; + case "nomedia": $search[] = "-special:media"; break; + case "media": $search[] = "special:media"; break; + + default: + $search[] = "file:$file"; + } + + $search = implode(" ", $search); + + $params = [ + "count" => 20 + ]; + + if($profile == "modern"){ + + $params["index"] = 1; + } + + try{ + $json = + $this->get( + "https://api.marginalia.nu/{$this->key}/search/" . urlencode($search), + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to get JSON"); + } + + if($json == "Slow down"){ + + throw new Exception("The API key used is rate limited. 
Please try again in a few minutes."); + } + + $json = json_decode($json, true); + /* + $handle = fopen("scraper/marginalia.json", "r"); + $json = json_decode(fread($handle, filesize("scraper/marginalia.json")), true); + fclose($handle);*/ + + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + foreach($json["results"] as $result){ + + $out["web"][] = [ + "title" => $result["title"], + "description" => str_replace("\n", " ", $result["description"]), + "url" => $result["url"], + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + } + + return $out; + } +} + diff --git a/scraper/mojeek.php b/scraper/mojeek.php new file mode 100644 index 0000000..a0b5016 --- /dev/null +++ b/scraper/mojeek.php @@ -0,0 +1,1182 @@ +<?php + +class mojeek{ + public function __construct(){ + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); + + include "lib/nextpage.php"; + $this->nextpage = new nextpage("mojeek"); + } + + public function getfilters($page){ + + switch($page){ + + case "web": + return [ + "focus" => [ + "display" => "Focus", + "option" => [ + "any" => "No focus", + "blogs" => "Blogs", + "Dictionary" => "Dictionary", + "Recipes" => "Recipes", + "Time" => "Time", + "Weather" => "Weather" + ] + ], + "lang" => [ + "display" => "Language", + "option" => [ + "any" => "Any language", + "af" => "Afrikaans", + "sq" => "Albanian", + "an" => "Aragonese", + "ay" => "Aymara", + "bi" => "Bislama", + "br" => "Breton", + "ca" => "Catalan", + "kw" => "Cornish", + "co" => "Corsican", + "hr" => "Croatian", + "da" => "Danish", + "nl" => "Dutch", + "dz" => "Dzongkha", + "en" => "English", + "fj" => "Fijian", + "fi" => "Finnish", + "fr" => "French", + "gd" => "Gaelic", + "gl" => "Galician", + "de" => 
"German", + "ht" => "Haitian", + "io" => "Ido", + "id" => "Indonesian", + "ia" => "Interlingua", + "ie" => "Interlingue", + "ga" => "Irish", + "it" => "Italian", + "rw" => "Kinyarwanda", + "la" => "Latin", + "li" => "Limburgish", + "lb" => "Luxembourgish", + "no" => "Norwegian", + "nb" => "Norwegian Bokmål", + "nn" => "Norwegian Nynorsk", + "oc" => "Occitan (post 1500)", + "pl" => "Polish", + "pt" => "Portuguese", + "rm" => "Romansh", + "rn" => "Rundi", + "sg" => "Sango", + "so" => "Somali", + "es" => "Spanish", + "sw" => "Swahili", + "ss" => "Swati", + "sv" => "Swedish", + "ty" => "Tahitian", + "to" => "Tonga (Tonga Islands)", + "ts" => "Tsonga", + "vo" => "Volapük", + "wa" => "Walloon", + "cy" => "Welsh", + "xh" => "Xhosa", + "zu" => "Zulu" + ] + ], + "country" => [ + "display" => "Country", + "option" => [ + "any" => "No location bias", + "af" => "Afghanistan", + "ax" => "Åland Islands", + "al" => "Albania", + "dz" => "Algeria", + "as" => "American Samoa", + "ad" => "Andorra", + "ao" => "Angola", + "ai" => "Anguilla", + "aq" => "Antarctica", + "ag" => "Antigua and Barbuda", + "ar" => "Argentina", + "am" => "Armenia", + "aw" => "Aruba", + "au" => "Australia", + "at" => "Austria", + "az" => "Azerbaijan", + "bs" => "Bahamas", + "bh" => "Bahrain", + "bd" => "Bangladesh", + "bb" => "Barbados", + "by" => "Belarus", + "be" => "Belgium", + "bz" => "Belize", + "bj" => "Benin", + "bm" => "Bermuda", + "bt" => "Bhutan", + "bo" => "Bolivia (Plurinational State of)", + "bq" => "Bonaire, Sint Eustatius and Saba", + "ba" => "Bosnia and Herzegovina", + "bw" => "Botswana", + "bv" => "Bouvet Island", + "br" => "Brazil", + "io" => "British Indian Ocean Territory", + "bn" => "Brunei Darussalam", + "bg" => "Bulgaria", + "bf" => "Burkina Faso", + "bi" => "Burundi", + "cv" => "Cabo Verde", + "kh" => "Cambodia", + "cm" => "Cameroon", + "ca" => "Canada", + "ky" => "Cayman Islands", + "cf" => "Central African Republic", + "td" => "Chad", + "cl" => "Chile", + "cn" => "China", + "cx" => 
"Christmas Island", + "cc" => "Cocos (Keeling) Islands", + "co" => "Colombia", + "km" => "Comoros", + "cg" => "Congo", + "cd" => "Congo (Democratic Republic of the)", + "ck" => "Cook Islands", + "cr" => "Costa Rica", + "ci" => "Côte d'Ivoire", + "hr" => "Croatia", + "cu" => "Cuba", + "cw" => "Curaçao", + "cy" => "Cyprus", + "cz" => "Czechia", + "dk" => "Denmark", + "dj" => "Djibouti", + "dm" => "Dominica", + "do" => "Dominican Republic", + "ec" => "Ecuador", + "eg" => "Egypt", + "sv" => "El Salvador", + "gq" => "Equatorial Guinea", + "er" => "Eritrea", + "ee" => "Estonia", + "et" => "Ethiopia", + "fk" => "Falkland Islands (Malvinas)", + "fo" => "Faroe Islands", + "fj" => "Fiji", + "fi" => "Finland", + "fr" => "France", + "gf" => "French Guiana", + "pf" => "French Polynesia", + "tf" => "French Southern Territories", + "ga" => "Gabon", + "gm" => "Gambia", + "ge" => "Georgia", + "de" => "Germany", + "gh" => "Ghana", + "gi" => "Gibraltar", + "gr" => "Greece", + "gl" => "Greenland", + "gd" => "Grenada", + "gp" => "Guadeloupe", + "gu" => "Guam", + "gt" => "Guatemala", + "gg" => "Guernsey", + "gn" => "Guinea", + "gw" => "Guinea-Bissau", + "gy" => "Guyana", + "ht" => "Haiti", + "hm" => "Heard Island and McDonald Islands", + "va" => "Holy See", + "hn" => "Honduras", + "hk" => "Hong Kong", + "hu" => "Hungary", + "is" => "Iceland", + "in" => "India", + "id" => "Indonesia", + "ir" => "Iran (Islamic Republic of)", + "iq" => "Iraq", + "ie" => "Ireland", + "im" => "Isle of Man", + "il" => "Israel", + "it" => "Italy", + "jm" => "Jamaica", + "jp" => "Japan", + "je" => "Jersey", + "jo" => "Jordan", + "kz" => "Kazakhstan", + "ke" => "Kenya", + "ki" => "Kiribati", + "kp" => "Korea (Democratic People's Republic of)", + "kr" => "Korea (Republic of)", + "kw" => "Kuwait", + "kg" => "Kyrgyzstan", + "la" => "Lao People's Democratic Republic", + "lv" => "Latvia", + "lb" => "Lebanon", + "ls" => "Lesotho", + "lr" => "Liberia", + "ly" => "Libya", + "li" => "Liechtenstein", + "lt" => 
"Lithuania", + "lu" => "Luxembourg", + "mo" => "Macao", + "mk" => "Macedonia (the former Yugoslav Republic of)", + "mg" => "Madagascar", + "mw" => "Malawi", + "my" => "Malaysia", + "mv" => "Maldives", + "ml" => "Mali", + "mt" => "Malta", + "mh" => "Marshall Islands", + "mq" => "Martinique", + "mr" => "Mauritania", + "mu" => "Mauritius", + "yt" => "Mayotte", + "mx" => "Mexico", + "fm" => "Micronesia (Federated States of)", + "md" => "Moldova (Republic of)", + "mc" => "Monaco", + "mn" => "Mongolia", + "me" => "Montenegro", + "ms" => "Montserrat", + "ma" => "Morocco", + "mz" => "Mozambique", + "mm" => "Myanmar", + "na" => "Namibia", + "nr" => "Nauru", + "np" => "Nepal", + "nl" => "Netherlands", + "nc" => "New Caledonia", + "nz" => "New Zealand", + "ni" => "Nicaragua", + "ne" => "Niger", + "ng" => "Nigeria", + "nu" => "Niue", + "nf" => "Norfolk Island", + "mp" => "Northern Mariana Islands", + "no" => "Norway", + "om" => "Oman", + "pk" => "Pakistan", + "pw" => "Palau", + "ps" => "Palestine, State of", + "pa" => "Panama", + "pg" => "Papua New Guinea", + "py" => "Paraguay", + "pe" => "Peru", + "ph" => "Philippines", + "pn" => "Pitcairn", + "pl" => "Poland", + "pt" => "Portugal", + "pr" => "Puerto Rico", + "qa" => "Qatar", + "re" => "Réunion", + "ro" => "Romania", + "ru" => "Russian Federation", + "rw" => "Rwanda", + "bl" => "Saint Barthélemy", + "sh" => "Saint Helena, Ascension and Tristan da Cunha", + "kn" => "Saint Kitts and Nevis", + "lc" => "Saint Lucia", + "mf" => "Saint Martin (French part)", + "pm" => "Saint Pierre and Miquelon", + "vc" => "Saint Vincent and the Grenadines", + "ws" => "Samoa", + "sm" => "San Marino", + "st" => "Sao Tome and Principe", + "sa" => "Saudi Arabia", + "sn" => "Senegal", + "rs" => "Serbia", + "sc" => "Seychelles", + "sl" => "Sierra Leone", + "sg" => "Singapore", + "sx" => "Sint Maarten (Dutch part)", + "sk" => "Slovakia", + "si" => "Slovenia", + "sb" => "Solomon Islands", + "so" => "Somalia", + "za" => "South Africa", + "gs" => "South 
Georgia and South Sandwich Islands", + "ss" => "South Sudan", + "es" => "Spain", + "lk" => "Sri Lanka", + "sd" => "Sudan", + "sr" => "Suriname", + "sj" => "Svalbard and Jan Mayen", + "sz" => "Swaziland", + "se" => "Sweden", + "ch" => "Switzerland", + "sy" => "Syrian Arab Republic", + "tw" => "Taiwan", + "tj" => "Tajikistan", + "tz" => "Tanzania, United Republic of", + "th" => "Thailand", + "tl" => "Timor-Leste", + "tg" => "Togo", + "tk" => "Tokelau", + "to" => "Tonga", + "tt" => "Trinidad and Tobago", + "tn" => "Tunisia", + "tr" => "Turkey", + "tm" => "Turkmenistan", + "tc" => "Turks and Caicos Islands", + "tv" => "Tuvalu", + "ug" => "Uganda", + "ua" => "Ukraine", + "ae" => "United Arab Emirates", + "gb" => "United Kingdom", + "us" => "United States of America", + "um" => "United States Minor Outlying Islands", + "uy" => "Uruguay", + "uz" => "Uzbekistan", + "vu" => "Vanuatu", + "ve" => "Venezuela (Bolivarian Republic of)", + "vn" => "Viet Nam", + "vg" => "Virgin Islands (British)", + "vi" => "Virgin Islands (U.S.)", + "wf" => "Wallis and Futuna", + "eh" => "Western Sahara", + "ye" => "Yemen", + "zm" => "Zambia", + "zw" => "Zimbabwe" + ] + ], + "region" => [ + "display" => "Region", + "option" => [ + "any" => "Any region", + "eu" => "European Union", + "de" => "Germany", + "fr" => "France", + "uk" => "United Kingdom" + ] + ], + "domain" => [ + "display" => "Results per domain", + "option" => [ + "1" => "1 result", + "2" => "2 results", + "3" => "3 results", + "4" => "4 results", + "5" => "5 results", + "10" => "10 results", + "0" => "Unlimited", + ] + ] + ]; + break; + + case "news": + return []; + } + } + + private function get($url, $get = []){ + + $headers = [ + "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + "Connection: keep-alive", + 
"Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1" + ]; + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + return $data; + } + + public function web($get){ + + if($get["npt"]){ + + $token = $this->nextpage->get($get["npt"], "web"); + + try{ + $html = + $this->get( + "https://www.mojeek.com" . $token, + [] + ); + }catch(Exception $error){ + + throw new Exception("Failed to get HTML"); + } + + }else{ + $search = $get["s"]; + $lang = $get["lang"]; + $country = $get["country"]; + $region = $get["region"]; + $domain = $get["domain"]; + $focus = $get["focus"]; + + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $params = [ + "q" => $search, + "t" => 20, // number of results/page + "tn" => 7, // number of news results/page + "date" => 1, // show date + "tlen" => 128, // max length of title + "dlen" => 511, // max length of description + "arc" => ($country == "any" ? "none" : $country) // location. don't use autodetect! 
+ ]; + + switch($focus){ + + case "any": break; + + case "blogs": + $params["fmt"] = "sst"; + $params["sst"] = "1"; + break; + + default: + $params["foc_t"] = $focus; + break; + } + + if($lang != "any"){ + + $params["lb"] = $lang; + } + + if($region != "any"){ + + $params["reg"] = $region; + } + + if($domain != "1"){ + + $params["si"] = $domain; + } + + try{ + $html = + $this->get( + "https://www.mojeek.com/search", + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to get HTML"); + } + /* + $handle = fopen("scraper/mojeek.html", "r"); + $html = fread($handle, filesize("scraper/mojeek.html")); + fclose($handle);*/ + + } + + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + $this->fuckhtml->load($html); + + $results = + $this->fuckhtml + ->getElementsByClassName("results-standard", "ul"); + + if(count($results) === 0){ + + return $out; + } + + $this->fuckhtml->load($results[0]); + + /* + Get search results + */ + $results = + $this->fuckhtml + ->getElementsByTagName("li"); + + foreach($results as $result){ + + $data = [ + "title" => null, + "description" => null, + "url" => null, + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + + $this->fuckhtml->load($result); + + $title = + $this->fuckhtml + ->getElementsByClassName("title", "a")[0]; + + $data["title"] = + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $title["innerHTML"] + ) + ); + + $data["url"] = + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $title["attributes"]["href"] + ) + ); + + $description = + $this->fuckhtml + ->getElementsByClassName( + "s", "p" + ); + + if(count($description) !== 0){ + + $data["description"] = + $this->titledots( + html_entity_decode( + $this->fuckhtml + 
->getTextContent( + $description[0] + ) + ) + ); + } + + $data["date"] = + explode( + " - ", + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName("i", "p")[1] + ) + ); + + $data["date"] = + strtotime( + $data["date"][count($data["date"]) - 1] + ); + + $out["web"][] = $data; + } + + /* + Get instant answers + */ + $this->fuckhtml->load($html); + + $infoboxes = + $this->fuckhtml + ->getElementsByClassName( + "infobox infobox-top", + "div" + ); + + foreach($infoboxes as $infobox){ + + $answer = [ + "title" => null, + "description" => [], + "url" => null, + "thumb" => null, + "table" => [], + "sublink" => [] + ]; + + // load first part with title + short definition + $infobox_html = + explode( + "<hr>", + $infobox["innerHTML"] + ); + + $this->fuckhtml->load($infobox_html[0]); + + // title + $answer["title"] = + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByTagName("h1")[0] + ); + + // short definition + $definition = + $this->fuckhtml + ->getElementsByTagName( + "p" + ); + + if(count($definition) !== 0){ + + $answer["description"][] = [ + "type" => "quote", + "value" => + $this->fuckhtml + ->getTextContent( + $definition[0] + ) + ]; + } + + // get thumbnail, if it exists + $this->fuckhtml->load($infobox_html[1]); + + $thumb = + $this->fuckhtml + ->getElementsByClassName("float-right", "img"); + + if(count($thumb) !== 0){ + + preg_match( + '/\/image\?img=([^&]+)/i', + $thumb[0]["attributes"]["src"], + $thumb + ); + + if(count($thumb) === 2){ + + $answer["thumb"] = + $this->fuckhtml + ->getTextContent( + $thumb[1] + ); + } + } + + // get description + $ps = + $this->fuckhtml + ->getElementsByTagName("p"); + + $first_tag = true; + foreach($ps as $p){ + + $this->fuckhtml->load($p); + + if( + preg_match( + '/^\s*<strong>/i', + $p["innerHTML"] + ) + ){ + + /* + Parse table + */ + + $strong = + $this->fuckhtml + ->getElementsByTagName("strong")[0]; + + $p["innerHTML"] = + str_replace($strong["innerHTML"], "", 
$p["innerHTML"]); + + $strong = + preg_replace( + '/:$/', + "", + ucfirst( + $this->fuckhtml + ->getTextContent( + $strong + ) + ) + ); + + $answer["table"][trim($strong)] = + trim( + $this->fuckhtml + ->getTextContent( + $p + ) + ); + + continue; + } + + $as = + $this->fuckhtml + ->getElementsByClassName("svg-icon"); + + if(count($as) !== 0){ + + /* + Parse websites + */ + foreach($as as $a){ + + $answer["sublink"][ + ucfirst(explode(" ", $a["attributes"]["class"], 2)[1]) + ] = + $this->fuckhtml + ->getTextContent( + $a["attributes"]["href"] + ); + } + + continue; + } + + /* + Parse text content + */ + $tags = + $this->fuckhtml + ->getElementsByTagName("*"); + + $i = 0; + foreach($tags as $tag){ + + $c = count($answer["description"]); + + // remove tag from innerHTML + $p["innerHTML"] = + explode($tag["outerHTML"], $p["innerHTML"], 2); + + if(count($p["innerHTML"]) === 2){ + + if( + $i === 0 && + $c !== 0 && + $answer["description"][$c - 1]["type"] == "link" + ){ + + $append = "\n\n"; + }else{ + + $append = ""; + } + + if($p["innerHTML"][0] != ""){ + $answer["description"][] = [ + "type" => "text", + "value" => $append . 
trim($p["innerHTML"][0]) + ]; + } + + $p["innerHTML"] = $p["innerHTML"][1]; + }else{ + + $p["innerHTML"] = $p["innerHTML"][0]; + } + + switch($tag["tagName"]){ + + case "a": + + $value = + $this->fuckhtml + ->getTextContent( + $tag + ); + + if(strtolower($value) == "wikipedia"){ + + if($c !== 0){ + $answer["description"][$c - 1]["value"] = + rtrim($answer["description"][$c - 1]["value"]); + } + break; + } + + $answer["description"][] = [ + "type" => "link", + "url" => + $this->fuckhtml + ->getTextContent( + $tag["attributes"]["href"] + ), + "value" => + $this->fuckhtml + ->getTextContent( + $tag + ) + ]; + break; + } + + $i++; + } + } + + // get URL + $this->fuckhtml->load($infobox_html[2]); + + $answer["url"] = + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByTagName( + "a" + )[0] + ["attributes"] + ["href"] + ); + + // append answer + $out["answer"][] = $answer; + } + + /* + Get news + */ + $this->fuckhtml->load($html); + + $news = + $this->fuckhtml + ->getElementsByClassName( + "results news-results", + "div" + ); + + if(count($news) !== 0){ + + $this->fuckhtml->load($news[0]); + + $lis = + $this->fuckhtml + ->getElementsByTagName("li"); + + foreach($lis as $li){ + + $this->fuckhtml->load($li); + + $a = + $this->fuckhtml + ->getElementsByClassName( + "ob", + "a" + ); + + if(count($a) === 0){ + + continue; + } + + $a = $a[0]; + + $out["news"][] = [ + "title" => + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $a + ) + ), + "description" => null, + "date" => + strtotime( + explode( + " - ", + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByTagName( + "span" + )[0] + ), + 2 + )[1] + ), + "thumb" => [ + "url" => null, + "ratio" => null + ], + "url" => + $this->fuckhtml + ->getTextContent( + $a["attributes"]["href"] + ) + ]; + } + } + + /* + Get next page + */ + $this->fuckhtml->load($html); + + $pagination = + $this->fuckhtml + ->getElementsByClassName("pagination"); + + if(count($pagination) !== false){ + 
+ $this->fuckhtml->load($pagination[0]); + $as = + $this->fuckhtml + ->getElementsByTagName("a"); + + foreach($as as $a){ + + if($a["innerHTML"] == "Next"){ + + $out["npt"] = $this->nextpage->store( + $this->fuckhtml + ->getTextContent( + $a["attributes"]["href"] + ), + "web" + ); + } + } + } + + return $out; + } + + public function news($get){ + + $search = $get["s"]; + + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $out = [ + "status" => "ok", + "npt" => null, + "news" => [] + ]; + + try{ + $html = + $this->get( + "https://www.mojeek.com/search", + [ + "q" => $search, + "fmt" => "news" + ] + ); + }catch(Exception $error){ + + throw new Exception("Failed to get HTML"); + } + + /* + $handle = fopen("scraper/mojeek.html", "r"); + $html = fread($handle, filesize("scraper/mojeek.html")); + fclose($handle);*/ + + /* + Get big, standard and smaller nodes + */ + foreach( + [ + "results-extended", + "results-standard" + ] + as $categoryname + ){ + + $this->fuckhtml->load($html); + + $categories = + $this->fuckhtml + ->getElementsByClassName( + $categoryname, + "ul" + ); + + foreach($categories as $category){ + + $this->fuckhtml->load($category); + + $nodes = + $this->fuckhtml + ->getElementsByTagName("li"); + + foreach($nodes as $node){ + + $data = [ + "title" => null, + "author" => null, + "description" => null, + "date" => null, + "thumb" => + [ + "url" => null, + "ratio" => null + ], + "url" => null + ]; + + /* + Parse the results + */ + $this->fuckhtml->load($node); + + // get title + url + $a = + $this->fuckhtml + ->getElementsByTagName("a")[0]; + + $data["title"] = + $this->fuckhtml + ->getTextContent( + $a["attributes"]["title"] + ); + + $data["url"] = + $this->fuckhtml + ->getTextContent( + $a["attributes"]["href"] + ); + + // get image + $image = + $this->fuckhtml + ->getElementsByTagName("img"); + + if(count($image) !== 0){ + + $data["thumb"] = [ + "url" => + urldecode( + str_replace( + "/image?img=", + "", + 
$this->fuckhtml + ->getTextContent( + $image[0]["attributes"]["src"] + ) + ) + ), + "ratio" => "16:9" + ]; + } + + // get description + $description = + $this->fuckhtml + ->getElementsByClassName("s", "p"); + + if(count($description) !== 0){ + + $data["description"] = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $description[0] + ) + ); + } + + // get date + time + $date = + $this->fuckhtml + ->getElementsByClassName( + "date", + "p" + ); + + $i = + $this->fuckhtml + ->getElementsByClassName("i", "p"); + + if(count($date) !== 0){ + + // we're inside a big node + $data["date"] = strtotime($date[0]["innerHTML"]); + + if(count($i) !== 0){ + + $this->fuckhtml->load($i[0]); + + $a = + $this->fuckhtml + ->getElementsByTagName("a"); + + if(count($a) !== 0){ + + $data["author"] = + $this->fuckhtml + ->getTextContent($a[0]); + } + } + }else{ + + // we're inside a small node + if(count($i) !== 0){ + + $i = + explode( + " - ", + $this->fuckhtml + ->getTextContent($i[0]) + ); + + $data["date"] = strtotime(array_pop($i)); + $data["author"] = implode(" - ", $i); + } + } + + $out["news"][] = $data; + } + } + } + + return $out; + } + + private function titledots($title){ + + return trim($title, ". 
\t\n\r\0\x0B"); + } +} + diff --git a/scraper/wiby.php b/scraper/wiby.php new file mode 100644 index 0000000..a1daf57 --- /dev/null +++ b/scraper/wiby.php @@ -0,0 +1,244 @@ +<?php + +class wiby{ + + public function __construct(){ + + include "lib/nextpage.php"; + $this->nextpage = new nextpage("wiby"); + } + + public function getfilters($page){ + + if($page != "web"){ + + return []; + } + + return [ + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", + "no" => "No" + ] + ], + "date" => [ + "display" => "Time posted", + "option" => [ + "any" => "Any time", + "day" => "Past day", + "week" => "Past week", + "month" => "Past month", + "year" => "Past year", + ] + ] + ]; + } + + private function get($url, $get = [], $nsfw){ + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "Cookie: ws={$nsfw}", + "DNT: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1"] + ); + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + return $data; + } + + public function web($get){ + + if($get["npt"]){ + + $q = + 
json_decode( + $this->nextpage->get($get["npt"], "web"), + true + ); + + $nsfw = $q["nsfw"]; + unset($q["nsfw"]); + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $date = $get["date"]; + $nsfw = $get["nsfw"] == "yes" ? "0" : "1"; + + $search = + str_replace( + [ + "!g", + "!gi", + "!gv", + "!gm", + "!b", + "!bi", + "!bv", + "!bm", + "!td", + "!tw", + "!tm", + "!ty", + "&g", + "&gi", + "&gv", + "&gm", + "&b", + "&bi", + "&bv", + "&bm", + "&td", + "&tw", + "&tm", + "&ty", + ], + "", + $search + ); + + switch($date){ + + case "day": $search = "!td " . $search; break; + case "week": $search = "!tw " . $search; break; + case "month": $search = "!tm " . $search; break; + case "year": $search = "!ty " . $search; break; + } + + $q = [ + "q" => $search + ]; + } + + try{ + $html = $this->get( + "https://wiby.me/", + $q, + $nsfw + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + + preg_match( + '/<p class="pin"><blockquote>(?:<\/p>)?<br><a class="more" href="\/\?q=[^"]+&p=([0-9]+)">Find more\.\.\.<\/a><\/blockquote>/', + $html, + $nextpage + ); + + if(count($nextpage) === 0){ + + $nextpage = null; + }else{ + + $nextpage = + $this->nextpage->store( + json_encode([ + "q" => $q["q"], + "p" => (int)$nextpage[1], + "nsfw" => $nsfw + ]), + "web" + ); + } + + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => $nextpage, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + preg_match_all( + '/<blockquote>[\s]*<a .* href="(.*)">(.*)<\/a>.*<p>(.*)<\/p>[\s]*<\/blockquote>/Ui', + $html, + $links + ); + + for($i=0; $i<count($links[0]); $i++){ + + $out["web"][] = [ + "title" => $this->unescapehtml(trim($links[2][$i])), + "description" => $this->unescapehtml(trim(strip_tags($links[3][$i]))), + "url" => trim($links[1][$i]), + "date" => 
null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + } + + return $out; + } + + private function unescapehtml($str){ + + return html_entity_decode( + str_replace( + [ + "<br>", + "<br/>", + "</br>", + "<BR>", + "<BR/>", + "</BR>", + ], + "\n", + $str + ), + ENT_QUOTES | ENT_XML1, 'UTF-8' + ); + } +} diff --git a/scraper/yandex.php b/scraper/yandex.php new file mode 100644 index 0000000..437c8aa --- /dev/null +++ b/scraper/yandex.php @@ -0,0 +1,530 @@ +<?php + +class yandex{ + + /* + curl functions + */ + public function __construct(){ + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); + + include "lib/nextpage.php"; + $this->nextpage = new nextpage("yandex"); + } + + private function get($url, $get = [], $nsfw){ + + $curlproc = curl_init(); + + $search = $get["text"]; + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + switch($nsfw){ + case "yes": $nsfw = "0"; break; + case "maybe": $nsfw = "1"; break; + case "no": $nsfw = "2"; break; + } + + $headers = + ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0", + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Encoding: gzip", + "Accept-Language: en-US,en;q=0.5", + "DNT: 1", + "Cookie: yp=1716337604.sp.family%3A{$nsfw}#1685406411.szm.1:1920x1080:1920x999", + "Referer: https://yandex.com/images/search?text={$search}", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: cross-site", + "Upgrade-Insecure-Requests: 1"]; + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + 
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + return $data; + } + + public function getfilters($pagetype){ + + switch($pagetype){ + + case "images": + return + [ + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", + "maybe" => "Maybe", + "no" => "No" + ] + ], + "time" => [ + "display" => "Time posted", + "option" => [ + "any" => "Any time", + "week" => "Last week" + ] + ], + "size" => [ + "display" => "Size", + "option" => [ + "any" => "Any size", + "small" => "Small", + "medium" => "Medium", + "large" => "Large", + "wallpaper" => "Wallpaper" + ] + ], + "color" => [ + "display" => "Colors", + "option" => [ + "any" => "All colors", + "color" => "Color images only", + "gray" => "Black and white", + "red" => "Red", + "orange" => "Orange", + "yellow" => "Yellow", + "cyan" => "Cyan", + "green" => "Green", + "blue" => "Blue", + "violet" => "Purple", + "white" => "White", + "black" => "Black" + ] + ], + "type" => [ + "display" => "Type", + "option" => [ + "any" => "All types", + "photo" => "Photos", + "clipart" => "White background", + "lineart" => "Drawings and sketches", + "face" => "People", + "demotivator" => "Demotivators" + ] + ], + "layout" => [ + "display" => "Layout", + "option" => [ + "any" => "All layouts", + "horizontal" => "Horizontal", + "vertical" => "Vertical", + "square" => "Square" + ] + ], + "format" => [ + "display" => "Format", + "option" => [ + "any" => "Any format", + "jpeg" => "JPEG", + "png" => "PNG", + "gif" => "GIF" + ] + ] + ]; + break; + + default: + return []; + break; + } + } + + public function image($get){ + + if($get["npt"]){ + + $request = + json_decode( + $this->nextpage->get( + $get["npt"], + "images" + ), + true + ); + + $nsfw = $request["nsfw"]; + 
unset($request["nsfw"]); + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $nsfw = $get["nsfw"]; + $time = $get["time"]; + $size = $get["size"]; + $color = $get["color"]; + $type = $get["type"]; + $layout = $get["layout"]; + $format = $get["format"]; + /* + $handle = fopen("scraper/yandex.json", "r"); + $json = fread($handle, filesize("scraper/yandex.json")); + fclose($handle);*/ + + // SIZE + // large + // 227.0=1;203.0=1;76fe94.0=1;41d251.0=1;75.0=1;371.0=1;291.0=1;307.0=1;f797ee.0=1;1cf7c2.0=1;deca32.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&isize=large&suggest_reqid=486139416166165501540886508227485&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // medium + // 227.0=1;203.0=1;76fe94.0=1;41d251.0=1;75.0=1;371.0=1;291.0=1;307.0=1;f797ee.0=1;1cf7c2.0=1;deca32.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&isize=medium&suggest_reqid=486139416166165501540886508227485&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // small + // 227.0=1;203.0=1;76fe94.0=1;41d251.0=1;75.0=1;371.0=1;291.0=1;307.0=1;f797ee.0=1;1cf7c2.0=1;deca32.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&isize=small&suggest_reqid=486139416166165501540886508227485&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // ORIENTATION + // Horizontal + // 227.0=1;203.0=1;76fe94.0=1;41d251.0=1;75.0=1;371.0=1;291.0=1;307.0=1;f797ee.0=1;1cf7c2.0=1;deca32.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&iorient=horizontal&suggest_reqid=486139416166165501540886508227485&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Vertical + // 
227.0=1;203.0=1;76fe94.0=1;41d251.0=1;75.0=1;371.0=1;291.0=1;307.0=1;f797ee.0=1;1cf7c2.0=1;deca32.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&iorient=vertical&suggest_reqid=486139416166165501540886508227485&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Square + // 227.0=1;203.0=1;76fe94.0=1;41d251.0=1;75.0=1;371.0=1;291.0=1;307.0=1;f797ee.0=1;1cf7c2.0=1;deca32.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&iorient=square&suggest_reqid=486139416166165501540886508227485&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // TYPE + // Photos + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&text=minecraft&type=photo&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // White background + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&text=minecraft&type=clipart&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Drawings and sketches + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&text=minecraft&type=lineart&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // People + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&text=minecraft&type=face&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Demotivators + // 
307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&text=minecraft&type=demotivator&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // COLOR + // Color images only + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&icolor=color&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Black and white + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&icolor=gray&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Red + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&icolor=red&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Orange + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&icolor=orange&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Yellow + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&icolor=yellow&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Cyan + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&icolor=cyan&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Green + // 
307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&icolor=green&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Blue + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&icolor=blue&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Purple + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&icolor=violet&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // White + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&icolor=white&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Black + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&icolor=black&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // FORMAT + // jpeg + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&itype=jpg&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // png + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&itype=png&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // gif + // 
307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&itype=gifan&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // RECENT + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&recent=7D&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // WALLPAPER + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&isize=wallpaper&text=minecraft&wp=wh16x9_1920x1080&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + + $request = [ + "format" => "json", + "request" => [ + "blocks" => [ + [ + "block" => "extra-content", + "params" => (object)[], + "version" => 2 + ], + [ + "block" => "i-global__params:ajax", + "params" => (object)[], + "version" => 2 + ], + [ + "block" => "search2:ajax", + "params" => (object)[], + "version" => 2 + ], + [ + "block" => "preview__isWallpaper", + "params" => (object)[], + "version" => 2 + ], + [ + "block" => "content_type_search", + "params" => (object)[], + "version" => 2 + ], + [ + "block" => "serp-controller", + "params" => (object)[], + "version" => 2 + ], + [ + "block" => "cookies_ajax", + "params" => (object)[], + "version" => 2 + ], + [ + "block" => "advanced-search-block", + "params" => (object)[], + "version" => 2 + ] + ], + "metadata" => [ + "bundles" => [ + "lb" => "AS?(E<X120" + ], + "assets" => [ + // las base + "las" => "justifier-height=1;justifier-setheight=1;fitimages-height=1;justifier-fitincuts=1;react-with-dom=1;" + + // las default + //"las" => "justifier-height=1;justifier-setheight=1;fitimages-height=1;justifier-fitincuts=1;react-with-dom=1;227.0=1;203.0=1;76fe94.0=1;215f96.0=1;75.0=1" + ], + 
"extraContent" => [ + "names" => [ + "i-react-ajax-adapter" + ] + ] + ] + ] + ]; + + /* + Apply filters + */ + if($time == "week"){ + $request["recent"] = "7D"; + } + + if($size != "any"){ + + $request["isize"] = $size; + } + + if($type != "any"){ + + $request["type"] = $type; + } + + if($color != "any"){ + + $request["icolor"] = $color; + } + + if($layout != "any"){ + + $request["iorient"] = $layout; + } + + if($format != "any"){ + + $request["itype"] = ["jpeg" => "jpg", "gif" => "gifan"][$format] ?? $format; // option keys differ from Yandex's values: the captured FORMAT requests above use itype=jpg and itype=gifan + } + + $request["text"] = $search; + $request["uinfo"] = "sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080"; + + $request["request"] = json_encode($request["request"]); // the block list travels as one JSON-encoded query parameter + } + + try{ + $json = $this->get( + "https://yandex.com/images/search", + $request, + $nsfw + ); + }catch(Exception $err){ + + throw new Exception("Failed to get JSON"); + } + /* + $handle = fopen("scraper/yandex.json", "r"); + $json = fread($handle, filesize("scraper/yandex.json")); + fclose($handle);*/ + + $json = json_decode($json, true); + + if( + isset($json["type"]) && + $json["type"] == "captcha" + ){ + + throw new Exception("Yandex blocked this 4get instance. Yandex blocks don't last very long, but the block timer gets reset everytime you make another unsuccessful request. 
Please try again in ~7 minutes."); + } + + if($json === null){ + + throw new Exception("Failed to decode JSON"); + } + + // get html + $html = ""; + foreach($json["blocks"] as $block){ + + $html .= $block["html"]; + } + + $this->fuckhtml->load($html); + $div = $this->fuckhtml->getElementsByTagName("div"); + + $out = [ + "status" => "ok", + "npt" => null, + "image" => [] + ]; + + // check for next page + if( + count( + $this->fuckhtml + ->getElementsByClassName( + "more more_direction_next", + $div + ) + ) !== 0 + ){ + + $request["nsfw"] = $nsfw; + + if(isset($request["p"])){ + + $request["p"]++; + }else{ + + $request["p"] = 1; + } + + $out["npt"] = $this->nextpage->store(json_encode($request), "images"); + } + + // get search results + foreach( + $this->fuckhtml + ->getElementsByClassName( + "serp-item serp-item_type_search", + $div + ) + as $image + ){ + + $image = + json_decode( + $image + ["attributes"] + ["data-bem"], + true + )["serp-item"]; + + $title = [html_entity_decode($image["snippet"]["title"], ENT_QUOTES | ENT_HTML5)]; + + if(isset($image["snippet"]["text"])){ + + $title[] = html_entity_decode($image["snippet"]["text"], ENT_QUOTES | ENT_HTML5); + } + + $tmp = [ + "title" => + $this->fuckhtml + ->getTextContent( + $this->titledots( + implode(": ", $title) + ) + ), + "source" => [], + "url" => htmlspecialchars_decode($image["snippet"]["url"]) + ]; + + foreach($image["dups"] as $dup){ + + $tmp["source"][] = [ + "url" => htmlspecialchars_decode($dup["url"]), + "width" => (int)$dup["w"], + "height" => (int)$dup["h"], + ]; + } + + $tmp["source"][] = [ + "url" => + preg_replace( + '/^\/\//', + "https://", + htmlspecialchars_decode($image["thumb"]["url"]) + ), + "width" => (int)$image["thumb"]["size"]["width"], + "height" => (int)$image["thumb"]["size"]["height"] + ]; + + $out["image"][] = $tmp; + } + + return $out; + } + + private function titledots($title){ + + $substr = substr($title, -3); + + if( + $substr == "..." 
|| + $substr == "…" + ){ + + return trim(substr($title, 0, -3)); + } + + return trim($title); + } +} diff --git a/scraper/youtube.php b/scraper/youtube.php new file mode 100644 index 0000000..83a68ba --- /dev/null +++ b/scraper/youtube.php @@ -0,0 +1,1723 @@ +<?php + +//$yt = new youtube(); +//header("Content-Type: application/json"); +//echo json_encode($yt->video("minecraft", null, "today", "any", "any", "live", "relevance")); + +class youtube{ + + public function __construct(){ + + include "lib/nextpage.php"; + $this->nextpage = new nextpage("yt"); + } + + public function getfilters($page){ // returns the filter definitions for a page type; only "videos" has filters + + if($page != "videos"){ + + return []; + } + + return [ + "date" => [ + "display" => "Time posted", + "option" => [ + "any" => "Any time", + "hour" => "Last hour", + "today" => "Today", + "week" => "This week", + "month" => "This month", + "year" => "This year" + ] + ], + "type" => [ + "display" => "Type", + "option" => [ + "video" => "Video", + "channel" => "Channel", + "playlist" => "Playlist", + "movie" => "Movie" // key is lowercase because ytfilter() switches on "movie" + ] + ], + "duration" => [ + "display" => "Duration", + "option" => [ + "any" => "Any duration", + "short" => "Short (<4min)", + "medium" => "Medium (4-20min)", + "long" => "Long (>20min)" + ] + ], + "feature" => [ + "display" => "Feature", + "option" => [ + "any" => "No features", + "live" => "Live", + "4k" => "4K", + "hd" => "HD", + "subtitles" => "Subtitles/CC", + "creativecommons" => "Creative Commons", + "360" => "VR 360°", + "vr180" => "VR 180°", + "3d" => "3D", + "hdr" => "HDR" + ] + ], + "sort" => [ + "display" => "Sort by", + "option" => [ + "relevance" => "Relevance", + "upload_date" => "Upload date", + "view_count" => "View count", + "rating" => "Rating" + ] + ] + ]; + } + + private function ytfilter($date, $type, $duration, $feature, $sort){ + + // ------------ + // INCOMPATIBLE FILTERS + // channel,playlist DURATION, FEATURES, SORT BY + // Movie Features=[live, subtitles, creative commons, 3d] + + // live, 3D + // Type[channel, playlist, movie] + 
+ // UPLOAD DATE, DURATION, 4k, 360, VR180, HDR + // Type[channel, playlist] + + // ----------- + + // MUST BE TOGETHER + // Relevance,upload date Type=Video + + switch($type){ + + case "channel": + case "playlist": + if($duration != "any"){ $duration = "any"; } + if($feature != "any"){ $feature = "any"; } + if($sort != "any"){ $sort = "any"; } + break; + + case "movie": + if( + in_array( + $feature, + [ + "live", + "subtitles", + "creativecommons", // same spelling as the getfilters() option key, otherwise this never matches + "3d" + ], + ) + ){ + + $feature = "any"; + } + break; + } + + switch($feature){ + + case "live": + case "3d": + if( + in_array( + $type, + [ + "channel", + "playlist", + "movie" + ], + ) + ){ + + $type = "video"; + } + break; + } + + if( + ( + $date != "any" || + $duration != "any" || + $feature == "4k" || + $feature == "360" || + $feature == "vr180" || + $feature == "hdr" + ) && + ( + $type == "channel" || + $type == "playlist" + ) + ){ + + $type = "video"; + } + + if( + $date == "any" && + $type == "video" && + $duration == "any" && + $feature == "any" && + $sort == "relevance" + ){ + + return null; // every filter is at its default, so no filter parameter is needed + } + + //print_r([$date, $type, $duration, $feature, $sort]); + + /* + Encode hex data + */ + + // UPLOAD DATE + // hour EgQIARAB 12 04 08 01 10 01 + // today EgQIAhAB 12 04 08 02 10 01 + // week EgQIAxAB 12 04 08 03 10 01 + // month EgQIBBAB 12 04 08 04 10 01 + // year EgQIBRAB 12 04 08 05 10 01 + + // TYPE + // video EgIQAQ%253D%253D 12 02 10 01 + // channel EgIQAg%253D%253D 12 02 10 02 + // playlist EgIQAw%253D%253D 12 02 10 03 + // movie EgIQBA%253D%253D 12 02 10 04 + + // DURATION + // -4min EgIYAQ%253D%253D 12 02 18 01 + // 4-20min EgIYAw%253D%253D 12 02 18 03 + // 20+min EgIYAg%253D%253D 12 02 18 02 + + // FEATURE + // live EgJAAQ%253D%253D 12 02 40 01 + // 4K EgJwAQ%253D%253D 12 02 70 01 + // HD EgIgAQ%253D%253D 12 02 20 01 + // Subtitles/CC EgIoAQ%253D%253D 12 02 28 01 + // Creative Commons EgIwAQ%253D%253D 12 02 30 01 + // 360 EgJ4AQ%253D%253D 12 02 78 01 + // VR180 EgPQAQE%253D 12 03 d0 01 01 + // 3D 
EgI4AQ%253D%253D 12 02 38 01 + // HDR EgPIAQE%253D 12 03 c8 01 01 + // (location & purchased unused) + + // SORT BY + // Relevance CAASAhAB 08 00 12 02 10 01 (is nothing by default) + // Upload date CAI%253D 08 02 + // View count CAM%253D 08 03 + // Rating CAE%253D 08 01 + + // video + // 12 02 10 01 + + // under 4 minutes + // 12 02 18 01 + + // video + under 4 minutes + // 12 04 10 01 18 01 + + // video + under 4 minutes + HD + // 08 00 12 06 10 01 18 01 20 01 + + // video + under 4 minutes + upload date + // 08 02 12 04 10 01 18 01 + + // video + under 4 minutes + HD + upload date + // 08 02 12 06 10 01 18 01 20 01 + + // this year + video + under 4 minutes + HD + upload date + // 08 02 12 08 08 05 10 01 18 01 20 01 + + // this week + video + over 20 minutes + HD + view count + // 08 03 12 08 08 03 10 01 18 02 20 01 + + //echo urlencode(urlencode(base64_encode(hex2bin($str)))); + //echo bin2hex(base64_decode(urldecode(urldecode("CAI%253D")))); + + // week + video + 20min + rating + // 08 01 12 06 08 03 10 01 18 02 + + // week + video + 20min + live + rating + // 08 01 12 08 08 03 10 01 18 02 40 01 + + // live 12 02 40 01 + + $hex = null; + if( + $date == "any" && + $type == "video" && + $duration == "any" && + $feature == "any" && + $sort == "relevance" + ){ + + return $hex; + } + + $opcode = 0; + + if($date != "any"){ $opcode += 2; } + if($type != "any"){ $opcode += 2; } + if($duration != "any"){ $opcode += 2; } + + switch($feature){ + + case "live": + case "4k": + case "hd": + case "subtitles": + case "creativecommons": + case "360": + case "3d": + $opcode += 2; + break; + + case "hdr": + case "vr180": + $opcode += 3; + break; + } + + switch($sort){ + + case "relevance": $hex .= "0800"; break; + case "upload_date": $hex .= "0802"; break; + case "view_count": $hex .= "0803"; break; + case "rating": $hex .= "0801"; break; + } + + $hex .= "12" . 
"0".$opcode; + + switch($date){ + + case "hour": $hex .= "0801"; break; + case "today": $hex .= "0802"; break; + case "week": $hex .= "0803"; break; + case "month": $hex .= "0804"; break; + case "year": $hex .= "0805"; break; + } + + switch($type){ + + case "video": $hex .= "1001"; break; + case "channel": $hex .= "1002"; break; + case "playlist": $hex .= "1003"; break; + case "movie": $hex .= "1004"; break; + } + + switch($duration){ + + case "short": $hex .= "1801"; break; + case "medium": $hex .= "1803"; break; + case "long": $hex .= "1802"; break; + } + + switch($feature){ + + case "live": $hex .= "4001"; break; + case "4k": $hex .= "7001"; break; + case "hd": $hex .= "2001"; break; + case "subtitles": $hex .= "2801"; break; + case "creativecommons": $hex .= "3001"; break; + case "360": $hex .= "7801"; break; + case "vr180": $hex .= "d00101"; break; + case "3d": $hex .= "3801"; break; + case "hdr": $hex .= "c80101"; break; + } + + //echo $hex . "\n\n"; + return urlencode(base64_encode(hex2bin($hex))); + } + + // me reading youtube's json + // https://imgur.com/X9hVlFX + + const req_web = 0; + const req_xhr = 1; + + private function get($url, $get = [], $reqtype = self::req_web, $continuation = null){ + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . 
$get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + switch($reqtype){ + case self::req_web: + $headers = + ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "Cookie: PREF=tz=America.New_York", + "DNT: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1"]; + break; + + case self::req_xhr: + $headers = + ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:110.0) Gecko/20100101 Firefox/110.0", + "Accept: */*", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "Cookie: PREF=tz=America.New_York", + "Referer: https://www.youtube.com/", // a real YouTube origin, in line with the Sec-Fetch-Site: same-origin header below + "Content-Type: application/json", + "Content-Length: " . strlen($continuation), + "DNT: 1", + "Connection: keep-alive", + "Sec-Fetch-Dest: empty", + "Sec-Fetch-Mode: same-origin", + "Sec-Fetch-Site: same-origin"]; + + curl_setopt($curlproc, CURLOPT_POST, true); // XHR requests POST $continuation as the request body + curl_setopt($curlproc, CURLOPT_POSTFIELDS, $continuation); + break; + } + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + return $data; + } + + public function video($get){ + + $this->out = [ + "status" => "ok", + "npt" => null, + "video" => [], + "author" => [], + "livestream" => [], + "playlist" => [], + "reel" => [] + ]; + + 
if($get["npt"]){ + + // parse nextPage + // https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false + /* + $handle = fopen("nextpage.json", "r"); + $json = fread($handle, filesize("nextpage.json")); + fclose($handle);*/ + + $npt = + json_decode( + $this->nextpage->get( + $get["npt"], + "videos" + ), + true + ); + + try{ + $json = $this->get( + "https://www.youtube.com/youtubei/v1/search", + [ + "key" => $npt["key"], + "prettyPrint" => "false" + ], + self::req_xhr, + json_encode($npt["post"]) + ); + }catch(Exception $error){ + + throw new Exception("Could not fetch results page"); + } + + $json = json_decode($json); + + foreach( + $json + ->onResponseReceivedCommands[0] + ->appendContinuationItemsAction + ->continuationItems[0] + ->itemSectionRenderer + ->contents + as $video + ){ + + $this->parsevideoobject($video); + } + + if( + !isset( + $json + ->onResponseReceivedCommands[0] + ->appendContinuationItemsAction + ->continuationItems[1] + ->continuationItemRenderer + ->continuationEndpoint + ->continuationCommand + ->token + ) + ){ + + $npt = null; + + }else{ + // prepare nextpage for later.. 
+ $npt["post"]["continuation"] = + $json + ->onResponseReceivedCommands[0] + ->appendContinuationItemsAction + ->continuationItems[1] + ->continuationItemRenderer + ->continuationEndpoint + ->continuationCommand + ->token; + } + + $this->out["npt"] = $npt; + + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $date = $get["date"]; + $type = $get["type"]; + $duration = $get["duration"]; + $feature = $get["feature"]; + $sort = $get["sort"]; + + // parse ytInitialData + + $get = [ + "search_query" => $search + ]; + + if( + ( + $filter = + $this->ytfilter( + $date, + $type, + $duration, + $feature, + $sort + ) + ) !== null + ){ + + $get["sp"] = $filter; + } + + try{ + $json = $this->get( + "https://www.youtube.com/results", + $get + ); + }catch(Exception $error){ + + throw new Exception("Could not fetch results page"); + } + /* + $handle = fopen("test.html", "r"); + $json = fread($handle, filesize("test.html")); + fclose($handle); + */ + if( + !preg_match( + '/ytcfg\.set\(({".*})\); *window\.ytcfg/', + $json, + $ytconfig + ) + ){ + + throw new Exception("Could not get ytcfg"); + } + + $ytconfig = json_decode($ytconfig[1]); + + if( + !preg_match( + '/ytInitialData *= *({.*});<\/script>/', + $json, + $json + ) + ){ + + throw new Exception("Could not get ytInitialData"); + } + + $json = json_decode($json[1]); + + // generate POST data for nextpage + + $ytconfig->INNERTUBE_CONTEXT->client->screenWidthPoints = 1239; + $ytconfig->INNERTUBE_CONTEXT->client->screenHeightPoints = 999; + $ytconfig->INNERTUBE_CONTEXT->client->screenPixelDensity = 1; + $ytconfig->INNERTUBE_CONTEXT->client->screenDensityFloat = 1; + $ytconfig->INNERTUBE_CONTEXT->client->utcOffsetMinutes = -240; + $ytconfig->INNERTUBE_CONTEXT->request->internalExperimentFlags = []; + $ytconfig->INNERTUBE_CONTEXT->request->consistencyTokenJars = []; + + $ytconfig->INNERTUBE_CONTEXT->client->mainAppWebInfo = [ + "graftUrl" => 
$ytconfig->INNERTUBE_CONTEXT->client->originalUrl, + "webDisplayMode" => "WEB_DISPLAY_MODE_BROWSER", + "isWebNativeShareAvailable" => false + ]; + + $ytconfig->INNERTUBE_CONTEXT->adSignalsInfo = [ + "params" => [ + [ + "key" => "dt", + "value" => (string)$ytconfig->TIME_CREATED_MS + ], + [ + "key" => "flash", + "value" => "0" + ], + [ + "key" => "frm", + "value" => "0" + ], + [ + "key" => "u_tz", + "value" => "-240" + ], + [ + "key" => "u_his", + "value" => "3" + ], + [ + "key" => "u_h", + "value" => "1080" + ], + [ + "key" => "u_w", + "value" => "1920" + ], + [ + "key" => "u_ah", + "value" => "1080" + ], + [ + "key" => "u_cd", + "value" => "24" + ], + [ + "key" => "bc", + "value" => "31" + ], + [ + "key" => "bih", + "value" => "999" + ], + [ + "key" => "biw", + "value" => "1239" + ], + [ + "key" => "brdim", + "value" => "0,0,0,0,1920,0,1920,1061,1239,999" + ], + [ + "key" => "vis", + "value" => "1" + ], + [ + "key" => "wgl", + "value" => "true" + ], + [ + "key" => "ca_type", + "value" => "image" + ] + ] + ]; + + /* + echo json_encode($json); + die();*/ + + // *inhales* + foreach( + $json + ->contents + ->twoColumnSearchResultsRenderer + ->primaryContents + ->sectionListRenderer + ->contents[0] + ->itemSectionRenderer + ->contents + as $video + ){ + + $this->parsevideoobject($video); + } + + // get additional data from secondaryContents + if( + isset( + $json + ->contents + ->twoColumnSearchResultsRenderer + ->secondaryContents + ->secondarySearchContainerRenderer + ->contents[0] + ->universalWatchCardRenderer + ) + ){ + + $video = + $json + ->contents + ->twoColumnSearchResultsRenderer + ->secondaryContents + ->secondarySearchContainerRenderer + ->contents[0] + ->universalWatchCardRenderer; + /* + echo json_encode($video); + die();*/ + + $author = + [ + "name" => + $video + ->header + ->watchCardRichHeaderRenderer + ->title + ->simpleText, + "url" => + "https://www.youtube.com/channel/" . 
+ $video + ->header + ->watchCardRichHeaderRenderer + ->titleNavigationEndpoint + ->browseEndpoint + ->browseId, + "avatar" => null + ]; + + if( + isset( + $video + ->header + ->watchCardRichHeaderRenderer + ->avatar + ->thumbnails[0] + ->url + ) + ){ + + $author["avatar"] = + $video + ->header + ->watchCardRichHeaderRenderer + ->avatar + ->thumbnails[0] + ->url; + } + + // add video in callToAction if present + if( + isset( + $video + ->callToAction + ->watchCardHeroVideoRenderer + ->lengthText + ) + ){ + + array_push( + $this->out["video"], + [ + "title" => + $video + ->callToAction + ->watchCardHeroVideoRenderer + ->title + ->simpleText, + "description" => null, + "author" => $author, + "date" => + $this->textualdate2unix( + trim( + explode( + "•", + $video + ->callToAction + ->watchCardHeroVideoRenderer + ->subtitle + ->simpleText + )[2] + ) + ), + "duration" => + $this->hms2int( + $video + ->callToAction + ->watchCardHeroVideoRenderer + ->lengthText + ->simpleText + ), + "views" => + $this->truncatedcount2int( + trim( + explode( + "•", + $video + ->callToAction + ->watchCardHeroVideoRenderer + ->subtitle + ->simpleText, + 2 + )[1] + ) + ), + "thumb" => [ + "url" => + $video + ->callToAction + ->watchCardHeroVideoRenderer + ->heroImage + ->singleHeroImageRenderer + ->thumbnail + ->thumbnails[0] + ->url, + "ratio" => "16:9" + ], + "url" => + "https://www.youtube.com/watch?v=" . 
+ $video + ->callToAction + ->watchCardHeroVideoRenderer + ->navigationEndpoint + ->watchEndpoint + ->videoId + ] + ); + } + + // get all playlists, ignore videos + $out = null; + + foreach( + $video + ->sections + as $section + ){ + + if( + isset( + $section + ->watchCardSectionSequenceRenderer + ->lists[0] + ->horizontalCardListRenderer + ->cards + ) + ){ + + $out = + $section + ->watchCardSectionSequenceRenderer + ->lists[0] + ->horizontalCardListRenderer + ->cards; + break; + } + } + + if($out !== null){ + + foreach( + $out as $video + ){ + + if( + !isset( + $video + ->searchRefinementCardRenderer + ) + ){ + + continue; + } + + $video = + $video + ->searchRefinementCardRenderer; + + array_push( + $this->out["playlist"], + [ + "title" => + $video + ->query + ->runs[0] + ->text, + "description" => null, + "author" => $author, + "date" => null, + "duration" => null, + "views" => null, + "thumb" => [ + "url" => + $video + ->thumbnail + ->thumbnails[0] + ->url, + "ratio" => "1:1" + ], + "url" => + "https://www.youtube.com" . 
// NOTE(review): everything down to the first `return $this->out;` is the tail of a method that starts above this chunk; its tokens are reproduced unchanged.
									$video
									->searchEndpoint
									->commandMetadata
									->webCommandMetadata
									->url
							]
						);
					}
				}
			}
			
			foreach(
				$json
				->contents
				->twoColumnSearchResultsRenderer
				->primaryContents
				->sectionListRenderer
				->contents
				as $cont
			){
				
				if(isset($cont->continuationItemRenderer)){
					
					$this->out["npt"] = [
						"key" =>
							$ytconfig
							->INNERTUBE_API_KEY,
						"post" => [
							"context" =>
								$ytconfig
								->INNERTUBE_CONTEXT,
							"continuation" =>
								$cont
								->continuationItemRenderer
								->continuationEndpoint
								->continuationCommand
								->token
						]
					];
					break;
				}
			}
		}
		
		if($this->out["npt"] !== null){
			
			$this->out["npt"] = $this->nextpage->store(json_encode($this->out["npt"]), "videos");
		}
		
		return $this->out;
	}
	
	/**
	 * Convert one search-result renderer object into an entry of $this->out.
	 *
	 * Handled renderer keys: videoRenderer, watchCardCompactVideoRenderer,
	 * reelShelfRenderer, channelRenderer, shelfRenderer (recurses into its
	 * vertical list), radioRenderer and playlistRenderer. Objects carrying
	 * none of these keys are ignored.
	 *
	 * @param object $video Decoded JSON object holding exactly one renderer.
	 * @return void Results are appended to $this->out["video"|"livestream"|
	 *              "reel"|"author"|"playlist"].
	 */
	private function parsevideoobject($video){
		
		if(isset($video->videoRenderer)){
			
			$video = $video->videoRenderer;
			
			// description is spread over several text runs
			$description = null;
			
			if(isset($video->detailedMetadataSnippets)){
				
				foreach($video->detailedMetadataSnippets[0]->snippetText->runs as $description_part){
					
					$description .= $description_part->text;
				}
			}
			
			if(
				isset($video->badges[0]->metadataBadgeRenderer->icon->iconType) &&
				$video->badges[0]->metadataBadgeRenderer->icon->iconType == "LIVE"
			){
				
				// livestreams have no publish date or length
				$type = "livestream";
				$date = null;
				$duration = "_LIVE";
				
				$views =
					isset($video->viewCountText->runs[0]->text) ?
					$this->views2int($video->viewCountText->runs[0]->text) :
					null;
			}else{
				
				$type = "video";
				
				$date =
					isset($video->publishedTimeText->simpleText) ?
					$this->textualdate2unix($video->publishedTimeText->simpleText) :
					null;
				
				$duration =
					isset($video->lengthText->simpleText) ?
					$this->hms2int($video->lengthText->simpleText) :
					null;
				
				$views =
					isset($video->viewCountText->simpleText) ?
					$this->views2int($video->viewCountText->simpleText) :
					null;
			}
			
			// shorts are delivered as regular videoRenderer objects; the
			// page type of the navigation endpoint tells them apart
			if(
				isset($video->navigationEndpoint->commandMetadata->webCommandMetadata->webPageType) &&
				$video->navigationEndpoint->commandMetadata->webCommandMetadata->webPageType == "WEB_PAGE_TYPE_SHORTS"
			){
				
				$type = "reel";
			}
			
			// the channel avatar is not guaranteed to be present
			$avatar = null;
			
			if(
				isset(
					$video
					->channelThumbnailSupportedRenderers
					->channelThumbnailWithLinkRenderer
					->thumbnail
					->thumbnails[0]
					->url
				)
			){
				
				$avatar =
					$this->checkhttpspresence(
						$video
						->channelThumbnailSupportedRenderers
						->channelThumbnailWithLinkRenderer
						->thumbnail
						->thumbnails[0]
						->url
					);
			}
			
			$byline = $video->longBylineText->runs[0];
			
			array_push(
				$this->out[$type],
				[
					"title" => $video->title->runs[0]->text,
					"description" => $this->titledots($description),
					"author" => [
						"name" => $byline->text,
						"url" =>
							"https://www.youtube.com/channel/" .
							$byline->navigationEndpoint->browseEndpoint->browseId,
						"avatar" => $avatar
					],
					"date" => $date,
					"duration" => $duration,
					"views" => $views,
					"thumb" => [
						"url" => $video->thumbnail->thumbnails[0]->url,
						"ratio" => "16:9"
					],
					"url" =>
						"https://www.youtube.com/watch?v=" .
						$video->videoId
				]
			);
			
		}elseif(isset($video->watchCardCompactVideoRenderer)){
			
			$video = $video->watchCardCompactVideoRenderer;
			
			// subtitle is "<views> • <date>"; split it once and tolerate
			// a missing date part
			$subtitle = explode("•", $video->subtitle->simpleText, 2);
			$byline = $video->byline->runs[0];
			
			array_push(
				$this->out["video"],
				[
					"title" => $video->title->simpleText,
					"description" => null,
					"author" => [
						"name" => $byline->text,
						"url" =>
							"https://www.youtube.com/channel/" .
							$byline->navigationEndpoint->browseEndpoint->browseId,
						"avatar" => null
					],
					"date" =>
						isset($subtitle[1]) ?
						$this->textualdate2unix(trim($subtitle[1])) :
						null,
					"duration" => $this->hms2int($video->lengthText->simpleText),
					"views" => $this->truncatedcount2int(trim($subtitle[0])),
					"thumb" => [
						"url" => $video->thumbnail->thumbnails[0]->url,
						"ratio" => "16:9"
					],
					"url" =>
						"https://www.youtube.com/watch?v=" .
						$video->navigationEndpoint->watchEndpoint->videoId
				]
			);
			
		}elseif(isset($video->reelShelfRenderer)){
			
			foreach($video->reelShelfRenderer->items as $reel){
				
				if(!isset($reel->reelItemRenderer)){
					
					// every field read below lives under reelItemRenderer
					continue;
				}
				
				$reel = $reel->reelItemRenderer;
				
				array_push(
					$this->out["reel"],
					[
						"title" => $reel->headline->simpleText,
						"description" => null,
						"author" => [
							"name" => null,
							"url" => null,
							"avatar" => null
						],
						"date" => null,
						// the duration is only available inside the
						// accessibility label
						"duration" =>
							$this->textualtime2int(
								$reel->accessibility->accessibilityData->label
							),
						"views" =>
							$this->truncatedcount2int(
								$reel->viewCountText->simpleText
							),
						"thumb" => [
							"url" => $reel->thumbnail->thumbnails[0]->url,
							"ratio" => "9:16"
						],
						"url" =>
							"https://www.youtube.com/watch?v=" .
							$reel->videoId
					]
				);
			}
			
		}elseif(isset($video->channelRenderer)){
			
			$video = $video->channelRenderer;
			
			$description = null;
			
			if(isset($video->descriptionSnippet)){
				
				foreach($video->descriptionSnippet->runs as $description_part){
					
					$description .= $description_part->text;
				}
			}
			
			// largest avatar is the last entry of the list
			$thumbs = $video->thumbnail->thumbnails;
			
			array_push(
				$this->out["author"],
				[
					"title" => $video->title->simpleText,
					// NOTE(review): the follower count is read from
					// videoCountText, presumably because that is where
					// the subscriber string is served -- confirm
					"followers" =>
						isset($video->videoCountText->simpleText) ?
						$this->truncatedcount2int($video->videoCountText->simpleText) :
						0,
					"description" => $this->titledots($description),
					"thumb" => [
						"url" =>
							$this->checkhttpspresence(
								$thumbs[count($thumbs) - 1]->url
							),
						"ratio" => "1:1"
					],
					"url" =>
						"https://www.youtube.com/channel/" .
						$video->channelId
				]
			);
			
		}elseif(isset($video->shelfRenderer)){
			
			// only vertical lists are supported
			if(!is_object($video->shelfRenderer->content->verticalListRenderer)){
				
				return;
			}
			
			foreach($video->shelfRenderer->content->verticalListRenderer->items as $shelfvideo){
				
				$this->parsevideoobject($shelfvideo);
			}
			
		}elseif(isset($video->radioRenderer)){
			
			$video = $video->radioRenderer;
			
			// description: "<video count>. <title>, <title>, ..."
			$description = $video->videoCountText->runs[0]->text . ".";
			
			$titles = [];
			foreach($video->videos as $childvideo){
				
				$titles[] = $childvideo->childVideoRenderer->title->simpleText;
			}
			
			if(count($titles) !== 0){
				
				$description .= " " . implode(", ", $titles);
			}
			
			$thumbs = $video->thumbnail->thumbnails;
			
			array_push(
				$this->out["playlist"],
				[
					"title" => $video->title->simpleText,
					"description" => $description,
					"author" => [
						"name" => $video->longBylineText->simpleText,
						"url" => null,
						"avatar" => null
					],
					"date" => null,
					"duration" => null,
					"views" => null,
					"thumb" => [
						"url" => $thumbs[count($thumbs) - 1]->url,
						"ratio" => "16:9"
					],
					"url" =>
						"https://www.youtube.com/watch?v=" .
						$video->videos[0]->childVideoRenderer->videoId .
						"&list=" .
						$video->playlistId .
						"&start_radio=1"
				]
			);
			
		}elseif(isset($video->playlistRenderer)){
			
			$video = $video->playlistRenderer;
			
			// description: "<n> videos. <title>, <title>, ..."
			$description = $video->videoCount . " videos.";
			
			$titles = [];
			foreach($video->videos as $childvideo){
				
				$titles[] = $childvideo->childVideoRenderer->title->simpleText;
			}
			
			if(count($titles) !== 0){
				
				$description .= " " . implode(", ", $titles);
			}
			
			$byline = $video->longBylineText->runs[0];
			$thumbs = $video->thumbnails[0]->thumbnails;
			
			array_push(
				$this->out["playlist"],
				[
					"title" => $video->title->simpleText,
					"description" => $description,
					"author" => [
						"name" => $byline->text,
						"url" =>
							"https://www.youtube.com/channel/" .
							$byline->navigationEndpoint->browseEndpoint->browseId,
						// same key as every other author entry
						"avatar" => null
					],
					"date" => null,
					"duration" => null,
					"views" => null,
					"thumb" => [
						"url" => $thumbs[count($thumbs) - 1]->url,
						"ratio" => "16:9"
					],
					"url" =>
						"https://www.youtube.com/watch?v=" .
						$video->videos[0]->childVideoRenderer->videoId .
						"&list=" .
						$video->playlistId .
						"&start_radio=1"
				]
			);
		}
	}
	
	/**
	 * Convert a relative date such as "3 weeks ago" to a unix timestamp.
	 *
	 * The first "<number> <unit>" pair found anywhere in the string is used,
	 * so prefixed forms like "Streamed 2 days ago" are understood as well.
	 * Months and years use average lengths (2629746 s and 31556952 s).
	 *
	 * @param string $number Relative date text.
	 * @return int Timestamp; the current time when nothing could be parsed.
	 */
	private function textualdate2unix($number){
		
		$seconds_per_unit = [
			"second" => 1,
			"minute" => 60,
			"hour" => 3600,
			"day" => 86400,
			"week" => 604800,
			"month" => 2629746,
			"year" => 31556952
		];
		
		$time = 0;
		
		if(
			preg_match(
				'/([0-9]+)\s*(second|minute|hour|day|week|month|year)/i',
				(string)$number,
				$match
			)
		){
			
			$time = (int)$match[1] * $seconds_per_unit[strtolower($match[2])];
		}
		
		return time() - $time;
	}
	
	/**
	 * Prefix protocol-relative links ("//host/path") with "https:".
	 *
	 * @param string|null $link
	 * @return string|null The link, absolute if it was protocol-relative.
	 */
	private function checkhttpspresence($link){
		
		if(
			$link !== null &&
			strncmp($link, "//", 2) === 0
		){
			
			return "https:" . $link;
		}
		
		return $link;
	}
	
	/**
	 * Extract a duration in seconds from an accessibility label.
	 *
	 * The label is split on " - " and the second-to-last segment is parsed
	 * (the only segment when there is no separator). Every
	 * "<number> second(s)|minute(s)|hour(s)" pair in it is summed.
	 * NOTE(review): presumably the label reads
	 * "<title> - <duration> - play Short" -- confirm.
	 *
	 * @param string $number Accessibility label.
	 * @return int Duration in seconds, 0 when no pair is found.
	 */
	private function textualtime2int($number){
		
		$segments = explode(" - ", $number);
		$total = count($segments);
		
		$number =
			$total >= 2 ?
			$segments[$total - 2] :
			$segments[0];
		
		// normalise to "<n>second<n>minute..." so one pattern fits all
		$number =
			str_replace(
				[" ", "seconds", "minutes", "hours"],
				["", "second", "minute", "hour"],
				$number
			);
		
		preg_match_all(
			'/([0-9]+)(second|minute|hour)/',
			$number,
			$matches
		);
		
		$seconds_per_unit = [
			"second" => 1,
			"minute" => 60,
			"hour" => 3600
		];
		
		$time = 0;
		
		foreach($matches[2] as $i => $unit){
			
			$time += (int)$matches[1][$i] * $seconds_per_unit[$unit];
		}
		
		return $time;
	}
	
	/**
	 * Parse a full view count such as "1,234,567 views".
	 *
	 * @param string $views
	 * @return int Leading number with thousands separators removed;
	 *             0 when the text does not start with a number.
	 */
	private function views2int($views){
		
		return
			(int)str_replace(
				",", "",
				explode(" ", $views, 2)[0]
			);
	}
	
	/**
	 * Convert "h:mm:ss", "m:ss" or "s" to seconds.
	 *
	 * @param string $time
	 * @return int
	 */
	private function hms2int($time){
		
		$seconds = 0;
		
		// fold left to right: each further part is one base-60 digit
		foreach(explode(":", $time, 3) as $part){
			
			$seconds = ($seconds * 60) + (int)$part;
		}
		
		return $seconds;
	}
	
	/**
	 * Parse an abbreviated count such as "1.5K views" or "12M subscribers".
	 *
	 * Only the first space-separated token is read. Thousands separators
	 * are dropped, any number of decimals is honoured and the suffix
	 * K/M/B (case-insensitive) scales the value.
	 *
	 * @param string $number
	 * @return int 0 for empty or non-numeric input.
	 */
	private function truncatedcount2int($number){
		
		$number = explode(" ", trim((string)$number), 2)[0];
		$number = str_replace(",", "", $number);
		
		if($number === ""){
			
			return 0;
		}
		
		switch(strtolower(substr($number, -1))){
			
			case "k":
				$exponant = 1000;
				break;
			
			case "m":
				$exponant = 1000000;
				break;
			
			case "b":
				$exponant = 1000000000;
				break;
			
			default:
				$exponant = 1;
				break;
		}
		
		// the float cast stops at the unit letter; round() removes the
		// binary representation error (1.1 * 1000 = 1100.0000000000002)
		return (int)round((float)$number * $exponant);
	}
	
	/**
	 * Remove a trailing ellipsis ("..." or "…") and surrounding whitespace.
	 *
	 * Whitespace is stripped per character, not per byte, so a no-break
	 * space (U+00A0) is removed without cutting multibyte characters that
	 * merely share one of its bytes (for example "¿", "©" or "à").
	 *
	 * @param string|null $title
	 * @return string Cleaned text, "" for null.
	 */
	private function titledots($title){
		
		$title = (string)$title;
		
		// both ellipsis forms are exactly 3 bytes long
		$tail = substr($title, -3);
		
		if(
			$tail == "..." ||
			$tail == "…"
		){
			
			$title = substr($title, 0, -3);
		}
		
		$trimmed =
			preg_replace(
				'/\A[ \n\r\t\x0B\x00\x{A0}]+|[ \n\r\t\x0B\x00\x{A0}]+\z/u',
				"",
				$title
			);
		
		if($trimmed === null){
			
			// input is not valid UTF-8: strip ASCII whitespace only
			return trim($title, " \n\r\t\v\x00");
		}
		
		return $trimmed;
	}
}