diff options
author | lolcat <will@lolcat.ca> | 2023-07-22 14:41:14 -0400 |
---|---|---|
committer | lolcat <will@lolcat.ca> | 2023-07-22 14:41:14 -0400 |
commit | bca265aea67ec62499aaa113a6490ce9ec7fe730 (patch) | |
tree | 3f05ec5ea542e41b474947e180034f42e99648e9 /lib |
still missing things on google scraper
Diffstat (limited to 'lib')
-rw-r--r-- | lib/bingcache-todo-fix.php | 144 | ||||
-rw-r--r-- | lib/classic.png | bin | 0 -> 7623 bytes | |||
-rw-r--r-- | lib/curlproxy.php | 652 | ||||
-rw-r--r-- | lib/favicon404.png | bin | 0 -> 807 bytes | |||
-rw-r--r-- | lib/frontend.php | 1282 | ||||
-rw-r--r-- | lib/fuckhtml.php | 361 | ||||
-rw-r--r-- | lib/img404.png | bin | 0 -> 4549 bytes | |||
-rw-r--r-- | lib/nextpage.php | 106 | ||||
-rw-r--r-- | lib/type-todo.php | 132 |
9 files changed, 2677 insertions, 0 deletions
diff --git a/lib/bingcache-todo-fix.php b/lib/bingcache-todo-fix.php new file mode 100644 index 0000000..a4acb5b --- /dev/null +++ b/lib/bingcache-todo-fix.php @@ -0,0 +1,144 @@ +<?php + +// https://www.bing.com/search?q=url%3Ahttps%3A%2F%2Flolcat.ca +// https://cc.bingj.com/cache.aspx?q=url%3ahttps%3a%2f%2flolcat.ca&d=4769685974291356&mkt=en-CA&setlang=en-US&w=tEsWuE7HW3Z5AIPQMVkDH4WaotS4LrK- +// <div class="b_attribution" u="0N|5119|4769685974291356|tEsWuE7HW3Z5AIPQMVkDH4WaotS4LrK-" tabindex="0"> + +new bingcache(); + +class bingcache{ + + public function __construct(){ + + if( + !isset($_GET["s"]) || + $this->validate_url($_GET["s"]) === false + ){ + + var_dump($this->validate_url($_GET["s"])); + $this->do404("Please provide a valid URL."); + } + + $url = $_GET["s"]; + + $curlproc = curl_init(); + + curl_setopt( + $curlproc, + CURLOPT_URL, + "https://www.bing.com/search?q=url%3A" . + urlencode($url) + ); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt( + $curlproc, + CURLOPT_HTTPHEADER, + ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0", + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1"] + ); + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 5); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + $this->do404("Failed to connect to bing servers. Please try again later."); + } + + curl_close($curlproc); + + preg_match( + '/<div class="b_attribution" u="(.*)" tabindex="0">/', + $data, + $keys + ); + + print_r($keys); + + if(count($keys) === 0){ + + $this->do404("Bing has not archived this URL."); + } + + $keys = explode("|", $keys[1]); + $count = count($keys); + + //header("Location: https://cc.bingj.com/cache.aspx?d=" . $keys[$count - 2] . "&w=" . $keys[$count - 1]); + echo("Location: https://cc.bingj.com/cache.aspx?d=" . $keys[$count - 2] . "&w=" . $keys[$count - 1]); + } + + public function do404($text){ + + include "lib/frontend.php"; + $frontend = new frontend(); + + echo + $frontend->load( + "error.html", + [ + "title" => "Shit", + "text" => $text + ] + ); + + die(); + } + + public function validate_url($url){ + + $url_parts = parse_url($url); + + // check if required parts are there + if( + !isset($url_parts["scheme"]) || + !( + $url_parts["scheme"] == "http" || + $url_parts["scheme"] == "https" + ) || + !isset($url_parts["host"]) + ){ + return false; + } + + if( + // if its not an RFC-valid URL + !filter_var($url, FILTER_VALIDATE_URL) + ){ + return false; + } + + $ip = + str_replace( + ["[", "]"], // handle ipv6 + "", + $url_parts["host"] + ); + + // if its not an IP + if(!filter_var($ip, FILTER_VALIDATE_IP)){ + + // resolve domain's IP + $ip = gethostbyname($url_parts["host"] . "."); + } + + // check if its localhost + return filter_var( + $ip, + FILTER_VALIDATE_IP, FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE + ); + } +} diff --git a/lib/classic.png b/lib/classic.png Binary files differnew file mode 100644 index 0000000..3d2b8fc --- /dev/null +++ b/lib/classic.png diff --git a/lib/curlproxy.php b/lib/curlproxy.php new file mode 100644 index 0000000..846fbb7 --- /dev/null +++ b/lib/curlproxy.php @@ -0,0 +1,652 @@ +<?php + +class proxy{ + + public const req_web = 0; + public const req_image = 1; + + public function __construct($cache = true){ + + $this->cache = $cache; + } + + public function do404(){ + + http_response_code(404); + header("Content-Type: image/png"); + + $handle = fopen("lib/img404.png", "r"); + echo fread($handle, filesize("lib/img404.png")); + fclose($handle); + + die(); + return; + } + + public function getabsoluteurl($path, $relative){ + + if($this->validateurl($path)){ + + return $path; + } + + if(substr($path, 0, 2) == "//"){ + + return "https:" . $path; + } + + $url = null; + + $relative = parse_url($relative); + $url = $relative["scheme"] . "://"; + + if( + isset($relative["user"]) && + isset($relative["pass"]) + ){ + + $url .= $relative["user"] . ":" . $relative["pass"] . "@"; + } + + $url .= $relative["host"]; + + if(isset($relative["path"])){ + + $relative["path"] = explode( + "/", + $relative["path"] + ); + + unset($relative["path"][count($relative["path"]) - 1]); + $relative["path"] = implode("/", $relative["path"]); + + $url .= $relative["path"]; + } + + if( + strlen($path) !== 0 && + $path[0] !== "/" + ){ + + $url .= "/"; + } + + $url .= $path; + + return $url; + } + + public function validateurl($url){ + + $url_parts = parse_url($url); + + // check if required parts are there + if( + !isset($url_parts["scheme"]) || + !( + $url_parts["scheme"] == "http" || + $url_parts["scheme"] == "https" + ) || + !isset($url_parts["host"]) + ){ + return false; + } + + $ip = + str_replace( + ["[", "]"], // handle ipv6 + "", + $url_parts["host"] + ); + + // if its not an IP + if(!filter_var($ip, FILTER_VALIDATE_IP)){ + + // resolve domain's IP + $ip = gethostbyname($url_parts["host"] . "."); + } + + // check if its localhost + if( + filter_var( + $ip, + FILTER_VALIDATE_IP, FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE + ) === false + ){ + + return false; + } + + return true; + } + + public function get($url, $reqtype = self::req_web, $acceptallcodes = false, $referer = null, $redirectcount = 0){ + + if($redirectcount === 5){ + + throw new Exception("Too many redirects"); + } + + // sanitize URL + try{ + + $this->validateurl($url); + }catch(Exception $error){ + + throw new Exception($error->getMessage()); + } + + $this->clientcache(); + + $curl = curl_init(); + + curl_setopt($curl, CURLOPT_URL, $url); + curl_setopt($curl, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curl, CURLOPT_HEADER, 1); + + switch($reqtype){ + case self::req_web: + curl_setopt( + $curl, + CURLOPT_HTTPHEADER, + [ + "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip, deflate", + "DNT: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1" + ] + ); + break; + + case self::req_image: + + if($referer === null){ + $referer = explode("/", $url, 4); + array_pop($referer); + + $referer = implode("/", $referer); + } + + curl_setopt( + $curl, + CURLOPT_HTTPHEADER, + [ + "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0", + "Accept: image/avif,image/webp,*/*", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip, deflate", + "DNT: 1", + "Connection: keep-alive", + "Referer: {$referer}" + ] + ); + break; + } + + curl_setopt($curl, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curl, CURLOPT_TIMEOUT, 30); + + // limit size of payloads + curl_setopt($curl, CURLOPT_BUFFERSIZE, 1024); + curl_setopt($curl, CURLOPT_NOPROGRESS, false); + curl_setopt( + $curl, + CURLOPT_PROGRESSFUNCTION, + function($downloadsize, $downloaded, $uploadsize, $uploaded + ){ + + // if $downloaded exceeds 100MB, fuck off + return ($downloaded > 100000000) ? 1 : 0; + }); + + $body = curl_exec($curl); + + if(curl_errno($curl)){ + + throw new Exception(curl_error($curl)); + } + + curl_close($curl); + + $headers = []; + $http = null; + + while(true){ + + $header = explode("\n", $body, 2); + $body = $header[1]; + + if($http === null){ + + // http/1.1 200 ok + $header = explode("/", $header[0], 2); + $header = explode(" ", $header[1], 3); + + $http = [ + "version" => (float)$header[0], + "code" => (int)$header[1] + ]; + + continue; + } + + if(trim($header[0]) == ""){ + + // reached end of headers + break; + } + + $header = explode(":", $header[0], 2); + + // malformed headers + if(count($header) !== 2){ continue; } + + $headers[strtolower(trim($header[0]))] = trim($header[1]); + } + + // check http code + if( + $http["code"] >= 300 && + $http["code"] <= 309 + ){ + + // redirect + if(!isset($headers["location"])){ + + throw new Exception("Broken redirect"); + } + + $redirectcount++; + + return $this->get($this->getabsoluteurl($headers["location"], $url), $reqtype, $acceptallcodes, $referer, $redirectcount); + }else{ + if( + $acceptallcodes === false && + $http["code"] > 300 + ){ + + throw new Exception("Remote server returned an error code! ({$http["code"]})"); + } + } + + // check if data is okay + switch($reqtype){ + + case self::req_image: + + $format = false; + + if(isset($headers["content-type"])){ + + if($headers["content-type"] == "text/html"){ + + throw new Exception("Server returned an html document instead of image"); + } + + $tmp = explode(";", $headers["content-type"]); + + for($i=0; $i<count($tmp); $i++){ + + if( + preg_match( + '/^image\/([^ ]+)/i', + $tmp[$i], + $match + ) + ){ + + $format = strtolower($match[1]); + + if(substr($format, 0, 2) == "x-"){ + + $format = substr($format, 2); + } + break; + } + } + } + + return [ + "http" => $http, + "format" => $format, + "headers" => $headers, + "body" => $body + ]; + break; + + default: + + return [ + "http" => $http, + "headers" => $headers, + "body" => $body + ]; + break; + } + + return; + } + + public function stream_linear_image($url, $referer = null){ + + $this->stream($url, $referer, "image"); + } + + public function stream_linear_audio($url, $referer = null){ + + $this->stream($url, $referer, "audio"); + } + + private function stream($url, $referer, $format){ + + $this->url = $url; + $this->format = $format; + + // sanitize URL + try{ + + $this->validateurl($url); + }catch(Exception $error){ + + throw new Exception($error->getMessage()); + } + + $this->clientcache(); + + $curl = curl_init(); + + // set headers + if($referer === null){ + $referer = explode("/", $url, 4); + array_pop($referer); + + $referer = implode("/", $referer); + } + + switch($format){ + + case "image": + curl_setopt( + $curl, + CURLOPT_HTTPHEADER, + [ + "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + "Accept: image/avif,image/webp,*/*", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip, deflate, br", + "DNT: 1", + "Connection: keep-alive", + "Referer: {$referer}" + ] + ); + break; + + case "audio": + curl_setopt( + $curl, + CURLOPT_HTTPHEADER, + [ + "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + "Accept: audio/webm,audio/ogg,audio/wav,audio/*;q=0.9,application/ogg;q=0.7,video/*;q=0.6,*/*;q=0.5", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip, deflate, br", + "DNT: 1", + "Connection: keep-alive", + "Referer: {$referer}" + ] + ); + break; + } + + // follow redirects + curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true); + curl_setopt($curl, CURLOPT_MAXREDIRS, 5); + curl_setopt($curl, CURLOPT_AUTOREFERER, 5); + + // set url + curl_setopt($curl, CURLOPT_URL, $url); + curl_setopt($curl, CURLOPT_ENCODING, ""); // default encoding + + // timeout + disable ssl + curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 10); + curl_setopt($curl, CURLOPT_TIMEOUT, 30); + + curl_setopt( + $curl, + CURLOPT_WRITEFUNCTION, + function($c, $data){ + + if(curl_getinfo($c, CURLINFO_HTTP_CODE) !== 200){ + + throw new Exception("Serber returned a non-200 code"); + } + + echo $data; + return strlen($data); + } + ); + + $this->empty_header = false; + $this->cont = false; + $this->headers_tmp = []; + $this->headers = []; + curl_setopt( + $curl, + CURLOPT_HEADERFUNCTION, + function($c, $header){ + + $head = trim($header); + $len = strlen($head); + + if($len === 0){ + + $this->empty_header = true; + $this->headers_tmp = []; + }else{ + + $this->empty_header = false; + $this->headers_tmp[] = $head; + } + + foreach($this->headers_tmp as $h){ + + // parse headers + $h = explode(":", $h, 2); + + if(count($h) !== 2){ + + if(curl_getinfo($c, CURLINFO_HTTP_CODE) !== 200){ + + // not HTTP 200, probably a redirect + $this->cont = false; + }else{ + + $this->cont = true; + } + + // is HTTP 200, just ignore that line + continue; + } + + $this->headers[strtolower(trim($h[0]))] = trim($h[1]); + } + + if( + $this->cont && + $this->empty_header + ){ + + // get content type + if(isset($this->headers["content-type"])){ + + $filetype = explode("/", $this->headers["content-type"]); + + if(strtolower($filetype[0]) != $this->format){ + + throw new Exception("Resource is not an {$this->format} (Found {$filetype[0]} instead)"); + } + + }else{ + + throw new Exception("Resource is not an {$this->format} (no Content-Type)"); + } + + header("Content-Type: {$this->format}/{$filetype[1]}"); + + // give payload size + if(isset($this->headers["content-length"])){ + + header("Content-Length: {$this->headers["content-length"]}"); + } + + // give filename + $this->getfilenameheader($this->headers, $this->url, $filetype[1]); + } + + return strlen($header); + } + ); + + curl_exec($curl); + + if(curl_errno($curl)){ + + throw new Exception(curl_error($curl)); + } + + curl_close($curl); + } + + public function getfilenameheader($headers, $url, $filetype = "jpg"){ + + // get filename from content-disposition header + if(isset($headers["content-disposition"])){ + + preg_match( + '/filename=([^;]+)/', + $headers["content-disposition"], + $filename + ); + + if(isset($filename[1])){ + + header("Content-Disposition: filename=" . $filename[1] . "." . $filetype); + return; + } + } + + // get filename from URL + $filename = parse_url($url, PHP_URL_PATH); + + if($filename === null){ + + // everything failed! rename file to domain name + header("Content-Disposition: filename=" . parse_url($url, PHP_URL_HOST) . "." . $filetype); + return; + } + + // remove extension from filename + $filename = + explode( + ".", + basename($filename) + ); + + if(count($filename) > 1){ + array_pop($filename); + } + + $filename = implode(".", $filename); + + header("Content-Disposition: inline; filename=" . $filename . "." . $filetype); + return; + } + + public function getimageformat($payload, &$imagick){ + + $finfo = new finfo(FILEINFO_MIME_TYPE); + $format = $finfo->buffer($payload["body"]); + + if($format === false){ + + if($payload["format"] === false){ + + header("X-Error: Could not parse format"); + $this->favicon404(); + } + + $format = $payload["format"]; + }else{ + + $format_tmp = explode("/", $format, 2); + + if($format_tmp[0] == "image"){ + + $format_tmp = strtolower($format_tmp[1]); + + if(substr($format_tmp, 0, 2) == "x-"){ + + $format_tmp = substr($format_tmp, 2); + } + + $format = $format_tmp; + } + } + + switch($format){ + + case "tiff": $format = "gif"; break; + case "vnd.microsoft.icon": $format = "ico"; break; + case "icon": $format = "ico"; break; + case "svg+xml": $format = "svg"; break; + } + + $imagick = new Imagick(); + + if( + !in_array( + $format, + array_map("strtolower", $imagick->queryFormats()) + ) + ){ + + // format could not be found, but imagemagick can + // sometimes detect it? shit's fucked + $format = false; + } + + return $format; + } + + public function clientcache(){ + + if($this->cache === false){ + + return; + } + + header("Last-Modified: Thu, 01 Oct 1970 00:00:00 GMT"); + $headers = getallheaders(); + + if( + isset($headers["If-Modified-Since"]) || + isset($headers["If-Unmodified-Since"]) + ){ + + http_response_code(304); // 304: Not Modified + die(); + } + } +} diff --git a/lib/favicon404.png b/lib/favicon404.png Binary files differnew file mode 100644 index 0000000..7540694 --- /dev/null +++ b/lib/favicon404.png diff --git a/lib/frontend.php b/lib/frontend.php new file mode 100644 index 0000000..3be912b --- /dev/null +++ b/lib/frontend.php @@ -0,0 +1,1282 @@ +<?php + +class frontend{ + + public function load($template, $replacements = []){ + + $handle = fopen("template/{$template}", "r"); + $data = fread($handle, filesize("template/{$template}")); + fclose($handle); + + $data = explode("\n", $data); + $html = ""; + + for($i=0; $i<count($data); $i++){ + + $html .= trim($data[$i]); + } + + foreach($replacements as $key => $value){ + + $html = + str_replace( + "{%{$key}%}", + $value, + $html + ); + } + + return trim($html); + } + + public function getthemeclass($raw = true){ + + if( + isset($_COOKIE["theme"]) && + $_COOKIE["theme"] == "cream" + ){ + + $body_class = "theme-white "; + }else{ + + $body_class = ""; + } + + if( + $raw && + $body_class != "" + ){ + + return ' class="' . rtrim($body_class) . '"'; + } + + return $body_class; + } + + public function loadheader(array $get, array $filters, string $page){ + + echo + $this->load("header.html", [ + "title" => trim($get["s"] . " ({$page})"), + "description" => ucfirst($page) . ' search results for "' . htmlspecialchars($get["s"]) . '"', + "index" => "no", + "search" => htmlspecialchars($get["s"]), + "tabs" => $this->generatehtmltabs($page, $get["s"]), + "filters" => $this->generatehtmlfilters($filters, $get), + "body_class" => $this->getthemeclass() + ]); + + if( + preg_match( + '/bot|wget|curl|python-requests|scrapy|feedfetcher|go-http-client|ruby|universalfeedparser|yahoo\! slurp|spider|rss/i', + $_SERVER["HTTP_USER_AGENT"] + ) + ){ + + // bot detected !! + echo + $this->drawerror( + "Tshh, blocked!", + 'You were blocked from viewing this page. If you wish to scrape data from 4get, please consider running <a href="https://git.lolcat.ca/lolcat/4get" rel="noreferrer nofollow">your own 4get instance</a> or using <a href="/api.txt">the API</a>.', + ); + die(); + } + } + + public function drawerror($title, $error){ + + return + $this->load("search.html", [ + "class" => "", + "right-left" => "", + "right-right" => "", + "left" => + '<div class="infobox">' . + '<h1>' . htmlspecialchars($title) . '</h1>' . + $error . + '</div>' + ]); + } + + public function drawtextresult($site, $greentext = null, $duration = null, $keywords, $tabindex = true){ + + $payload = + '<div class="text-result">'; + + // add favicon, link and archive links + $payload .= $this->drawlink($site["url"]); + + /* + Draw title + description + filetype + */ + $payload .= + '<a href="' . htmlspecialchars($site["url"]) . '" class="hover" rel="noreferrer nofollow"'; + + if($tabindex === false){ + + $payload .= ' tabindex="-1"'; + } + + $payload .= '>'; + + if($site["thumb"]["url"] !== null){ + + $payload .= + '<div class="thumb-wrap'; + + switch($site["thumb"]["ratio"]){ + + case "16:9": + $size = "landscape"; + break; + + case "9:16": + $payload .= " portrait"; + $size = "portrait"; + break; + + case "1:1": + $payload .= " square"; + $size = "square"; + break; + } + + $payload .= + '">' . + '<img class="thumb" src="/proxy?i=' . urlencode($site["thumb"]["url"]) . '&s=' . $size . '" alt="thumb">'; + + if($duration !== null){ + + $payload .= + '<div class="duration">' . + htmlspecialchars($duration) . + '</div>'; + } + + $payload .= + '</div>'; + } + + $payload .= + '<div class="title">'; + + if( + isset($site["type"]) && + $site["type"] != "web" + ){ + + $payload .= '<div class="type">' . strtoupper($site["type"]) . '</div>'; + } + + $payload .= + htmlspecialchars($site["title"]) . + '</div>'; + + if($greentext !== null){ + + $payload .= + '<div class="greentext">' . + htmlspecialchars($greentext) . + '</div>'; + } + + if($site["description"] !== null){ + + $payload .= + '<div class="description">' . + $this->highlighttext($keywords, $site["description"]) . + '</div>'; + } + + $payload .= '</a>'; + + /* + Sublinks + */ + if( + isset($site["sublink"]) && + !empty($site["sublink"]) + ){ + + usort($site["sublink"], function($a, $b){ + + return strlen($a["description"]) > strlen($b["description"]); + }); + + $payload .= + '<div class="sublinks">' . + '<table>'; + + $opentr = false; + for($i=0; $i<count($site["sublink"]); $i++){ + + if(($i % 2) === 0){ + + $opentr = true; + $payload .= '<tr>'; + }else{ + + $opentr = false; + } + + $payload .= + '<td>' . + '<a href="' . htmlspecialchars($site["sublink"][$i]["url"]) . '" rel="noreferrer nofollow">' . + '<div class="title">' . + htmlspecialchars($site["sublink"][$i]["title"]) . + '</div>'; + + if(!empty($site["sublink"][$i]["date"])){ + + $payload .= + '<div class="greentext">' . + date("jS M y @ g:ia", $site["sublink"][$i]["date"]) . + '</div>'; + } + + if(!empty($site["sublink"][$i]["description"])){ + + $payload .= + '<div class="description">' . + $this->highlighttext($keywords, $site["sublink"][$i]["description"]) . + '</div>'; + } + + $payload .= '</a></td>'; + + if($opentr === false){ + + $payload .= '</tr>'; + } + } + + if($opentr === true){ + + $payload .= '<td></td></tr>'; + } + + $payload .= '</table></div>'; + } + + if( + isset($site["table"]) && + !empty($site["table"]) + ){ + + $payload .= '<table class="info-table">'; + + foreach($site["table"] as $title => $value){ + + $payload .= + '<tr>' . + '<td>' . htmlspecialchars($title) . '</td>' . + '<td>' . htmlspecialchars($value) . '</td>' . + '</tr>'; + } + + $payload .= '</table>'; + } + + return $payload . '</div>'; + } + + public function highlighttext($keywords, $text){ + + $text = htmlspecialchars($text); + + $keywords = explode(" ", $keywords); + $regex = []; + + foreach($keywords as $word){ + + $regex[] = "\b" . preg_quote($word, "/") . "\b"; + } + + $regex = "/" . implode("|", $regex) . "/i"; + + return + preg_replace( + $regex, + '<b>${0}</b>', + $text + ); + } + + function highlightcode($text){ + + // https://www.php.net/highlight_string + ini_set("highlight.comment", "c-comment"); + ini_set("highlight.default", "c-default"); + ini_set("highlight.html", "c-default"); + ini_set("highlight.keyword", "c-keyword"); + ini_set("highlight.string", "c-string"); + + $text = + trim( + preg_replace( + '/<\/span>$/', + "", // remove stray ending span because of the <?php stuff + str_replace( + [ + '<br />', + ' ' + ], + [ + "\n", // replace <br> with newlines + " " // replace html entity to space + ], + str_replace( + [ + // leading <?php garbage + "<span style=\"color: c-default\">\n<?php ", + "<code>", + "</code>" + ], + "", + highlight_string("<?php " . $text, true) + ) + ) + ) + ); + + // replace colors + $classes = ["c-comment", "c-default", "c-keyword", "c-string"]; + + foreach($classes as $class){ + + $text = str_replace('<span style="color: ' . $class . '">', '<span class="' . $class . '">', $text); + } + + return $text; + } + + public function drawlink($link){ + + /* + Add favicon + */ + $host = parse_url($link); + $esc = + explode( + ".", + $host["host"], + 2 + ); + + if( + count($esc) === 2 && + $esc[0] == "www" + ){ + + $esc = $esc[1]; + }else{ + + $esc = $esc[0]; + } + + $esc = substr($esc, 0, 2); + + $urlencode = urlencode($link); + + $payload = + '<div class="url">' . + '<button class="favicon" tabindex="-1">' . + '<img src="/favicon?s=' . htmlspecialchars($host["scheme"] . "://" . $host["host"]) . '" alt="' . htmlspecialchars($esc) . '">' . + //'<img src="/404.php" alt="' . htmlspecialchars($esc) . '">' . + '</button>' . + '<div class="favicon-dropdown">'; + + /* + Add archive links + */ + if( + $host["host"] == "boards.4chan.org" || + $host["host"] == "boards.4channel.org" + ){ + + $archives = []; + $path = explode("/", $host["path"]); + $count = count($path); + // /pol/thread/417568063/post-shitty-memes-if-you-want-to + + if($count !== 0){ + + $isboard = true; + + switch($path[1]){ + + case "con": + break; + + case "q": + $archives[] = "desuarchive.org"; + break; + + case "qa": + $archives[] = "desuarchive.org"; + break; + + case "qb": + $archives[] = "arch.b4k.co"; + break; + + case "trash": + $archives[] = "desuarchive.org"; + break; + + case "a": + $archives[] = "desuarchive.org"; + break; + + case "c": + $archives[] = "desuarchive.org"; + break; + + case "w": + break; + + case "m": + $archives[] = "desuarchive.org"; + break; + + case "cgl": + $archives[] = "desuarchive.org"; + $archives[] = "warosu.org"; + break; + + case "cm": + $archives[] = "boards.fireden.net"; + break; + + case "f": + $archives[] = "archive.4plebs.org"; + break; + + case "n": + break; + + case "jp": + $archives[] = "warosu.org"; + break; + + case "vt": + $archives[] = "warosu.org"; + break; + + case "v": + $archives[] = "boards.fireden.net"; + $archives[] = "arch.b4k.co"; + break; + + case "vg": + $archives[] = "boards.fireden.net"; + $archives[] = "arch.b4k.co"; + break; + + case "vm": + $archives[] = "arch.b4k.co"; + break; + + case "vmg": + $archives[] = "arch.b4k.co"; + break; + + case "vp": + $archives[] = "arch.b4k.co"; + break; + + case "vr": + $archives[] = "desuarchive.org"; + $archives[] = "warosu.org"; + break; + + case "vrpg": + $archives[] = "arch.b4k.co"; + break; + + case "vst": + $archives[] = "arch.b4k.co"; + break; + + case "co": + $archives[] = "desuarchive.org"; + break; + + case "g": + $archives[] = "desuarchive.org"; + $archives[] = "arch.b4k.co"; + break; + + case "tv": + $archives[] = "archive.4plebs.org"; + break; + + case "k": + $archives[] = "desuarchive.org"; + break; + + case "o": + $archives[] = "archive.4plebs.org"; + break; + + case "an": + $archives[] = "desuarchive.org"; + break; + + case "tg": + $archives[] = "desuarchive.org"; + $archives[] = "archive.4plebs.org"; + break; + + case "sp": + $archives[] = "archive.4plebs.org"; + break; + + case "xs": + $archives[] = "eientei.xyz"; + break; + + case "pw": + break; + + case "sci": + $archives[] = "boards.fireden.net"; + $archives[] = "warosu.org"; + $archives[] = "eientei.xyz"; + break; + + case "his": + $archives[] = "desuarchive.org"; + break; + + case "int": + $archives[] = "desuarchive.org"; + break; + + case "out": + break; + + case "toy": + break; + + case "i": + $archives[] = "archiveofsins.com"; + $archives[] = "eientei.xyz"; + break; + + case "po": + break; + + case "p": + break; + + case "ck": + $archives[] = "warosu.org"; + break; + + case "ic": + $archives[] = "boards.fireden.net"; + $archives[] = "warosu.org"; + break; + + case "wg": + break; + + case "lit": + $archives[] = "warosu.org"; + break; + + case "mu": + $archives[] = "desuarchive.org"; + break; + + case "fa": + $archives[] = "warosu.org"; + break; + + case "3": + $archives[] = "warosu.org"; + $archives[] = "eientei.xyz"; + break; + + case "gd": + break; + + case "diy": + $archives[] = "warosu.org"; + break; + + case "wsg": + $archives[] = "desuarchive.org"; + break; + + case "qst": + break; + + case "biz": + $archives[] = "warosu.org"; + break; + + case "trv": + $archives[] = "archive.4plebs.org"; + break; + + case "fit": + $archives[] = "desuarchive.org"; + break; + + case "x": + $archives[] = "archive.4plebs.org"; + break; + + case "adv": + $archives[] = "archive.4plebs.org"; + break; + + case "lgbt": + $archives[] = "archiveofsins.com"; + break; + + case "mlp": + $archives[] = "desuarchive.org"; + $archives[] = "arch.b4k.co"; + break; + + case "news": + break; + + case "wsr": + break; + + case "vip": + break; + + case "b": + $archives[] = "thebarchive.com"; + break; + + case "r9k": + $archives[] = "desuarchive.org"; + break; + + case "pol": + $archives[] = "archive.4plebs.org"; + break; + + case "bant": + $archives[] = "thebarchive.com"; + break; + + case "soc": + $archives[] = "archiveofsins.com"; + break; + + case "s4s": + $archives[] = "archive.4plebs.org"; + break; + + case "s": + $archives[] = "archiveofsins.com"; + break; + + case "hc": + $archives[] = "archiveofsins.com"; + break; + + case "hm": + $archives[] = "archiveofsins.com"; + break; + + case "h": + $archives[] = "archiveofsins.com"; + break; + + case "e": + break; + + case "u": + $archives[] = "archiveofsins.com"; + break; + + case "d": + $archives[] = "desuarchive.org"; + break; + + case "y": + $archives[] = "boards.fireden.net"; + break; + + case "t": + $archives[] = "archiveofsins.com"; + break; + + case "hr": + $archives[] = "archive.4plebs.org"; + break; + + case "gif": + break; + + case "aco": + $archives[] = "desuarchive.org"; + break; + + case "r": + $archives[] = "archiveofsins.com"; + break; + + default: + $isboard = false; + break; + } + + if($isboard === true){ + + $archives[] = "archived.moe"; + } + + $trail = ""; + + if( + isset($path[2]) && + isset($path[3]) && + $path[2] == "thread" + ){ + + $trail .= "/" . $path[1] . "/thread/" . $path[3]; + }elseif($isboard){ + + $trail = "/" . $path[1] . "/"; + } + + for($i=0; $i<count($archives); $i++){ + + $payload .= + '<a href="https://' . $archives[$i] . $trail . '" class="list" target="_BLANK">' . + '<img src="/favicon?s=https://' . $archives[$i] . '" alt="' . $archives[$i][0] . $archives[$i][1] . '">' . + $archives[$i] . + '</a>'; + } + } + } + + $payload .= + '<a href="https://webcache.googleusercontent.com/search?q=cache:' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://google.com" alt="go">Google cache</a>' . + '<a href="https://web.archive.org/web/' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.org" alt="ar">Archive.org</a>' . + '<a href="https://archive.is/newest/' . htmlspecialchars($link) . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.is" alt="ar">Archive.is</a>' . + '<a href="https://www.bing.com/search?q=url%3A' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://bing.com" alt="bi">Bing cache</a>' . + '<a href="https://megalodon.jp/?url=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://megalodon.jp" alt="me">Megalodon</a>' . + '</div>'; + + /* + Draw link + */ + $parts = explode("/", $link); + $clickurl = ""; + + // remove trailing / + $c = count($parts) - 1; + if($parts[$c] == ""){ + + $parts[$c - 1] = $parts[$c - 1] . "/"; + unset($parts[$c]); + } + + // merge https://site together + $parts = [ + $parts[0] . $parts[1] . '//' . $parts[2], + ...array_slice($parts, 3, count($parts) - 1) + ]; + + $c = count($parts); + for($i=0; $i<$c; $i++){ + + if($i !== 0){ $clickurl .= "/"; } + + $clickurl .= $parts[$i]; + + if($i === $c - 1){ + + $parts[$i] = rtrim($parts[$i], "/"); + } + + $payload .= + '<a class="part" href="' . htmlspecialchars($clickurl) . '" rel="noreferrer nofollow" tabindex="-1">' . + htmlspecialchars(urldecode($parts[$i])) . + '</a>'; + + if($i !== $c - 1){ + + $payload .= '<span class="separator"></span>'; + } + } + + return $payload . '</div>'; + } + + public function getscraperfilters($page){ + + $get_scraper = null; + + switch($page){ + + case "web": + $get_scraper = isset($_COOKIE["scraper_web"]) ? $_COOKIE["scraper_web"] : null; + break; + + case "images": + $get_scraper = isset($_COOKIE["scraper_images"]) ? $_COOKIE["scraper_images"] : null; + break; + + case "videos": + $get_scraper = isset($_COOKIE["scraper_videos"]) ? $_COOKIE["scraper_videos"] : null; + break; + + case "news": + $get_scraper = isset($_COOKIE["scraper_news"]) ? $_COOKIE["scraper_news"] : null; + break; + } + + if( + isset($_GET["scraper"]) && + is_string($_GET["scraper"]) + ){ + + $get_scraper = $_GET["scraper"]; + }else{ + + if( + isset($_GET["npt"]) && + is_string($_GET["npt"]) + ){ + + $get_scraper = explode(".", $_GET["npt"], 2)[0]; + + $get_scraper = + preg_replace( + '/[0-9]+$/', + "", + $get_scraper + ); + } + } + + // add search field + $filters = + [ + "s" => [ + "option" => "_SEARCH" + ] + ]; + + // define default scrapers + switch($page){ + + case "web": + $filters["scraper"] = [ + "display" => "Scraper", + "option" => [ + "ddg" => "DuckDuckGo", + "brave" => "Brave", + "google" => "Google", + "mojeek" => "Mojeek", + "marginalia" => "Marginalia", + "wiby" => "wiby" + ] + ]; + break; + + case "images": + $filters["scraper"] = [ + "display" => "Scraper", + "option" => [ + "ddg" => "DuckDuckGo", + "yandex" => "Yandex", + "google" => "Google" + ] + ]; + break; + + case "videos": + $filters["scraper"] = [ + "display" => "Scraper", + "option" => [ + "yt" => "YouTube", + "ddg" => "DuckDuckGo", + "google" => "Google" + ] + ]; + break; + + case "news": + $filters["scraper"] = [ + "display" => "Scraper", + "option" => [ + "ddg" => "DuckDuckGo", + "brave" => "Brave", + "google" => "Google", + "mojeek" => "Mojeek" + ] + ]; + break; + } + + // get scraper name from user input, or default out to preferred scraper + $scraper_out = null; + $first = true; + + foreach($filters["scraper"]["option"] as $scraper_name => $scraper_pretty){ + + if($first === true){ + + $first = $scraper_name; + } + + if($scraper_name == $get_scraper){ + + $scraper_out = $scraper_name; + } + } + + if($scraper_out === null){ + + $scraper_out = $first; + } + + switch($scraper_out){ + + case "ddg": + include "scraper/ddg.php"; + $lib = new ddg(); + break; + + case "brave": + include "scraper/brave.php"; + $lib = new brave(); + break; + + case "yt"; + include "scraper/youtube.php"; + $lib = new youtube(); + break; + + case "yandex": + include "scraper/yandex.php"; + $lib = new yandex(); + break; + + case "google": + include "scraper/google.php"; + $lib = new google(); + break; + + case "mojeek": + include "scraper/mojeek.php"; + $lib = new mojeek(); + break; + + case "marginalia": + include "scraper/marginalia.php"; + $lib = new marginalia(); + break; + + case "wiby": + include "scraper/wiby.php"; + $lib = new wiby(); + break; + } + + // set scraper on $_GET + $_GET["scraper"] = $scraper_out; + + // set nsfw on $_GET + if( + isset($_COOKIE["nsfw"]) && + !isset($_GET["nsfw"]) + ){ + + $_GET["nsfw"] = $_COOKIE["nsfw"]; + } + + return + [ + $lib, + array_merge_recursive( + $filters, + $lib->getfilters($page) + ) + ]; + } + + public function parsegetfilters($parameters, $whitelist){ + + $sanitized = []; + + // add npt token + if( + isset($parameters["npt"]) && + is_string($parameters["npt"]) + ){ + + $sanitized["npt"] = $parameters["npt"]; + }else{ + + $sanitized["npt"] = false; + } + + // we're iterating over $whitelist, so + // you can't polluate $sanitized with useless + // parameters + foreach($whitelist as $parameter => $value){ + + if(isset($parameters[$parameter])){ + + if(!is_string($parameters[$parameter])){ + + $sanitized[$parameter] = null; + continue; + } + + // parameter is already set, use that value + $sanitized[$parameter] = $parameters[$parameter]; + }else{ + + // parameter is not set, add it + if(is_string($value["option"])){ + + // special field: set default value manually + switch($value["option"]){ + + case "_DATE": + // no date set + $sanitized[$parameter] = false; + break; + + case "_SEARCH": + // no search set + $sanitized[$parameter] = ""; + break; + } + + }else{ + + // set a default value + $sanitized[$parameter] = array_keys($value["option"])[0]; + } + } + + // sanitize input + if(is_array($value["option"])){ + if( + !in_array( + $sanitized[$parameter], + $keys = array_keys($value["option"]) + ) + ){ + + $sanitized[$parameter] = $keys[0]; + } + }else{ + + // sanitize search & string + switch($value["option"]){ + + case "_DATE": + if($sanitized[$parameter] !== false){ + + $sanitized[$parameter] = strtotime($sanitized[$parameter]); + if($sanitized[$parameter] <= 0){ + + $sanitized[$parameter] = false; + } + } + break; + + case "_SEARCH": + + // get search string & bang + $sanitized[$parameter] = trim($sanitized[$parameter]); + $sanitized["bang"] = ""; + + if( + strlen($sanitized[$parameter]) !== 0 && + $sanitized[$parameter][0] == "!" + ){ + + $sanitized[$parameter] = explode(" ", $sanitized[$parameter], 2); + + $sanitized["bang"] = trim($sanitized[$parameter][0]); + + if(count($sanitized[$parameter]) === 2){ + + $sanitized[$parameter] = trim($sanitized[$parameter][1]); + }else{ + + $sanitized[$parameter] = ""; + } + + $sanitized["bang"] = ltrim($sanitized["bang"], "!"); + } + + $sanitized[$parameter] = ltrim($sanitized[$parameter], "! \n\r\t\v\x00"); + } + } + } + + // invert dates if needed + if( + isset($sanitized["older"]) && + isset($sanitized["newer"]) && + $sanitized["newer"] !== false && + $sanitized["older"] !== false && + $sanitized["newer"] > $sanitized["older"] + ){ + + // invert + [ + $sanitized["older"], + $sanitized["newer"] + ] = [ + $sanitized["newer"], + $sanitized["older"] + ]; + } + + return $sanitized; + } + + public function s_to_timestamp($seconds){ + + if(is_string($seconds)){ + + return "LIVE"; + } + + return ($seconds >= 60) ? ltrim(gmdate("H:i:s", $seconds), ":0") : gmdate("0:s", $seconds); + } + + public function generatehtmltabs($page, $query){ + + $html = null; + + foreach(["web", "images", "videos", "news"] as $type){ + + $html .= '<a href="/' . $type . '?s=' . urlencode($query); + + if(!empty($params)){ + + $html .= $params; + } + + $html .= '" class="tab'; + + if($type == $page){ + + $html .= ' selected'; + } + + $html .= '">' . ucfirst($type) . '</a>'; + } + + return $html; + } + + public function generatehtmlfilters($filters, $params){ + + $html = null; + + foreach($filters as $filter_name => $filter_values){ + + if(!isset($filter_values["display"])){ + + continue; + } + + $output = true; + $tmp = + '<div class="filter">' . + '<div class="title">' . htmlspecialchars($filter_values["display"]) . '</div>'; + + if(is_array($filter_values["option"])){ + + $tmp .= '<select name="' . $filter_name . '">'; + + foreach($filter_values["option"] as $option_name => $option_title){ + + $tmp .= '<option value="' . $option_name . '"'; + + if($params[$filter_name] == $option_name){ + + $tmp .= ' selected'; + } + + $tmp .= '>' . htmlspecialchars($option_title) . '</option>'; + } + + $tmp .= '</select>'; + }else{ + + switch($filter_values["option"]){ + + case "_DATE": + $tmp .= '<input type="date" name="' . $filter_name . '"'; + + if($params[$filter_name] !== false){ + + $tmp .= ' value="' . date("Y-m-d", $params[$filter_name]) . '"'; + } + + $tmp .= '>'; + break; + + default: + $output = false; + break; + } + } + + $tmp .= '</div>'; + + if($output === true){ + + $html .= $tmp; + } + } + + return $html; + } + + public function buildquery($gets, $ommit = false){ + + $out = []; + foreach($gets as $key => $value){ + + if( + $value == null || + $value == false || + $key == "npt" || + $key == "extendedsearch" || + $value == "any" || + $value == "all" || + ( + $ommit === true && + $key == "s" + ) + ){ + + continue; + } + + $out[$key] = $value; + } + + return http_build_query($out); + } + + public function htmlnextpage($gets, $npt, $page){ + + $query = $this->buildquery($gets); + + return $page . "?" . $query . "&npt=" . $npt; + } +} diff --git a/lib/fuckhtml.php b/lib/fuckhtml.php new file mode 100644 index 0000000..8802511 --- /dev/null +++ b/lib/fuckhtml.php @@ -0,0 +1,361 @@ +<?php +class fuckhtml{ + + public function __construct($html = null, $isfile = false){ + + if($html !== null){ + + $this->load($html, $isfile); + } + } + + public function load($html, $isfile = false){ + + if(is_array($html)){ + + if(!isset($html["innerHTML"])){ + + throw new Exception("(load) Supplied array doesn't contain a innerHTML index"); + } + $html = $html["innerHTML"]; + } + + if($isfile){ + + $handle = fopen($html, "r"); + $fetch = fread($handle, filesize($html)); + fclose($handle); + + $this->html = $fetch; + }else{ + + $this->html = $html; + } + + $this->strlen = strlen($this->html); + } + + public function getElementsByTagName(string $tagname){ + + $out = []; + + /* + Scrape start of the tag. Example + <div class="mydiv"> ... + */ + + if($tagname == "*"){ + + $tagname = '[^\/<>\s]+'; + }else{ + + $tagname = preg_quote(strtolower($tagname)); + } + + preg_match_all( + '/<\s*(' . $tagname . ')(\s(?:[^>\'"]*|"[^"]*"|\'[^\']*\')+)?\s*>/i', + /* '/<\s*(' . $tagname . ')(\s[\S\s]*?)?>/i', */ + $this->html, + $starting_tags, + PREG_OFFSET_CAPTURE + ); + + for($i=0; $i<count($starting_tags[0]); $i++){ + + /* + Parse attributes + */ + $attributes = []; + + preg_match_all( + '/([^\/\s\\=]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s]*))?/', + $starting_tags[2][$i][0], + $regex_attributes + ); + + for($k=0; $k<count($regex_attributes[0]); $k++){ + + if(trim($regex_attributes[2][$k]) == ""){ + + $attributes[$regex_attributes[1][$k]] = + "true"; + + continue; + } + + $attributes[$regex_attributes[1][$k]] = + trim($regex_attributes[2][$k], "'\" \n\r\t\v\x00"); + } + + $out[] = [ + "tagName" => strtolower($starting_tags[1][$i][0]), + "startPos" => $starting_tags[0][$i][1], + "endPos" => 0, + "startTag" => $starting_tags[0][$i][0], + "attributes" => $attributes, + "innerHTML" => null + ]; + } + + /* + Get innerHTML + */ + // get closing tag positions + preg_match_all( + '/<\s*\/\s*(' . $tagname . ')\s*>/i', + $this->html, + $regex_closing_tags, + PREG_OFFSET_CAPTURE + ); + + // merge opening and closing tags together + for($i=0; $i<count($regex_closing_tags[1]); $i++){ + + $out[] = [ + "tagName" => strtolower($regex_closing_tags[1][$i][0]), + "endTag" => $regex_closing_tags[0][$i][0], + "startPos" => $regex_closing_tags[0][$i][1] + ]; + } + + usort( + $out, + function($a, $b){ + + return $a["startPos"] > $b["startPos"]; + } + ); + + // computer the indent level for each element + $level = []; + $count = count($out); + + for($i=0; $i<$count; $i++){ + + if(!isset($level[$out[$i]["tagName"]])){ + + $level[$out[$i]["tagName"]] = 0; + } + + if(isset($out[$i]["startTag"])){ + + // encountered starting tag + $level[$out[$i]["tagName"]]++; + $out[$i]["level"] = $level[$out[$i]["tagName"]]; + }else{ + + // encountered closing tag + $out[$i]["level"] = $level[$out[$i]["tagName"]]; + $level[$out[$i]["tagName"]]--; + } + } + + // if the indent level is the same for a div, + // we encountered _THE_ closing tag + for($i=0; $i<$count; $i++){ + + if(!isset($out[$i]["startTag"])){ + + continue; + } + + for($k=$i; $k<$count; $k++){ + + if( + isset($out[$k]["endTag"]) && + $out[$i]["tagName"] == $out[$k]["tagName"] && + $out[$i]["level"] + === $out[$k]["level"] + ){ + + $startlen = strlen($out[$i]["startTag"]); + $endlen = strlen($out[$k]["endTag"]); + + $out[$i]["endPos"] = $out[$k]["startPos"] + $endlen; + + $out[$i]["innerHTML"] = + substr( + $this->html, + $out[$i]["startPos"] + $startlen, + $out[$k]["startPos"] - ($out[$i]["startPos"] + $startlen) + ); + + $out[$i]["outerHTML"] = + substr( + $this->html, + $out[$i]["startPos"], + $out[$k]["startPos"] - $out[$i]["startPos"] + $endlen + ); + + break; + } + } + } + + // filter out ending divs + for($i=0; $i<$count; $i++){ + + if(isset($out[$i]["endTag"])){ + + unset($out[$i]); + } + + unset($out[$i]["startTag"]); + } + + return array_values($out); + } + + public function getElementsByAttributeName(string $name, $collection = null){ + + if($collection === null){ + + $collection = $this->getElementsByTagName("*"); + }elseif(is_string($collection)){ + + $collection = $this->getElementsByTagName($collection); + } + + $return = []; + foreach($collection as $elem){ + + foreach($elem["attributes"] as $attrib_name => $attrib_value){ + + if($attrib_name == $name){ + + $return[] = $elem; + continue 2; + } + } + } + + return $return; + } + + public function getElementsByFuzzyAttributeValue(string $name, string $value, $collection = null){ + + $elems = $this->getElementsByAttributeName($name, $collection); + $value = explode(" ", $value); + + $return = []; + + foreach($elems as $elem){ + + foreach($elem["attributes"] as $attrib_name => $attrib_value){ + + $attrib_value = explode(" ", $attrib_value); + $ac = count($attrib_value); + $nc = count($value); + $cr = 0; + + for($i=0; $i<$nc; $i++){ + + for($k=0; $k<$ac; $k++){ + + if($value[$i] == $attrib_value[$k]){ + + $cr++; + } + } + } + + if($cr === $nc){ + + $return[] = $elem; + continue 2; + } + } + } + + return $return; + } + + public function getElementsByAttributeValue(string $name, string $value, $collection = null){ + + $elems = $this->getElementsByAttributeName($name, $collection); + + $return = []; + + foreach($elems as $elem){ + + foreach($elem["attributes"] as $attrib_name => $attrib_value){ + + if($attrib_value == $value){ + + $return[] = $elem; + continue 2; + } + } + } + + return $return; + } + + public function getElementById(string $idname, $collection = null){ + + $id = $this->getElementsByAttributeValue("id", $idname, $collection); + + if(count($id) !== 0){ + + return $id[0]; + } + + return false; + } + + public function getElementsByClassName(string $classname, $collection = null){ + + return $this->getElementsByFuzzyAttributeValue("class", $classname, $collection); + } + + public function getTextContent($html, $whitespace = false, $trim = true){ + + if(is_array($html)){ + + if(!isset($html["innerHTML"])){ + + throw new Exception("(getTextContent) Supplied array doesn't contain a innerHTML index"); + } + $html = $html["innerHTML"]; + } + + $html = + preg_split('/\n|<\/?br>/i', $html); + + $out = ""; + for($i=0; $i<count($html); $i++){ + + $tmp = + html_entity_decode( + strip_tags( + $html[$i] + ), + ENT_QUOTES | ENT_XML1, "UTF-8" + ); + + if($trim){ + + $tmp = trim($tmp); + } + + $out .= $tmp; + + if($whitespace === true){ + + $out .= "\n"; + }else{ + + $out .= " "; + } + } + + if($trim){ + + return trim($out); + } + + return $out; + } +} + +?> diff --git a/lib/img404.png b/lib/img404.png Binary files differnew file mode 100644 index 0000000..4549dee --- /dev/null +++ b/lib/img404.png diff --git a/lib/nextpage.php b/lib/nextpage.php new file mode 100644 index 0000000..a883e49 --- /dev/null +++ b/lib/nextpage.php @@ -0,0 +1,106 @@ +<?php + +class nextpage{ + + public function __construct($scraper){ + + $this->scraper = $scraper; + } + + public function store($payload, $page){ + + $page = $page[0]; + $password = random_bytes(256); // 2048 bit + $salt = random_bytes(16); + $key = hash_pbkdf2("sha512", $password, $salt, 20000, 32, true); + $iv = + random_bytes( + openssl_cipher_iv_length("aes-256-gcm") + ); + + $tag = ""; + $out = openssl_encrypt($payload, "aes-256-gcm", $key, OPENSSL_RAW_DATA, $iv, $tag, "", 16); + + $key = apcu_inc("key", 1); + + apcu_store( + $page . "." . + $this->scraper . + (string)($key), + gzdeflate($salt.$iv.$out.$tag), + 420 // cache information for 7 minutes blaze it + ); + + return + $this->scraper . $key . "." . + rtrim(strtr(base64_encode($password), '+/', '-_'), '='); + } + + public function get($npt, $page){ + + $page = $page[0]; + $explode = explode(".", $npt, 2); + + if(count($explode) !== 2){ + + throw new Exception("Malformed nextPageToken!"); + } + + $apcu = $page . "." . $explode[0]; + $key = $explode[1]; + + $payload = apcu_fetch($apcu); + + if($payload === false){ + + throw new Exception("The nextPageToken is invalid or has expired!"); + } + + $key = + base64_decode( + str_pad( + strtr($key, '-_', '+/'), + strlen($key) % 4, + '=', + STR_PAD_RIGHT + ) + ); + + $payload = gzinflate($payload); + + $key = + hash_pbkdf2( + "sha512", + $key, + substr($payload, 0, 16), // salt + 20000, + 32, + true + ); + $ivlen = openssl_cipher_iv_length("aes-256-gcm"); + + $payload = + openssl_decrypt( + substr( + $payload, + 16 + $ivlen, + -16 + ), + "aes-256-gcm", + $key, + OPENSSL_RAW_DATA, + substr($payload, 16, $ivlen), + substr($payload, -16) + ); + + if($payload === false){ + + throw new Exception("The nextPageToken is invalid or has expired!"); + } + + // remove the key after using + apcu_delete($apcu); + + return $payload; + } +} diff --git a/lib/type-todo.php b/lib/type-todo.php new file mode 100644 index 0000000..f813543 --- /dev/null +++ b/lib/type-todo.php @@ -0,0 +1,132 @@ + + public function type($get){ + + $search = $get["s"]; + $bang = $get["bang"]; + + if(empty($search)){ + + if(!empty($bang)){ + + // !youtube + $conn = pg_connect("host=localhost dbname=4get user=postgres password=postgres"); + + pg_prepare($conn, "bang_get", "SELECT bang,name FROM bangs WHERE bang LIKE $1 ORDER BY bang ASC LIMIT 8"); + $q = pg_execute($conn, "bang_get", ["$bang%"]); + + $results = []; + while($row = pg_fetch_array($q, null, PGSQL_ASSOC)){ + + $results[] = [ + "s" => "!" . $row["bang"], + "n" => $row["name"] + ]; + } + + return $results; + }else{ + + // everything is empty + // lets just return a bang list + return [ + [ + "s" => "!w", + "n" => "Wikipedia", + "u" => "https://en.wikipedia.org/wiki/Special:Search?search={%q%}" + ], + [ + "s" => "!4ch", + "n" => "4chan Board", + "u" => "https://find.4chan.org/?q={%q%}" + ], + [ + "s" => "!a", + "n" => "Amazon", + "u" => "https://www.amazon.com/s?k={%q%}" + ], + [ + "s" => "!e", + "n" => "eBay", + "u" => "https://www.ebay.com/sch/items/?_nkw={%q%}" + ], + [ + "s" => "!so", + "n" => "Stack Overflow", + "u" => "http://stackoverflow.com/search?q={%q%}" + ], + [ + "s" => "!gh", + "n" => "GitHub", + "u" => "https://github.com/search?utf8=%E2%9C%93&q={%q%}" + ], + [ + "s" => "!tw", + "n" => "Twitter", + "u" => "https://twitter.com/search?q={%q%}" + ], + [ + "s" => "!r", + "n" => "Reddit", + "u" => "https://www.reddit.com/search?q={%q%}" + ], + ]; + } + } + + // now we know search isnt empty + if(!empty($bang)){ + + // check if the bang exists + $conn = pg_connect("host=localhost dbname=4get user=postgres password=postgres"); + + pg_prepare($conn, "bang_get_single", "SELECT bang,name FROM bangs WHERE bang = $1 LIMIT 1"); + $q = pg_execute($conn, "bang_get_single", [$bang]); + + $row = pg_fetch_array($q, null, PGSQL_ASSOC); + + if(isset($row["bang"])){ + + $bang = "!$bang "; + }else{ + + $bang = ""; + } + } + + try{ + $res = $this->get( + "https://duckduckgo.com/ac/", + [ + "q" => strtolower($search) + ], + ddg::req_xhr + ); + + $res = json_decode($res, true); + + }catch(Exception $e){ + + throw new Exception("Failed to get /ac/"); + } + + $arr = []; + for($i=0; $i<count($res); $i++){ + + if($i === 8){break;} + + if(empty($bang)){ + + $arr[] = [ + "s" => $res[$i]["phrase"] + ]; + }else{ + + $arr[] = [ + "s" => $bang . $res[$i]["phrase"], + "n" => $row["name"] + ]; + } + } + + return $arr; + } |