From 0d98d7839d1c3da75b95ef29ce12ef54a2a20094 Mon Sep 17 00:00:00 2001 From: lolcat Date: Thu, 16 May 2024 17:22:49 -0400 Subject: added greppr support also btw im not dead --- README.md | 11 +- api/v1/ac.php | 12 +- api/v1/images.php | 2 +- api/v1/music.php | 2 +- api/v1/news.php | 2 +- api/v1/videos.php | 2 +- api/v1/web.php | 2 +- data/config.php | 3 +- lib/backend.php | 6 +- lib/frontend.php | 1 + lib/fuckhtml.php | 4 +- scraper/greppr.php | 429 +++++++++++++++++++++++++++++++++++++++++++++++++++++ scraper/sc.php | 17 ++- settings.php | 4 + 14 files changed, 469 insertions(+), 28 deletions(-) create mode 100644 scraper/greppr.php diff --git a/README.md b/README.md index 38ebe28..6cc82a7 100644 --- a/README.md +++ b/README.md @@ -36,11 +36,12 @@ tl;dr the best way to actually browse for shit. | Brave | Brave | DuckDuckGo | Brave | | DuckDuckGo | | Yandex | Yandex | Brave | Google | | Yandex | | Google | Google | Yandex | Qwant | | Google | -| Qwant | Qwant | Google | Mojeek | | Yep | -| Yep | Pinterest | Qwant | | | Marginalia | -| Crowdview | Yep | | | | YouTube | -| Mwmbl | Imgur | | | | Soundcloud | -| Mojeek | FindThatMeme | | | | | +| Qwant | Qwant | Google | Mojeek | | Qwant | +| Yep | Yep | Qwant | | | Yep | +| Greppr | Imgur | | | | Marginalia | +| Crowdview | FindThatMeme | | | | YouTube | +| Mwmbl | | | | | Soundcloud | +| Mojeek | | | | | | | Marginalia | | | | | | | wiby | | | | | | | Curlie | | | | | | diff --git a/api/v1/ac.php b/api/v1/ac.php index 9d9f534..236dc7b 100644 --- a/api/v1/ac.php +++ b/api/v1/ac.php @@ -100,7 +100,7 @@ class autocomplete{ $_GET["s"], $json ], - JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES + JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE ); break; @@ -135,7 +135,7 @@ class autocomplete{ $_GET["s"], $json ], - JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES + JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE ); break; @@ -154,7 +154,7 @@ class autocomplete{ $_GET["s"], $json ], - JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES + JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE ); break; @@ -167,7 +167,7 @@ class autocomplete{ $_GET["s"], $json[1] // ensure it contains valid key 0 ], - JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES + JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE ); break; } @@ -221,7 +221,7 @@ class autocomplete{ echo json_encode( ["error" => $error], - JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES + JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE ); die(); } @@ -233,7 +233,7 @@ class autocomplete{ $_GET["s"], [] ], - JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES + JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE ); die(); } diff --git a/api/v1/images.php b/api/v1/images.php index 348dda7..de2c5a9 100644 --- a/api/v1/images.php +++ b/api/v1/images.php @@ -30,7 +30,7 @@ $get = $frontend->parsegetfilters($_GET, $filters); try{ echo json_encode( $scraper->image($get), - JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES + JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE ); }catch(Exception $e){ diff --git a/api/v1/music.php b/api/v1/music.php index a1359eb..58985e3 100644 --- a/api/v1/music.php +++ b/api/v1/music.php @@ -30,7 +30,7 @@ $get = $frontend->parsegetfilters($_GET, $filters); try{ echo json_encode( $scraper->music($get), - JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES + JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE ); }catch(Exception $e){ diff --git a/api/v1/news.php b/api/v1/news.php index ca11b13..ab38781 100644 --- a/api/v1/news.php +++ b/api/v1/news.php @@ -30,7 +30,7 @@ $get = $frontend->parsegetfilters($_GET, $filters); try{ echo json_encode( $scraper->news($get), - JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES + JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE ); }catch(Exception $e){ diff --git a/api/v1/videos.php b/api/v1/videos.php index c0a7507..1d23780 100644 --- a/api/v1/videos.php +++ b/api/v1/videos.php @@ -30,7 +30,7 @@ $get = $frontend->parsegetfilters($_GET, $filters); try{ echo json_encode( $scraper->video($get), - JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES + JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE ); }catch(Exception $e){ diff --git a/api/v1/web.php b/api/v1/web.php index df5cec1..6a9c030 100644 --- a/api/v1/web.php +++ b/api/v1/web.php @@ -43,7 +43,7 @@ try{ echo json_encode( $scraper->web($get), - JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES + JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE ); }catch(Exception $e){ diff --git a/data/config.php b/data/config.php index 42a968a..13be0f4 100644 --- a/data/config.php +++ b/data/config.php @@ -43,7 +43,7 @@ class config{ // If this regex expression matches on the user agent, it blocks the request // Not useful at all against a targetted attack - const HEADER_REGEX = '/bot|wget|curl|python-requests|scrapy|go-http-client|ruby|yahoo|spider/i'; + const HEADER_REGEX = '/bot|wget|curl|python-requests|scrapy|go-http-client|ruby|yahoo|spider|qwant/i'; // Block clients who present any of the following headers in their request (SPECIFY IN !!lowercase!!) // Eg: ["x-forwarded-for", "x-via", "forwarded-for", "via"]; @@ -128,6 +128,7 @@ class config{ const PROXY_PINTEREST = false; const PROXY_SEZNAM = false; const PROXY_NAVER = false; + const PROXY_GREPPR = false; const PROXY_CROWDVIEW = false; const PROXY_MWMBL = false; const PROXY_FTM = false; // findthatmeme diff --git a/lib/backend.php b/lib/backend.php index 7631ff3..cfb04a9 100644 --- a/lib/backend.php +++ b/lib/backend.php @@ -36,7 +36,7 @@ class backend{ } // this function is also called directly on nextpage - public function assign_proxy(&$curlproc, $ip){ + public function assign_proxy(&$curlproc, string $ip){ // parse proxy line [ @@ -91,7 +91,7 @@ class backend{ /* Next page stuff */ - public function store($payload, $page, $proxy){ + public function store(string $payload, string $page, string $proxy){ $key = sodium_crypto_secretbox_keygen(); $nonce = random_bytes(SODIUM_CRYPTO_SECRETBOX_NONCEBYTES); @@ -120,7 +120,7 @@ class backend{ rtrim(strtr(base64_encode($key), '+/', '-_'), '='); } - public function get($npt, $page){ + public function get(string $npt, string $page){ $page = $page[0]; $explode = explode(".", $npt, 2); diff --git a/lib/frontend.php b/lib/frontend.php index a48b722..1c3eb09 100644 --- a/lib/frontend.php +++ b/lib/frontend.php @@ -925,6 +925,7 @@ class frontend{ "google" => "Google", "qwant" => "Qwant", "yep" => "Yep", + "greppr" => "Greppr", "crowdview" => "Crowdview", "mwmbl" => "Mwmbl", "mojeek" => "Mojeek", diff --git a/lib/fuckhtml.php b/lib/fuckhtml.php index 6895fbf..f3a6efe 100644 --- a/lib/fuckhtml.php +++ b/lib/fuckhtml.php @@ -321,11 +321,11 @@ class fuckhtml{ throw new Exception("(getTextContent) Supplied array doesn't contain an innerHTML index"); } + $html = $html["innerHTML"]; } - $html = - preg_split('/\n|<\/?br>/i', $html); + $html = preg_split('/\n|<\/?br>/i', $html); $out = ""; for($i=0; $ibackend = new backend("greppr"); + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); + } + + public function getfilters($page){ + + return []; + } + + private function get($proxy, $url, $get = [], $cookie = false){ + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + + if($cookie === false){ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1"] + ); + }else{ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "Cookie: PHPSESSID=" . $cookie, + "DNT: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1"] + ); + } + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); + + $headers = []; + + curl_setopt( + $curlproc, + CURLOPT_HEADERFUNCTION, + function($curlproc, $header) use (&$headers){ + + $len = strlen($header); + $header = explode(':', $header, 2); + + if(count($header) < 2){ + + // ignore invalid headers + return $len; + } + + $headers[strtolower(trim($header[0]))] = trim($header[1]); + + return $len; + } + ); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + + return [ + "headers" => $headers, + "data" => $data + ]; + } + + public function web($get, $first_attempt = true){ + + if($get["npt"]){ + + [$q, $proxy] = $this->backend->get($get["npt"], "web"); + + $q = json_decode($q, true); + + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); + } + + // get token + // token[0] = static token that changes once a day + // token[1] = dynamic token that changes on every request + // token[1] = PHPSESSID cookie + $tokens = apcu_fetch("greppr_token"); + + if( + $tokens === false || + $first_attempt === false // force token fetch + ){ + + // we haven't gotten the token yet, get it + try{ + + $response = + $this->get( + $proxy, + "https://greppr.org", + [] + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search tokens"); + } + + $tokens = $this->parse_token($response); + + if($tokens === false){ + + throw new Exception("Failed to grep search tokens"); + } + } + + try{ + + if($get["npt"]){ + + $params = [ + $tokens[0] => $q["q"], + "s" => $q["s"], + "l" => 30, + "n" => $tokens[1] + ]; + }else{ + + $params = [ + $tokens[0] => $search, + "n" => $tokens[1] + ]; + } + + $searchresults = $this->get( + $proxy, + "https://greppr.org/search", + $params, + $tokens[2] + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + + if(strlen($searchresults["data"]) === 0){ + + // redirected to main page, which means we got old token + // generate a new one + + // ... unless we just tried to do that + if($first_attempt === false){ + + throw new Exception("Failed to get a new search token"); + } + + $this->get($get, false); + } + + // refresh the token with new data (this also triggers fuckhtml load) + $this->parse_token($searchresults, $tokens[2]); + + // response object + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + // get results for later + $results = + $this->fuckhtml + ->getElementsByClassName( + "result", + "div" + ); + + // check for next page + $next_elem = + $this->fuckhtml + ->getElementsByClassName( + "pagination", + "ul" + ); + + if(count($next_elem) !== 0){ + + $this->fuckhtml->load($next_elem[0]); + + $as = + $this->fuckhtml + ->getElementsByClassName( + "page-link", + "a" + ); + + $break = false; + foreach($as as $a){ + + if($break === true){ + + parse_str( + $this->fuckhtml + ->getTextContent( + $a["attributes"]["href"] + ), + $values + ); + + $values = array_values($values); + + $out["npt"] = + $this->backend->store( + json_encode( + [ + "q" => $values[0], + "s" => $values[1] + ] + ), + "web", + $proxy + ); + break; + } + + if($a["attributes"]["href"] == "#"){ + + $break = true; + } + } + } + + // scrape results + foreach($results as $result){ + + $this->fuckhtml->load($result); + + $a = + $this->fuckhtml + ->getElementsByTagName( + "a" + )[0]; + + $description = + $this->fuckhtml + ->getElementsByFuzzyAttributeValue( + "style", + "color:#777777;", + "p" + ); + + if(count($description) === 0){ + + $description = null; + }else{ + + $description = + $this->fuckhtml + ->getTextContent( + $description[0] + ); + } + + $date = + $this->fuckhtml + ->getElementsByTagName( + "p" + ); + + $date = + strtotime( + explode( + "Added:", + $this->fuckhtml + ->getTextContent( + $date[count($date) - 1]["innerHTML"] + ) + )[1] + ); + + $out["web"][] = [ + "title" => + $this->fuckhtml + ->getTextContent( + $a["innerHTML"] + ), + "description" => $description, + "url" => + $this->fuckhtml + ->getTextContent( + $a["attributes"]["href"] + ), + "date" => $date, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + } + + return $out; + } + + private function parse_token($response, $cookie = false){ + + $this->fuckhtml->load($response["data"]); + + $scripts = + $this->fuckhtml + ->getElementsByTagName("script"); + + $found = false; + foreach($scripts as $script){ + + preg_match( + '/window\.location ?= ?\'\/search\?([^=]+).*&n=([0-9]+)/', + $script["innerHTML"], + $tokens + ); + + if(isset($tokens[1])){ + + $found = true; + break; + } + } + + if($found === false){ + + return false; + } + + $tokens = [ + $tokens[1], + $tokens[2] + ]; + + if($cookie !== false){ + + // we already specified a cookie, so use the one we have already + $tokens[] = $cookie; + apcu_store("greppr_token", $tokens); + + return $tokens; + } + + if(!isset($response["headers"]["set-cookie"])){ + + // server didn't send a cookie + return false; + } + + // get cookie + preg_match( + '/PHPSESSID=([^;]+)/', + $response["headers"]["set-cookie"], + $cookie + ); + + if(!isset($cookie[1])){ + + // server sent an unexpected cookie + return false; + } + + $tokens[] = $cookie[1]; + apcu_store("greppr_token", $tokens); + + return $tokens; + } +} diff --git a/scraper/sc.php b/scraper/sc.php index e2e7385..2b847c7 100644 --- a/scraper/sc.php +++ b/scraper/sc.php @@ -398,12 +398,17 @@ class sc{ if($token === false){ - $js = - $this->get( - $proxy, - "https://a-v2.sndcdn.com/assets/1-c3e4038d.js", - [] - ); + try{ + $js = + $this->get( + $proxy, + "https://a-v2.sndcdn.com/assets/1-c3e4038d.js", + [] + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search token"); + } preg_match( '/client_id=([^"]+)/', diff --git a/settings.php b/settings.php index 662189c..33185e9 100644 --- a/settings.php +++ b/settings.php @@ -129,6 +129,10 @@ $settings = [ "value" => "yep", "text" => "Yep" ], + [ + "value" => "greppr", + "text" => "Greppr" + ], [ "value" => "crowdview", "text" => "Crowdview" -- cgit v1.2.3