summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorlolcat <will@lolcat.ca>2024-05-16 17:22:49 -0400
committerlolcat <will@lolcat.ca>2024-05-16 17:22:49 -0400
commit0d98d7839d1c3da75b95ef29ce12ef54a2a20094 (patch)
treec51d5a0dcfc99d91211b65ed9692974305a72c90
parentf8d46df1e858401d93c5fa885777113994a03c86 (diff)
added greppr support also btw im not dead
-rw-r--r--README.md11
-rw-r--r--api/v1/ac.php12
-rw-r--r--api/v1/images.php2
-rw-r--r--api/v1/music.php2
-rw-r--r--api/v1/news.php2
-rw-r--r--api/v1/videos.php2
-rw-r--r--api/v1/web.php2
-rw-r--r--data/config.php3
-rw-r--r--lib/backend.php6
-rw-r--r--lib/frontend.php1
-rw-r--r--lib/fuckhtml.php4
-rw-r--r--scraper/greppr.php429
-rw-r--r--scraper/sc.php17
-rw-r--r--settings.php4
14 files changed, 469 insertions, 28 deletions
diff --git a/README.md b/README.md
index 38ebe28..6cc82a7 100644
--- a/README.md
+++ b/README.md
@@ -36,11 +36,12 @@ tl;dr the best way to actually browse for shit.
| Brave | Brave | DuckDuckGo | Brave | | DuckDuckGo |
| Yandex | Yandex | Brave | Google | | Yandex |
| Google | Google | Yandex | Qwant | | Google |
-| Qwant | Qwant | Google | Mojeek | | Yep |
-| Yep | Pinterest | Qwant | | | Marginalia |
-| Crowdview | Yep | | | | YouTube |
-| Mwmbl | Imgur | | | | Soundcloud |
-| Mojeek | FindThatMeme | | | | |
+| Qwant | Qwant | Google | Mojeek | | Qwant |
+| Yep | Yep | Qwant | | | Yep |
+| Greppr | Imgur | | | | Marginalia |
+| Crowdview | FindThatMeme | | | | YouTube |
+| Mwmbl | | | | | Soundcloud |
+| Mojeek | | | | | |
| Marginalia | | | | | |
| wiby | | | | | |
| Curlie | | | | | |
diff --git a/api/v1/ac.php b/api/v1/ac.php
index 9d9f534..236dc7b 100644
--- a/api/v1/ac.php
+++ b/api/v1/ac.php
@@ -100,7 +100,7 @@ class autocomplete{
$_GET["s"],
$json
],
- JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
+ JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE
);
break;
@@ -135,7 +135,7 @@ class autocomplete{
$_GET["s"],
$json
],
- JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
+ JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE
);
break;
@@ -154,7 +154,7 @@ class autocomplete{
$_GET["s"],
$json
],
- JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
+ JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE
);
break;
@@ -167,7 +167,7 @@ class autocomplete{
$_GET["s"],
$json[1] // ensure it contains valid key 0
],
- JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
+ JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE
);
break;
}
@@ -221,7 +221,7 @@ class autocomplete{
echo json_encode(
["error" => $error],
- JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
+ JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE
);
die();
}
@@ -233,7 +233,7 @@ class autocomplete{
$_GET["s"],
[]
],
- JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
+ JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE
);
die();
}
diff --git a/api/v1/images.php b/api/v1/images.php
index 348dda7..de2c5a9 100644
--- a/api/v1/images.php
+++ b/api/v1/images.php
@@ -30,7 +30,7 @@ $get = $frontend->parsegetfilters($_GET, $filters);
try{
echo json_encode(
$scraper->image($get),
- JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
+ JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE
);
}catch(Exception $e){
diff --git a/api/v1/music.php b/api/v1/music.php
index a1359eb..58985e3 100644
--- a/api/v1/music.php
+++ b/api/v1/music.php
@@ -30,7 +30,7 @@ $get = $frontend->parsegetfilters($_GET, $filters);
try{
echo json_encode(
$scraper->music($get),
- JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
+ JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE
);
}catch(Exception $e){
diff --git a/api/v1/news.php b/api/v1/news.php
index ca11b13..ab38781 100644
--- a/api/v1/news.php
+++ b/api/v1/news.php
@@ -30,7 +30,7 @@ $get = $frontend->parsegetfilters($_GET, $filters);
try{
echo json_encode(
$scraper->news($get),
- JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
+ JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE
);
}catch(Exception $e){
diff --git a/api/v1/videos.php b/api/v1/videos.php
index c0a7507..1d23780 100644
--- a/api/v1/videos.php
+++ b/api/v1/videos.php
@@ -30,7 +30,7 @@ $get = $frontend->parsegetfilters($_GET, $filters);
try{
echo json_encode(
$scraper->video($get),
- JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
+ JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE
);
}catch(Exception $e){
diff --git a/api/v1/web.php b/api/v1/web.php
index df5cec1..6a9c030 100644
--- a/api/v1/web.php
+++ b/api/v1/web.php
@@ -43,7 +43,7 @@ try{
echo
json_encode(
$scraper->web($get),
- JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
+ JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE
);
}catch(Exception $e){
diff --git a/data/config.php b/data/config.php
index 42a968a..13be0f4 100644
--- a/data/config.php
+++ b/data/config.php
@@ -43,7 +43,7 @@ class config{
// If this regex expression matches on the user agent, it blocks the request
// Not useful at all against a targetted attack
- const HEADER_REGEX = '/bot|wget|curl|python-requests|scrapy|go-http-client|ruby|yahoo|spider/i';
+ const HEADER_REGEX = '/bot|wget|curl|python-requests|scrapy|go-http-client|ruby|yahoo|spider|qwant/i';
// Block clients who present any of the following headers in their request (SPECIFY IN !!lowercase!!)
// Eg: ["x-forwarded-for", "x-via", "forwarded-for", "via"];
@@ -128,6 +128,7 @@ class config{
const PROXY_PINTEREST = false;
const PROXY_SEZNAM = false;
const PROXY_NAVER = false;
+ const PROXY_GREPPR = false;
const PROXY_CROWDVIEW = false;
const PROXY_MWMBL = false;
const PROXY_FTM = false; // findthatmeme
diff --git a/lib/backend.php b/lib/backend.php
index 7631ff3..cfb04a9 100644
--- a/lib/backend.php
+++ b/lib/backend.php
@@ -36,7 +36,7 @@ class backend{
}
// this function is also called directly on nextpage
- public function assign_proxy(&$curlproc, $ip){
+ public function assign_proxy(&$curlproc, string $ip){
// parse proxy line
[
@@ -91,7 +91,7 @@ class backend{
/*
Next page stuff
*/
- public function store($payload, $page, $proxy){
+ public function store(string $payload, string $page, string $proxy){
$key = sodium_crypto_secretbox_keygen();
$nonce = random_bytes(SODIUM_CRYPTO_SECRETBOX_NONCEBYTES);
@@ -120,7 +120,7 @@ class backend{
rtrim(strtr(base64_encode($key), '+/', '-_'), '=');
}
- public function get($npt, $page){
+ public function get(string $npt, string $page){
$page = $page[0];
$explode = explode(".", $npt, 2);
diff --git a/lib/frontend.php b/lib/frontend.php
index a48b722..1c3eb09 100644
--- a/lib/frontend.php
+++ b/lib/frontend.php
@@ -925,6 +925,7 @@ class frontend{
"google" => "Google",
"qwant" => "Qwant",
"yep" => "Yep",
+ "greppr" => "Greppr",
"crowdview" => "Crowdview",
"mwmbl" => "Mwmbl",
"mojeek" => "Mojeek",
diff --git a/lib/fuckhtml.php b/lib/fuckhtml.php
index 6895fbf..f3a6efe 100644
--- a/lib/fuckhtml.php
+++ b/lib/fuckhtml.php
@@ -321,11 +321,11 @@ class fuckhtml{
throw new Exception("(getTextContent) Supplied array doesn't contain an innerHTML index");
}
+
$html = $html["innerHTML"];
}
- $html =
- preg_split('/\n|<\/?br>/i', $html);
+ $html = preg_split('/\n|<\/?br>/i', $html);
$out = "";
for($i=0; $i<count($html); $i++){
diff --git a/scraper/greppr.php b/scraper/greppr.php
new file mode 100644
index 0000000..402c3d2
--- /dev/null
+++ b/scraper/greppr.php
@@ -0,0 +1,429 @@
+<?php
+
class greppr{

	// Project-local helpers, assigned in the constructor.
	// Declared explicitly: dynamic properties are deprecated as of PHP 8.2.
	public $backend;  // proxy/next-page-token backend, keyed "greppr"
	public $fuckhtml; // lightweight HTML parser

	public function __construct(){

		include "lib/backend.php";
		$this->backend = new backend("greppr");

		include "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
	}

	/**
	 * Greppr exposes no search filters; always returns an empty set.
	 */
	public function getfilters($page){

		return [];
	}

	/**
	 * Perform an HTTP GET through the assigned proxy.
	 *
	 * @param string       $proxy  proxy line understood by backend::assign_proxy
	 * @param string       $url    base URL (query string appended from $get)
	 * @param array        $get    query parameters
	 * @param string|false $cookie PHPSESSID value, or false to send no cookie
	 *
	 * @return array{headers: array, data: string} lowercased response headers + body
	 * @throws Exception on any curl transport error
	 */
	private function get($proxy, $url, $get = [], $cookie = false){

		$curlproc = curl_init();

		if($get !== []){
			$get = http_build_query($get);
			$url .= "?" . $get;
		}

		curl_setopt($curlproc, CURLOPT_URL, $url);

		// empty string = accept every encoding curl supports, decode automatically
		curl_setopt($curlproc, CURLOPT_ENCODING, "");

		if($cookie === false){

			curl_setopt($curlproc, CURLOPT_HTTPHEADER,
				["User-Agent: " . config::USER_AGENT,
				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
				"Accept-Language: en-US,en;q=0.5",
				"Accept-Encoding: gzip",
				"DNT: 1",
				"Connection: keep-alive",
				"Upgrade-Insecure-Requests: 1",
				"Sec-Fetch-Dest: document",
				"Sec-Fetch-Mode: navigate",
				"Sec-Fetch-Site: none",
				"Sec-Fetch-User: ?1"]
			);
		}else{

			// same headers, plus the session cookie greppr ties its tokens to
			curl_setopt($curlproc, CURLOPT_HTTPHEADER,
				["User-Agent: " . config::USER_AGENT,
				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
				"Accept-Language: en-US,en;q=0.5",
				"Accept-Encoding: gzip",
				"Cookie: PHPSESSID=" . $cookie,
				"DNT: 1",
				"Connection: keep-alive",
				"Upgrade-Insecure-Requests: 1",
				"Sec-Fetch-Dest: document",
				"Sec-Fetch-Mode: navigate",
				"Sec-Fetch-Site: none",
				"Sec-Fetch-User: ?1"]
			);
		}

		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

		$this->backend->assign_proxy($curlproc, $proxy);

		// collect response headers; we need set-cookie to capture PHPSESSID
		$headers = [];

		curl_setopt(
			$curlproc,
			CURLOPT_HEADERFUNCTION,
			function($curlproc, $header) use (&$headers){

				$len = strlen($header);
				$header = explode(':', $header, 2);

				if(count($header) < 2){

					// ignore invalid headers
					return $len;
				}

				$headers[strtolower(trim($header[0]))] = trim($header[1]);

				return $len;
			}
		);

		$data = curl_exec($curlproc);

		if(curl_errno($curlproc)){

			throw new Exception(curl_error($curlproc));
		}

		curl_close($curlproc);

		return [
			"headers" => $headers,
			"data" => $data
		];
	}

	/**
	 * Run a web search against greppr.org.
	 *
	 * @param array $get           parsed GET parameters ("s" = query, "npt" = next page token)
	 * @param bool  $first_attempt internal: false when retrying after a token refresh,
	 *                             prevents infinite retry loops
	 *
	 * @return array standard scraper response object
	 * @throws Exception when tokens or the search page cannot be fetched
	 */
	public function web($get, $first_attempt = true){

		if($get["npt"]){

			// restore query + pagination state stashed by a previous request
			[$q, $proxy] = $this->backend->get($get["npt"], "web");

			$q = json_decode($q, true);

		}else{

			$search = $get["s"];
			if(strlen($search) === 0){

				throw new Exception("Search term is empty!");
			}

			$proxy = $this->backend->get_ip();
		}

		// get token
		// token[0] = static token that changes once a day
		// token[1] = dynamic token that changes on every request
		// token[2] = PHPSESSID cookie
		$tokens = apcu_fetch("greppr_token");

		if(
			$tokens === false ||
			$first_attempt === false // force token fetch
		){

			// we haven't gotten the token yet, get it
			try{

				$response =
					$this->get(
						$proxy,
						"https://greppr.org",
						[]
					);
			}catch(Exception $error){

				throw new Exception("Failed to fetch search tokens");
			}

			$tokens = $this->parse_token($response);

			if($tokens === false){

				throw new Exception("Failed to grep search tokens");
			}
		}

		try{

			if($get["npt"]){

				$params = [
					$tokens[0] => $q["q"],
					"s" => $q["s"],
					"l" => 30,
					"n" => $tokens[1]
				];
			}else{

				$params = [
					$tokens[0] => $search,
					"n" => $tokens[1]
				];
			}

			$searchresults = $this->get(
				$proxy,
				"https://greppr.org/search",
				$params,
				$tokens[2]
			);
		}catch(Exception $error){

			throw new Exception("Failed to fetch search page");
		}

		if(strlen($searchresults["data"]) === 0){

			// redirected to main page, which means we got old token
			// generate a new one

			// ... unless we just tried to do that
			if($first_attempt === false){

				throw new Exception("Failed to get a new search token");
			}

			// BUGFIX: was $this->get($get, false), which passed the GET array
			// as a proxy line to the HTTP helper and discarded the result.
			// Retry the whole search once with a forced token refresh.
			return $this->web($get, false);
		}

		// refresh the token with new data (this also triggers fuckhtml load)
		$this->parse_token($searchresults, $tokens[2]);

		// response object
		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null,
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];

		// get results for later
		$results =
			$this->fuckhtml
			->getElementsByClassName(
				"result",
				"div"
			);

		// check for next page
		$next_elem =
			$this->fuckhtml
			->getElementsByClassName(
				"pagination",
				"ul"
			);

		if(count($next_elem) !== 0){

			$this->fuckhtml->load($next_elem[0]);

			$as =
				$this->fuckhtml
				->getElementsByClassName(
					"page-link",
					"a"
				);

			// the link right after the "#" (current page) entry is the next page;
			// stash its q/s parameters so the npt can reconstruct the request
			$break = false;
			foreach($as as $a){

				if($break === true){

					parse_str(
						$this->fuckhtml
						->getTextContent(
							$a["attributes"]["href"]
						),
						$values
					);

					$values = array_values($values);

					$out["npt"] =
						$this->backend->store(
							json_encode(
								[
									"q" => $values[0],
									"s" => $values[1]
								]
							),
							"web",
							$proxy
						);
					break;
				}

				if($a["attributes"]["href"] == "#"){

					$break = true;
				}
			}
		}

		// scrape results
		foreach($results as $result){

			$this->fuckhtml->load($result);

			// first anchor holds both the title and the target URL
			$a =
				$this->fuckhtml
				->getElementsByTagName(
					"a"
				)[0];

			$description =
				$this->fuckhtml
				->getElementsByFuzzyAttributeValue(
					"style",
					"color:#777777;",
					"p"
				);

			if(count($description) === 0){

				$description = null;
			}else{

				$description =
					$this->fuckhtml
					->getTextContent(
						$description[0]
					);
			}

			// last <p> in the result reads "Added: <date>"
			$date =
				$this->fuckhtml
				->getElementsByTagName(
					"p"
				);

			$date =
				strtotime(
					explode(
						"Added:",
						$this->fuckhtml
						->getTextContent(
							$date[count($date) - 1]["innerHTML"]
						)
					)[1]
				);

			$out["web"][] = [
				"title" =>
					$this->fuckhtml
					->getTextContent(
						$a["innerHTML"]
					),
				"description" => $description,
				"url" =>
					$this->fuckhtml
					->getTextContent(
						$a["attributes"]["href"]
					),
				"date" => $date,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => [],
				"table" => []
			];
		}

		return $out;
	}

	/**
	 * Extract the search tokens from a greppr page and cache them in APCu.
	 *
	 * The homepage embeds a script of the form
	 *   window.location = '/search?<static-token>=...&n=<dynamic-token>'
	 * from which we pull token[0] (static) and token[1] (dynamic); token[2]
	 * is the PHPSESSID cookie the server tied them to.
	 *
	 * @param array        $response ["headers" => ..., "data" => ...] from get()
	 * @param string|false $cookie   reuse this PHPSESSID instead of reading set-cookie
	 *
	 * @return array|false [static, dynamic, cookie] on success, false when the
	 *                     tokens or cookie cannot be found
	 */
	private function parse_token($response, $cookie = false){

		$this->fuckhtml->load($response["data"]);

		$scripts =
			$this->fuckhtml
			->getElementsByTagName("script");

		$found = false;
		foreach($scripts as $script){

			preg_match(
				'/window\.location ?= ?\'\/search\?([^=]+).*&n=([0-9]+)/',
				$script["innerHTML"],
				$tokens
			);

			if(isset($tokens[1])){

				$found = true;
				break;
			}
		}

		if($found === false){

			return false;
		}

		$tokens = [
			$tokens[1],
			$tokens[2]
		];

		if($cookie !== false){

			// we already specified a cookie, so use the one we have already
			$tokens[] = $cookie;
			apcu_store("greppr_token", $tokens);

			return $tokens;
		}

		if(!isset($response["headers"]["set-cookie"])){

			// server didn't send a cookie
			return false;
		}

		// get cookie
		preg_match(
			'/PHPSESSID=([^;]+)/',
			$response["headers"]["set-cookie"],
			$cookie
		);

		if(!isset($cookie[1])){

			// server sent an unexpected cookie
			return false;
		}

		$tokens[] = $cookie[1];
		apcu_store("greppr_token", $tokens);

		return $tokens;
	}
}
diff --git a/scraper/sc.php b/scraper/sc.php
index e2e7385..2b847c7 100644
--- a/scraper/sc.php
+++ b/scraper/sc.php
@@ -398,12 +398,17 @@ class sc{
if($token === false){
- $js =
- $this->get(
- $proxy,
- "https://a-v2.sndcdn.com/assets/1-c3e4038d.js",
- []
- );
+ try{
+ $js =
+ $this->get(
+ $proxy,
+ "https://a-v2.sndcdn.com/assets/1-c3e4038d.js",
+ []
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch search token");
+ }
preg_match(
'/client_id=([^"]+)/',
diff --git a/settings.php b/settings.php
index 662189c..33185e9 100644
--- a/settings.php
+++ b/settings.php
@@ -130,6 +130,10 @@ $settings = [
"text" => "Yep"
],
[
+ "value" => "greppr",
+ "text" => "Greppr"
+ ],
+ [
"value" => "crowdview",
"text" => "Crowdview"
],