1 files changed, 199 insertions, 169 deletions
diff --git a/scraper/ddg.php b/scraper/ddg.php
index 1ce8e18..2d737ba 100644
--- a/scraper/ddg.php
+++ b/scraper/ddg.php
@@ -4,8 +4,11 @@ class ddg{
 	
 	public function __construct(){
 		
-		include "lib/nextpage.php";
-		$this->nextpage = new nextpage("ddg");
+		include "lib/backend.php"; // backend helper: used below for proxy assignment (assign_proxy/get_ip) and next-page token get/store
+		$this->backend = new backend("ddg");
+		
+		include "lib/fuckhtml.php"; // HTML parser used by stackoverflow_parse()
+		$this->fuckhtml = new fuckhtml();
 	}
 	
 	/*
@@ -14,7 +17,7 @@ class ddg{
 	private const req_web = 0;
 	private const req_xhr = 1;
 	
-	private function get($url, $get = [], $reqtype = self::req_web){
+	private function get($proxy, $url, $get = [], $reqtype = self::req_web){
 		
 		$curlproc = curl_init();
 		
@@ -28,7 +31,7 @@ class ddg{
 		switch($reqtype){
 			case self::req_web:
 				$headers =
-					["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
+					["User-Agent: " . config::USER_AGENT,
 					"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
 					"Accept-Encoding: gzip",
 					"Accept-Language: en-US,en;q=0.5",
@@ -43,7 +46,7 @@ class ddg{
 			
 			case self::req_xhr:
 				$headers =
-					["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
+					["User-Agent: " . config::USER_AGENT,
 					"Accept: */*",
 					"Accept-Encoding: gzip",
 					"Accept-Language: en-US,en;q=0.5",
@@ -57,6 +60,8 @@ class ddg{
 				break;
 		}
 		
+		$this->backend->assign_proxy($curlproc, $proxy);
+		
 		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
 		curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
 		
@@ -69,7 +74,6 @@ class ddg{
 		$data = curl_exec($curlproc);
 		
 		if(curl_errno($curlproc)){
-			
 			throw new Exception(curl_error($curlproc));
 		}
 		
@@ -541,9 +545,11 @@ class ddg{
 	
 	public function web($get){
 		
+		$proxy = null;
+		
 		if($get["npt"]){
 			
-			$jsgrep = $this->nextpage->get($get["npt"], "web");
+			[$jsgrep, $proxy] = $this->backend->get($get["npt"], "web");
 						
 			$extendedsearch = false;
 			$inithtml = "";
@@ -555,6 +561,7 @@ class ddg{
 				throw new Exception("Search term is empty!");
 			}
 			
+			$proxy = $this->backend->get_ip();
 			$country = $get["country"];
 			$nsfw = $get["nsfw"];
 			$older = $get["older"];
@@ -614,9 +621,9 @@ class ddg{
 			/*
 				Get html
 			*/
-			// https://duckduckgo.com/?q=minecraft&kz=1&k1=-1&kp=-2
 			try{
 				$inithtml = $this->get(
+					$proxy,
 					"https://duckduckgo.com/",
 					$get_filters
 				);
@@ -643,6 +650,7 @@ class ddg{
 		try{
 			
 			$js = $this->get(
+				$proxy,
 				"https://links.duckduckgo.com" . $jsgrep,
 				[],
 				ddg::req_xhr
@@ -692,6 +700,7 @@ class ddg{
 					
 					// get definition
 					$wordnikjs = $this->get(
+						$proxy,
 						"https://duckduckgo.com/js/spice/dictionary/definition/" . $wordnik,
 						[],
 						ddg::req_xhr
@@ -725,6 +734,7 @@ class ddg{
 						$wordnikaudio_json =
 							json_decode(
 								$this->get(
+									$proxy,
 									"https://duckduckgo.com/js/spice/dictionary/audio/" . $wordnik,
 									[],
 									ddg::req_xhr
@@ -922,6 +932,7 @@ class ddg{
 				
 				try{
 					$stackjs = $this->get(
+						$proxy,
 						"https://duckduckgo.com" . $stack,
 						[],
 						ddg::req_xhr
@@ -944,7 +955,7 @@ class ddg{
 						
 						$out["answer"][] = [
 							"title" => $stackjson["Heading"],
-							"description" => $this->htmltoarray($stackjson["Abstract"]),
+							"description" => $this->stackoverflow_parse($stackjson["Abstract"]),
 							"url" => str_replace(["http://", "ddg"], ["https://", ""], $stackjson["AbstractURL"]),
 							"thumb" => null,
 							"table" => [],
@@ -973,6 +984,7 @@ class ddg{
 				
 				try{
 					$lyricsjs = $this->get(
+						$proxy,
 						"https://duckduckgo.com" . $lyrics,
 						[],
 						ddg::req_xhr
@@ -1166,13 +1178,13 @@ class ddg{
 					
 					if(isset($answers[$i]["data"]["AbstractText"]) && !empty($answers[$i]["data"]["AbstractText"])){
 						
-						$description = $this->htmltoarray($answers[$i]["data"]["AbstractText"]);
+						$description = $this->stackoverflow_parse($answers[$i]["data"]["AbstractText"]);
 					}elseif(isset($answers[$i]["data"]["Abstract"]) && !empty($answers[$i]["data"]["Abstract"])){
 						
-						$description = $this->htmltoarray($answers[$i]["data"]["Abstract"]);
+						$description = $this->stackoverflow_parse($answers[$i]["data"]["Abstract"]);
 					}elseif(isset($answers[$i]["data"]["Answer"]) && !empty($answers[$i]["data"]["Answer"])){
 						
-						$description = $this->htmltoarray($answers[$i]["data"]["Answer"]);
+						$description = $this->stackoverflow_parse($answers[$i]["data"]["Answer"]);
 					}else{
 						
 						$description = [];
@@ -1310,6 +1322,7 @@ class ddg{
 					$description = [];
 						
 					$shitcoinjs = $this->get(
+						$proxy,
 						"https://duckduckgo.com/js/spice/cryptocurrency/{$shitcoins[1]}/{$shitcoins[2]}/1",
 						[],
 						ddg::req_xhr
@@ -1408,6 +1421,7 @@ class ddg{
 					
 					try{
 						$currencyjs = $this->get(
+							$proxy,
 							"https://duckduckgo.com/js/spice/currency/{$amount}/" . strtolower($currencies[1]) . "/" . strtolower($currencies[2]),
 							[],
 							ddg::req_xhr
@@ -1607,7 +1621,7 @@ class ddg{
 					// store next page token
 					if(isset($web[$i]["n"])){
 						
-						$out["npt"] = $this->nextpage->store($web[$i]["n"] . "&biaexp=b&eslexp=a&litexp=c&msvrtexp=b&wrap=1", "web");
+						$out["npt"] = $this->backend->store($web[$i]["n"] . "&biaexp=b&eslexp=a&litexp=c&msvrtexp=b&wrap=1", "web", $proxy);
 						continue;
 					}
 					
@@ -1874,10 +1888,11 @@ class ddg{
 		
 		if($get["npt"]){
 			
-			$npt = $this->nextpage->get($get["npt"], "images");
+			[$npt, $proxy] = $this->backend->get($get["npt"], "images");
 			
 			try{
 				$json = json_decode($this->get(
+					$proxy,
 					"https://duckduckgo.com/i.js?" . $npt,
 					[],
 					ddg::req_xhr
@@ -1895,6 +1910,7 @@ class ddg{
 				throw new Exception("Search term is empty!");
 			}
 			
+			$proxy = $this->backend->get_ip();
 			$country = $get["country"];
 			$nsfw = $get["nsfw"];
 			$date = $get["date"];
@@ -1934,6 +1950,7 @@ class ddg{
 			try{
 				
 				$html = $this->get(
+					$proxy,
 					"https://duckduckgo.com",
 					$get_filters,
 					ddg::req_web
@@ -1980,6 +1997,7 @@ class ddg{
 			
 			try{
 				$json = json_decode($this->get(
+					$proxy,
 					"https://duckduckgo.com/i.js",
 					$js_params,
 					ddg::req_xhr
@@ -2005,10 +2023,11 @@ class ddg{
 			}
 			
 			$out["npt"] =
-				$this->nextpage->store(
+				$this->backend->store(
 					explode("?", $json["next"])[1] . "&vqd=" .
 					$vqd,
-					"images"
+					"images",
+					$proxy
 				);
 		}
 		
@@ -2046,10 +2065,11 @@ class ddg{
 		
 		if($get["npt"]){
 			
-			$npt = $this->nextpage->get($get["npt"], "videos");
+			[$npt, $proxy] = $this->backend->get($get["npt"], "videos");
 			
 			try{
 				$json = json_decode($this->get(
+					$proxy,
 					"https://duckduckgo.com/v.js?" .
 					$npt,
 					[],
@@ -2068,6 +2088,7 @@ class ddg{
 				throw new Exception("Search term is empty!");
 			}
 			
+			$proxy = $this->backend->get_ip();
 			$country = $get["country"];
 			$nsfw = $get["nsfw"];
 			$date = $get["date"];
@@ -2099,6 +2120,7 @@ class ddg{
 			try{
 				
 				$html = $this->get(
+					$proxy,
 					"https://duckduckgo.com",
 					$get_filters,
 					ddg::req_web
@@ -2123,6 +2145,7 @@ class ddg{
 			
 			try{
 				$json = json_decode($this->get(
+					$proxy,
 					"https://duckduckgo.com/v.js",
 					[
 						"l" => "us-en",
@@ -2155,9 +2178,10 @@ class ddg{
 		if(isset($json["next"])){
 			
 			$out["npt"] =
-				$this->nextpage->store(
+				$this->backend->store(
 					explode("?", $json["next"])[1],
-					"videos"
+					"videos",
+					$proxy
 				);
 		}
 		
@@ -2213,11 +2237,12 @@ class ddg{
 		
 		if($get["npt"]){
 			
-			$req = $this->nextpage->get($get["npt"], "news");
+			[$req, $proxy] = $this->backend->get($get["npt"], "news");
 			
 			try{
 				
 				$json = json_decode($this->get(
+					$proxy,
 					"https://duckduckgo.com/news.js?" .
 					$req,
 					[],
@@ -2236,6 +2261,7 @@ class ddg{
 				throw new Exception("Search term is empty!");
 			}
 			
+			$proxy = $this->backend->get_ip();
 			$country = $get["country"];
 			$nsfw = $get["nsfw"];
 			$date = $get["date"];
@@ -2261,6 +2287,7 @@ class ddg{
 			try{
 				
 				$html = $this->get(
+					$proxy,
 					"https://duckduckgo.com",
 					$get_params,
 					ddg::req_web
@@ -2303,6 +2330,7 @@ class ddg{
 				}
 				
 				$json = json_decode($this->get(
+					$proxy,
 					"https://duckduckgo.com/news.js",
 					$js_params,
 					ddg::req_xhr
@@ -2323,9 +2351,10 @@ class ddg{
 		if(isset($json["next"])){
 			
 			$out["npt"] =
-				$this->nextpage->store(
+				$this->backend->store(
 					explode("?", $json["next"])[1],
-					"news"
+					"news",
+					$proxy
 				);
 		}
 		
@@ -2415,192 +2444,193 @@ class ddg{
 		return "https://" . $parse["host"] . "/th?id=" . urlencode($parts["id"]);
 	}
 	
-	private function htmltoarray($html){
+	private function appendtext($payload, &$text, &$index){ // Adds $payload to the entry list $text (by reference): merged into the last entry if that one is "text", otherwise pushed as a new "text" entry; $index is incremented for each new entry (callers presumably keep it equal to count($text))
 		
-		$html = strip_tags($html, ["img", "pre", "code", "br", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "a"]);
-		
-		libxml_use_internal_errors(true);
-		$dom = new DOMDocument("1.0", "utf-8");
-		$dom->loadHTML('<div>' . $html . '</div>');
-		$xpath = new DOMXPath($dom);
-		$descendants = $xpath->query('//div/node()');
-		
-		$images = $xpath->query('//div/node()/img');
-		$imageiterator = 0;
+		if(trim($payload) == ""){ // whitespace-only payloads are dropped
+			
+			return;
+		}
 		
-		if(count($descendants) === 0){
+		if(
+			$index !== 0 &&
+			$text[$index - 1]["type"] == "text"
+		){
 			
-			return [
+			$text[$index - 1]["value"] .= preg_replace('/  $/', " ", $payload); // merge into previous text entry; a trailing double space becomes a single space
+		}else{
+			
+			$text[] = [
 				"type" => "text",
-				"value" => $this->unescapehtml($html)
+				"value" => preg_replace('/  $/', " ", $payload)
 			];
+			$index++; // one more entry was pushed onto $text
 		}
+	}
+	
+	private function stackoverflow_parse($html){ // Converts an HTML answer/abstract string into a list of typed entries ("text", "inline_code", "italic", "quote", "image", "code"); returns that array
 		
-		$array = [];
-		$previoustype = null;
+		$i = 0; // number of entries pushed to $answer so far
+		$answer = [];
 		
-		foreach($descendants as $node){
-			
-			// $node->nodeValue = iconv("UTF-8", "ISO-8859-1//TRANSLIT", $node->nodeValue);
+		$this->fuckhtml->load($html);
+		
+		$tags = $this->fuckhtml->getElementsByTagName("*");
+		
+		if(count($tags) === 0){ // no markup found: return the input as a single text entry
 			
-			// get node type
-			switch($node->nodeName){
-				case "#text":
-					$type = "text";
-					break;
-				
-				case "pre":
-					$type = "code";
-					break;
-				
-				case "code":
-					$type = "inline_code";
-					break;
-				
-				case "h1":
-				case "h2":
-				case "h3":
-				case "h4":
-				case "h5":
-				case "h6":
-					$type = "title";
-					break;
-				
-				case "blockquote":
-					$type = "quote";
-					break;
-				
-				case "a":
-					$type = "link";
-					break;
-				
-				case "img":
-					$type = "image";
-					break;
-			}
+			return [
+				[
+					"type" => "text",
+					"value" => htmlspecialchars_decode($html)
+				]
+			];
+		}
+		
+		foreach($tags as $snippet){
 			
-			// add node to array
-			switch($type){
+			switch($snippet["tagName"]){
 				
-				case "text":
-					$value = preg_replace(
-						'/ {2,}/',
-						" ",
-						$this->limitnewlines($this->unescapehtml($node->textContent))
-					);
+				case "p": // paragraph: walk its child tags, emitting the text between them and one entry per tag
+					$this->fuckhtml->load($snippet["innerHTML"]);
 					
-					if(
-						$previoustype == "quote" ||
-						$previoustype === null ||
-						$previoustype == "image" ||
-						$previoustype == "title" ||
-						$previoustype == "code"
-					){
-						
-						$value = ltrim($value);
-					}
+					$codetags =
+						$this->fuckhtml
+						->getElementsByTagName("*");
 					
-					if($value == ""){
-						
-						$previoustype = $type;
-						continue 2;
-					}
+					$tmphtml = $snippet["innerHTML"]; // HTML of the paragraph that has not been consumed yet
 					
-					// merge with previous text node
-					if($previoustype == "text"){
+					foreach($codetags as $tag){
 						
-						$array[count($array) - 1]["value"] = trim($array[count($array) - 1]["value"]) . "\n" . $this->bstoutf8($value);
-					}else{
+						if(!isset($tag["outerHTML"])){
+							
+							continue;
+						}
 						
-						$array[] = [
-							"type" => "text",
-							"value" => $this->bstoutf8($value)
-						];
+						$tmphtml = // split the remaining HTML at this child tag: [0] = before it, [1] = after it (if found)
+							explode(
+								$tag["outerHTML"],
+								$tmphtml,
+								2
+							);
+						
+						$value = $this->fuckhtml->getTextContent($tmphtml[0], false, false);
+						$this->appendtext($value, $answer, $i);
+						
+						$type = null;
+						switch($tag["tagName"]){
+							
+							case "code": $type = "inline_code"; break;
+							case "em": $type = "italic"; break;
+							case "blockquote": $type = "quote"; break;
+							default: $type = "text"; // any other child tag is emitted as plain text
+						}
+						
+						if($type !== null){ // always true here: the switch above assigns a type in every branch
+							$value = $this->fuckhtml->getTextContent($tag, false, false);
+							
+							if(trim($value) != ""){
+								
+								$answer[] = [
+									"type" => $type,
+									"value" => rtrim($value)
+								];
+								$i++;
+							}
+						}
+						
+						if(count($tmphtml) === 2){
+							
+							$tmphtml = $tmphtml[1] . "\n"; // continue with the HTML that follows the tag
+						}else{
+							
+							break; // tag not found in the remaining HTML: stop, leftover is handled below
+						}
 					}
-					break;
-				
-				case "inline_code":
-				case "bold":
-					$array[] = [
-						"type" => "inline_code",
-						"value" => $this->bstoutf8(trim($this->limitnewlines($this->unescapehtml($node->textContent))))
-					];
-					break;
-				
-				case "link":
-					// check for link nested inside of image
 					
-					if(strlen($node->childNodes->item(0)->textContent) !== 0){
+					if(is_array($tmphtml)){ // loop was left right after explode(): keep the unsplit remainder
 						
-						$array[] = [
-							"type" => "link",
-							"value" => $this->bstoutf8(trim($this->unescapehtml($node->textContent))),
-							"url" => $this->bstoutf8(preg_replace('/\/ddg$/', "", preg_replace('/^http:\/\//', "https://", $this->sanitizeurl($node->getAttribute("href")))))
-						];
-						break;
+						$tmphtml = $tmphtml[0];
 					}
 					
-					$type = "image";
-					
-					if($previoustype == "text"){
+					if(strlen($tmphtml) !== 0){
 						
-						$array[count($array) - 1]["value"] = rtrim($array[count($array) - 1]["value"]);
+						$value = $this->fuckhtml->getTextContent($tmphtml, true, false);
+						$this->appendtext($value, $answer, $i);
 					}
-					
-					$array[] = [
-						"type" => "image",
-						"url" => $this->bstoutf8(preg_replace('/^http:\/\//', "https://", preg_replace('/^\/\/images\.duckduckgo\.com\/iu\/\?u=/', "", $images->item($imageiterator)->getAttribute("src"))))
-					];
-					
-					$imageiterator++;
-					
 					break;
 				
-				case "image":
-					
-					if($previoustype == "text"){
-						
-						$array[count($array) - 1]["value"] = rtrim($array[count($array) - 1]["value"]);
-					}
-					
-					$array[] = [
+				case "img":
+					$answer[] = [
 						"type" => "image",
-						"url" => $this->bstoutf8(preg_replace('/^http:\/\//', "https://", preg_replace('/^\/\/images\.duckduckgo\.com\/iu\/\?u=/', "", $node->getAttribute("src"))))
+						"url" =>
+							$this->fuckhtml
+							->getTextContent(
+								$snippet["attributes"]["src"] // the <img> being handled is $snippet; $tag only exists inside the "p" case loop
+							)
 					];
+					$i++;
 					break;
 				
-				case "quote":
-				case "title":
-				case "code":
-					if($previoustype == "text"){
+				case "pre":
+					switch($answer[$i - 1]["type"] ?? null){ // ?? null: a leading <pre> has no previous entry to trim
 						
-						$array[count($array) - 1]["value"] = rtrim($array[count($array) - 1]["value"]);
+						case "text":
+						case "italic":
+							$answer[$i - 1]["value"] = rtrim($answer[$i - 1]["value"]);
+							break;
 					}
-					// no break
-				
-				default:
 					
-					$value = trim($this->limitnewlines($this->unescapehtml($node->textContent)));
-					if($type != "code"){
-						
-						$value = preg_replace(
-							'/ {2,}/',
-							" ",
-							$value
+					$answer[] =
+						[
+							"type" => "code",
+							"value" =>
+								rtrim(
+									$this->fuckhtml
+									->getTextContent(
+										$snippet,
+										true,
+										false
+									)
+								)
+						];
+					$i++;
+					
+					break;
+				
+				case "ol": // ordered list: each <li> is emitted as "N. <text>" through appendtext()
+					$o = 0;
+					
+					$this->fuckhtml->load($snippet);
+					$li =
+						$this->fuckhtml
+						->getElementsByTagName("li");
+					
+					foreach($li as $elem){
+						$o++;
+						
+						$this->appendtext(
+							$o . ". " .
+							$this->fuckhtml
+							->getTextContent(
+								$elem
+							),
+							$answer,
+							$i
 						);
 					}
-					
-					$array[] = [
-						"type" => $type,
-						"value" => $this->bstoutf8($value)
-					];
 					break;
 			}
+		}
+		
+		if( // strip trailing whitespace from a final text entry
+			$i !== 0 &&
+			$answer[$i - 1]["type"] == "text"
+		){
 			
-			$previoustype = $type;
+			$answer[$i - 1]["value"] = rtrim($answer[$i - 1]["value"]);
 		}
 		
-		return $array;
+		return $answer;
 	}
 	
 	private function bstoutf8($bs){