diff options
Diffstat (limited to 'scraper/ddg.php')
-rw-r--r-- | scraper/ddg.php | 368 |
1 files changed, 199 insertions, 169 deletions
diff --git a/scraper/ddg.php b/scraper/ddg.php index 1ce8e18..2d737ba 100644 --- a/scraper/ddg.php +++ b/scraper/ddg.php @@ -4,8 +4,11 @@ class ddg{ public function __construct(){ - include "lib/nextpage.php"; - $this->nextpage = new nextpage("ddg"); + include "lib/backend.php"; + $this->backend = new backend("ddg"); + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); } /* @@ -14,7 +17,7 @@ class ddg{ private const req_web = 0; private const req_xhr = 1; - private function get($url, $get = [], $reqtype = self::req_web){ + private function get($proxy, $url, $get = [], $reqtype = self::req_web){ $curlproc = curl_init(); @@ -28,7 +31,7 @@ class ddg{ switch($reqtype){ case self::req_web: $headers = - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0", + ["User-Agent: " . config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Encoding: gzip", "Accept-Language: en-US,en;q=0.5", @@ -43,7 +46,7 @@ class ddg{ case self::req_xhr: $headers = - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0", + ["User-Agent: " . config::USER_AGENT, "Accept: */*", "Accept-Encoding: gzip", "Accept-Language: en-US,en;q=0.5", @@ -57,6 +60,8 @@ class ddg{ break; } + $this->backend->assign_proxy($curlproc, $proxy); + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); @@ -69,7 +74,6 @@ class ddg{ $data = curl_exec($curlproc); if(curl_errno($curlproc)){ - throw new Exception(curl_error($curlproc)); } @@ -541,9 +545,11 @@ class ddg{ public function web($get){ + $proxy = null; + if($get["npt"]){ - $jsgrep = $this->nextpage->get($get["npt"], "web"); + [$jsgrep, $proxy] = $this->backend->get($get["npt"], "web"); $extendedsearch = false; $inithtml = ""; @@ -555,6 +561,7 @@ class ddg{ throw new Exception("Search term is empty!"); } + $proxy = $this->backend->get_ip(); $country = $get["country"]; $nsfw = $get["nsfw"]; $older = $get["older"]; @@ -614,9 +621,9 @@ class ddg{ /* Get html */ - // https://duckduckgo.com/?q=minecraft&kz=1&k1=-1&kp=-2 try{ $inithtml = $this->get( + $proxy, "https://duckduckgo.com/", $get_filters ); @@ -643,6 +650,7 @@ class ddg{ try{ $js = $this->get( + $proxy, "https://links.duckduckgo.com" . $jsgrep, [], ddg::req_xhr @@ -692,6 +700,7 @@ class ddg{ // get definition $wordnikjs = $this->get( + $proxy, "https://duckduckgo.com/js/spice/dictionary/definition/" . $wordnik, [], ddg::req_xhr @@ -725,6 +734,7 @@ class ddg{ $wordnikaudio_json = json_decode( $this->get( + $proxy, "https://duckduckgo.com/js/spice/dictionary/audio/" . $wordnik, [], ddg::req_xhr @@ -922,6 +932,7 @@ class ddg{ try{ $stackjs = $this->get( + $proxy, "https://duckduckgo.com" . $stack, [], ddg::req_xhr @@ -944,7 +955,7 @@ class ddg{ $out["answer"][] = [ "title" => $stackjson["Heading"], - "description" => $this->htmltoarray($stackjson["Abstract"]), + "description" => $this->stackoverflow_parse($stackjson["Abstract"]), "url" => str_replace(["http://", "ddg"], ["https://", ""], $stackjson["AbstractURL"]), "thumb" => null, "table" => [], @@ -973,6 +984,7 @@ class ddg{ try{ $lyricsjs = $this->get( + $proxy, "https://duckduckgo.com" . $lyrics, [], ddg::req_xhr @@ -1166,13 +1178,13 @@ class ddg{ if(isset($answers[$i]["data"]["AbstractText"]) && !empty($answers[$i]["data"]["AbstractText"])){ - $description = $this->htmltoarray($answers[$i]["data"]["AbstractText"]); + $description = $this->stackoverflow_parse($answers[$i]["data"]["AbstractText"]); }elseif(isset($answers[$i]["data"]["Abstract"]) && !empty($answers[$i]["data"]["Abstract"])){ - $description = $this->htmltoarray($answers[$i]["data"]["Abstract"]); + $description = $this->stackoverflow_parse($answers[$i]["data"]["Abstract"]); }elseif(isset($answers[$i]["data"]["Answer"]) && !empty($answers[$i]["data"]["Answer"])){ - $description = $this->htmltoarray($answers[$i]["data"]["Answer"]); + $description = $this->stackoverflow_parse($answers[$i]["data"]["Answer"]); }else{ $description = []; @@ -1310,6 +1322,7 @@ class ddg{ $description = []; $shitcoinjs = $this->get( + $proxy, "https://duckduckgo.com/js/spice/cryptocurrency/{$shitcoins[1]}/{$shitcoins[2]}/1", [], ddg::req_xhr @@ -1408,6 +1421,7 @@ class ddg{ try{ $currencyjs = $this->get( + $proxy, "https://duckduckgo.com/js/spice/currency/{$amount}/" . strtolower($currencies[1]) . "/" . strtolower($currencies[2]), [], ddg::req_xhr @@ -1607,7 +1621,7 @@ class ddg{ // store next page token if(isset($web[$i]["n"])){ - $out["npt"] = $this->nextpage->store($web[$i]["n"] . "&biaexp=b&eslexp=a&litexp=c&msvrtexp=b&wrap=1", "web"); + $out["npt"] = $this->backend->store($web[$i]["n"] . "&biaexp=b&eslexp=a&litexp=c&msvrtexp=b&wrap=1", "web", $proxy); continue; } @@ -1874,10 +1888,11 @@ class ddg{ if($get["npt"]){ - $npt = $this->nextpage->get($get["npt"], "images"); + [$npt, $proxy] = $this->backend->get($get["npt"], "images"); try{ $json = json_decode($this->get( + $proxy, "https://duckduckgo.com/i.js?" . $npt, [], ddg::req_xhr @@ -1895,6 +1910,7 @@ class ddg{ throw new Exception("Search term is empty!"); } + $proxy = $this->backend->get_ip(); $country = $get["country"]; $nsfw = $get["nsfw"]; $date = $get["date"]; @@ -1934,6 +1950,7 @@ class ddg{ try{ $html = $this->get( + $proxy, "https://duckduckgo.com", $get_filters, ddg::req_web @@ -1980,6 +1997,7 @@ class ddg{ try{ $json = json_decode($this->get( + $proxy, "https://duckduckgo.com/i.js", $js_params, ddg::req_xhr @@ -2005,10 +2023,11 @@ class ddg{ } $out["npt"] = - $this->nextpage->store( + $this->backend->store( explode("?", $json["next"])[1] . "&vqd=" . $vqd, - "images" + "images", + $proxy ); } @@ -2046,10 +2065,11 @@ class ddg{ if($get["npt"]){ - $npt = $this->nextpage->get($get["npt"], "videos"); + [$npt, $proxy] = $this->backend->get($get["npt"], "videos"); try{ $json = json_decode($this->get( + $proxy, "https://duckduckgo.com/v.js?" . $npt, [], @@ -2068,6 +2088,7 @@ class ddg{ throw new Exception("Search term is empty!"); } + $proxy = $this->backend->get_ip(); $country = $get["country"]; $nsfw = $get["nsfw"]; $date = $get["date"]; @@ -2099,6 +2120,7 @@ class ddg{ try{ $html = $this->get( + $proxy, "https://duckduckgo.com", $get_filters, ddg::req_web @@ -2123,6 +2145,7 @@ class ddg{ try{ $json = json_decode($this->get( + $proxy, "https://duckduckgo.com/v.js", [ "l" => "us-en", @@ -2155,9 +2178,10 @@ class ddg{ if(isset($json["next"])){ $out["npt"] = - $this->nextpage->store( + $this->backend->store( explode("?", $json["next"])[1], - "videos" + "videos", + $proxy ); } @@ -2213,11 +2237,12 @@ class ddg{ if($get["npt"]){ - $req = $this->nextpage->get($get["npt"], "news"); + [$req, $proxy] = $this->backend->get($get["npt"], "news"); try{ $json = json_decode($this->get( + $proxy, "https://duckduckgo.com/news.js?" . $req, [], @@ -2236,6 +2261,7 @@ class ddg{ throw new Exception("Search term is empty!"); } + $proxy = $this->backend->get_ip(); $country = $get["country"]; $nsfw = $get["nsfw"]; $date = $get["date"]; @@ -2261,6 +2287,7 @@ class ddg{ try{ $html = $this->get( + $proxy, "https://duckduckgo.com", $get_params, ddg::req_web @@ -2303,6 +2330,7 @@ class ddg{ } $json = json_decode($this->get( + $proxy, "https://duckduckgo.com/news.js", $js_params, ddg::req_xhr @@ -2323,9 +2351,10 @@ class ddg{ if(isset($json["next"])){ $out["npt"] = - $this->nextpage->store( + $this->backend->store( explode("?", $json["next"])[1], - "news" + "news", + $proxy ); } @@ -2415,192 +2444,193 @@ class ddg{ return "https://" . $parse["host"] . "/th?id=" . urlencode($parts["id"]); } - private function htmltoarray($html){ + private function appendtext($payload, &$text, &$index){ - $html = strip_tags($html, ["img", "pre", "code", "br", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "a"]); - - libxml_use_internal_errors(true); - $dom = new DOMDocument("1.0", "utf-8"); - $dom->loadHTML('<div>' . $html . '</div>'); - $xpath = new DOMXPath($dom); - $descendants = $xpath->query('//div/node()'); - - $images = $xpath->query('//div/node()/img'); - $imageiterator = 0; + if(trim($payload) == ""){ + + return; + } - if(count($descendants) === 0){ + if( + $index !== 0 && + $text[$index - 1]["type"] == "text" + ){ - return [ + $text[$index - 1]["value"] .= preg_replace('/ $/', " ", $payload); + }else{ + + $text[] = [ "type" => "text", - "value" => $this->unescapehtml($html) + "value" => preg_replace('/ $/', " ", $payload) ]; + $index++; } + } + + private function stackoverflow_parse($html){ - $array = []; - $previoustype = null; + $i = 0; + $answer = []; - foreach($descendants as $node){ - - // $node->nodeValue = iconv("UTF-8", "ISO-8859-1//TRANSLIT", $node->nodeValue); + $this->fuckhtml->load($html); + + $tags = $this->fuckhtml->getElementsByTagName("*"); + + if(count($tags) === 0){ - // get node type - switch($node->nodeName){ - case "#text": - $type = "text"; - break; - - case "pre": - $type = "code"; - break; - - case "code": - $type = "inline_code"; - break; - - case "h1": - case "h2": - case "h3": - case "h4": - case "h5": - case "h6": - $type = "title"; - break; - - case "blockquote": - $type = "quote"; - break; - - case "a": - $type = "link"; - break; - - case "img": - $type = "image"; - break; - } + return [ + [ + "type" => "text", + "value" => htmlspecialchars_decode($html) + ] + ]; + } + + foreach($tags as $snippet){ - // add node to array - switch($type){ + switch($snippet["tagName"]){ - case "text": - $value = preg_replace( - '/ {2,}/', - " ", - $this->limitnewlines($this->unescapehtml($node->textContent)) - ); + case "p": + $this->fuckhtml->load($snippet["innerHTML"]); - if( - $previoustype == "quote" || - $previoustype === null || - $previoustype == "image" || - $previoustype == "title" || - $previoustype == "code" - ){ - - $value = ltrim($value); - } + $codetags = + $this->fuckhtml + ->getElementsByTagName("*"); - if($value == ""){ - - $previoustype = $type; - continue 2; - } + $tmphtml = $snippet["innerHTML"]; - // merge with previous text node - if($previoustype == "text"){ + foreach($codetags as $tag){ - $array[count($array) - 1]["value"] = trim($array[count($array) - 1]["value"]) . "\n" . $this->bstoutf8($value); - }else{ + if(!isset($tag["outerHTML"])){ + + continue; + } - $array[] = [ - "type" => "text", - "value" => $this->bstoutf8($value) - ]; + $tmphtml = + explode( + $tag["outerHTML"], + $tmphtml, + 2 + ); + + $value = $this->fuckhtml->getTextContent($tmphtml[0], false, false); + $this->appendtext($value, $answer, $i); + + $type = null; + switch($tag["tagName"]){ + + case "code": $type = "inline_code"; break; + case "em": $type = "italic"; break; + case "blockquote": $type = "quote"; break; + default: $type = "text"; + } + + if($type !== null){ + $value = $this->fuckhtml->getTextContent($tag, false, false); + + if(trim($value) != ""){ + + $answer[] = [ + "type" => $type, + "value" => rtrim($value) + ]; + $i++; + } + } + + if(count($tmphtml) === 2){ + + $tmphtml = $tmphtml[1] . "\n"; + }else{ + + break; + } } - break; - - case "inline_code": - case "bold": - $array[] = [ - "type" => "inline_code", - "value" => $this->bstoutf8(trim($this->limitnewlines($this->unescapehtml($node->textContent)))) - ]; - break; - - case "link": - // check for link nested inside of image - if(strlen($node->childNodes->item(0)->textContent) !== 0){ + if(is_array($tmphtml)){ - $array[] = [ - "type" => "link", - "value" => $this->bstoutf8(trim($this->unescapehtml($node->textContent))), - "url" => $this->bstoutf8(preg_replace('/\/ddg$/', "", preg_replace('/^http:\/\//', "https://", $this->sanitizeurl($node->getAttribute("href"))))) - ]; - break; + $tmphtml = $tmphtml[0]; } - $type = "image"; - - if($previoustype == "text"){ + if(strlen($tmphtml) !== 0){ - $array[count($array) - 1]["value"] = rtrim($array[count($array) - 1]["value"]); + $value = $this->fuckhtml->getTextContent($tmphtml, true, false); + $this->appendtext($value, $answer, $i); } - - $array[] = [ - "type" => "image", - "url" => $this->bstoutf8(preg_replace('/^http:\/\//', "https://", preg_replace('/^\/\/images\.duckduckgo\.com\/iu\/\?u=/', "", $images->item($imageiterator)->getAttribute("src")))) - ]; - - $imageiterator++; - break; - case "image": - - if($previoustype == "text"){ - - $array[count($array) - 1]["value"] = rtrim($array[count($array) - 1]["value"]); - } - - $array[] = [ + case "img": + $answer[] = [ "type" => "image", - "url" => $this->bstoutf8(preg_replace('/^http:\/\//', "https://", preg_replace('/^\/\/images\.duckduckgo\.com\/iu\/\?u=/', "", $node->getAttribute("src")))) + "url" => + $this->fuckhtml + ->getTextContent( + $tag["attributes"]["src"] + ) ]; + $i++; break; - case "quote": - case "title": - case "code": - if($previoustype == "text"){ + case "pre": + switch($answer[$i - 1]["type"]){ - $array[count($array) - 1]["value"] = rtrim($array[count($array) - 1]["value"]); + case "text": + case "italic": + $answer[$i - 1]["value"] = rtrim($answer[$i - 1]["value"]); + break; } - // no break - - default: - $value = trim($this->limitnewlines($this->unescapehtml($node->textContent))); - if($type != "code"){ - - $value = preg_replace( - '/ {2,}/', - " ", - $value + $answer[] = + [ + "type" => "code", + "value" => + rtrim( + $this->fuckhtml + ->getTextContent( + $snippet, + true, + false + ) + ) + ]; + $i++; + + break; + + case "ol": + $o = 0; + + $this->fuckhtml->load($snippet); + $li = + $this->fuckhtml + ->getElementsByTagName("li"); + + foreach($li as $elem){ + $o++; + + $this->appendtext( + $o . ". " . + $this->fuckhtml + ->getTextContent( + $elem + ), + $answer, + $i ); } - - $array[] = [ - "type" => $type, - "value" => $this->bstoutf8($value) - ]; break; } + } + + if( + $i !== 0 && + $answer[$i - 1]["type"] == "text" + ){ - $previoustype = $type; + $answer[$i - 1]["value"] = rtrim($answer[$i - 1]["value"]); } - return $array; + return $answer; } private function bstoutf8($bs){ |