diff options
Diffstat (limited to 'scraper')
-rw-r--r-- | scraper/brave.php | 338 | ||||
-rw-r--r-- | scraper/ddg.php | 368 | ||||
-rw-r--r-- | scraper/facebook.php | 5 | ||||
-rw-r--r-- | scraper/ftm.php | 43 | ||||
-rw-r--r-- | scraper/google.php | 84 | ||||
-rw-r--r-- | scraper/imgur.php | 37 | ||||
-rw-r--r-- | scraper/marginalia.php | 17 | ||||
-rw-r--r-- | scraper/mojeek.php | 427 | ||||
-rw-r--r-- | scraper/pinterest.php | 5 | ||||
-rw-r--r-- | scraper/sc.php | 53 | ||||
-rw-r--r-- | scraper/wiby.php | 26 | ||||
-rw-r--r-- | scraper/yandex.php | 85 | ||||
-rw-r--r-- | scraper/yep.php | 16 | ||||
-rw-r--r-- | scraper/youtube.php | 37 |
14 files changed, 938 insertions, 603 deletions
diff --git a/scraper/brave.php b/scraper/brave.php index 93256a8..91e3f9e 100644 --- a/scraper/brave.php +++ b/scraper/brave.php @@ -7,8 +7,8 @@ class brave{ include "lib/fuckhtml.php"; $this->fuckhtml = new fuckhtml(); - include "lib/nextpage.php"; - $this->nextpage = new nextpage("brave"); + include "lib/backend.php"; + $this->backend = new backend("brave"); } public function getfilters($page){ @@ -138,13 +138,20 @@ class brave{ "maybe" => "Maybe", "no" => "No" ] + ], + "spellcheck" => [ + "display" => "Spellcheck", + "option" => [ + "yes" => "Yes", + "no" => "No" + ] ] ]; break; } } - private function get($url, $get = [], $nsfw, $country){ + private function get($proxy, $url, $get = [], $nsfw, $country){ switch($nsfw){ @@ -159,7 +166,7 @@ class brave{ } $headers = [ - "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + "User-Agent: " . config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", @@ -190,11 +197,12 @@ class brave{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); if(curl_errno($curlproc)){ - throw new Exception(curl_error($curlproc)); } @@ -207,7 +215,9 @@ class brave{ if($get["npt"]){ // get next page data - $q = json_decode($this->nextpage->get($get["npt"], "web"), true); + [$q, $proxy] = $this->backend->get($get["npt"], "web"); + + $q = json_decode($q, true); $search = $q["q"]; $q["spellcheck"] = "0"; @@ -222,7 +232,6 @@ class brave{ // get _GET data instead $search = $get["s"]; - if(strlen($search) === 0){ throw new Exception("Search term is empty!"); @@ -230,9 +239,10 @@ class brave{ if(strlen($search) > 2048){ - throw new Exception("Search query is too long!"); + throw new Exception("Search term is too long!"); } + $proxy = $this->backend->get_ip(); $nsfw = $get["nsfw"]; $country = $get["country"]; $older = $get["older"]; @@ -288,6 +298,7 @@ class brave{ try{ $html = $this->get( + $proxy, "https://search.brave.com/search", $q, $nsfw, @@ -361,9 +372,10 @@ class brave{ $q["country"] = $country; $out["npt"] = - $this->nextpage->store( + $this->backend->store( json_encode($q), - "web" + "web", + $proxy ); } } @@ -759,7 +771,9 @@ class brave{ "description" => isset($result["review"]["description"]) ? $this->limitstrlen( - $result["review"]["description"] + strip_tags( + $result["review"]["description"] + ) ) : $this->titledots( $this->fuckhtml @@ -839,6 +853,32 @@ class brave{ "value" => $this->titledots($info["long_desc"]) ]; } + + // parse ratings + if( + isset($info["ratings"]) && + $info["ratings"] != "void 0" + ){ + + $description[] = [ + "type" => "title", + "value" => "Ratings" + ]; + + foreach($info["ratings"] as $rating){ + + $description[] = [ + "type" => "link", + "url" => $rating["profile"]["url"], + "value" => $rating["profile"]["name"] + ]; + + $description[] = [ + "type" => "text", + "value" => ": " . $rating["ratingValue"] . "/" . $rating["bestRating"] . "\n" + ]; + } + } } $table = []; @@ -908,9 +948,9 @@ class brave{ $out["video"][] = [ "title" => $this->titledots($video["title"]), "description" => $this->titledots($video["description"]), - "date" => isset($video["age"]) ? strtotime($video["age"]) : null, - "duration" => isset($video["video"]["duration"]) ? $this->hms2int($video["video"]["duration"]) : null, - "views" => null, + "date" => isset($video["age"]) && $video["age"] != "void 0" ? strtotime($video["age"]) : null, + "duration" => isset($video["video"]["duration"]) && $video["video"]["duration"] != "void 0" ? $this->hms2int($video["video"]["duration"]) : null, + "views" => isset($video["video"]["views"]) && $video["video"]["views"] != "void 0" ? (int)$video["video"]["views"] : null, "thumb" => isset($video["thumbnail"]["src"]) ? [ @@ -1008,37 +1048,75 @@ class brave{ public function news($get){ - $search = $get["s"]; - if(strlen($search) === 0){ + if($get["npt"]){ - throw new Exception("Search term is empty!"); - } - - $nsfw = $get["nsfw"]; - $country = $get["country"]; - - if(strlen($search) > 2048){ + [$req, $proxy] = $this->backend->get($get["npt"], "news"); - throw new Exception("Search query is too long!"); - } - /* - $handle = fopen("scraper/brave-news.html", "r"); - $html = fread($handle, filesize("scraper/brave-news.html")); - fclose($handle);*/ - try{ - $html = - $this->get( - "https://search.brave.com/news", - [ - "q" => $search - ], - $nsfw, - $country - ); + $req = json_decode($req, true); - }catch(Exception $error){ + $search = $req["q"]; + $country = $req["country"]; + $nsfw = $req["nsfw"]; + $offset = $req["offset"]; + $spellcheck = $req["spellcheck"]; - throw new Exception("Could not fetch search page"); + try{ + $html = + $this->get( + $proxy, + "https://search.brave.com/news", + [ + "q" => $search, + "offset" => $offset, + "spellcheck" => $spellcheck + ], + $nsfw, + $country + ); + + }catch(Exception $error){ + + throw new Exception("Could not fetch search page"); + } + + }else{ + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + if(strlen($search) > 2048){ + + throw new Exception("Search term is too long!"); + } + + $proxy = $this->backend->get_ip(); + $nsfw = $get["nsfw"]; + $country = $get["country"]; + $spellcheck = $get["spellcheck"] == "yes" ? "1" : "0"; + + /* + $handle = fopen("scraper/brave-news.html", "r"); + $html = fread($handle, filesize("scraper/brave-news.html")); + fclose($handle);*/ + try{ + $html = + $this->get( + $proxy, + "https://search.brave.com/news", + [ + "q" => $search, + "spellcheck" => $spellcheck + ], + $nsfw, + $country + ); + + }catch(Exception $error){ + + throw new Exception("Could not fetch search page"); + } } $out = [ @@ -1050,6 +1128,17 @@ class brave{ // load html $this->fuckhtml->load($html); + // get npt + $out["npt"] = + $this->generatenextpagetoken( + $search, + $nsfw, + $country, + $spellcheck, + "news", + $proxy + ); + $news = $this->fuckhtml ->getElementsByClassName( @@ -1183,8 +1272,19 @@ class brave{ public function image($get){ $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + if(strlen($search) > 2048){ + + throw new Exception("Search term is too long!"); + } + $country = $get["country"]; $nsfw = $get["nsfw"]; + $spellcheck = $get["spellcheck"] == "yes" ? "1" : "0"; $out = [ "status" => "ok", @@ -1195,9 +1295,11 @@ class brave{ try{ $html = $this->get( + $this->backend->get_ip(), // no nextpage right now, pass proxy directly "https://search.brave.com/images", [ - "q" => $search + "q" => $search, + "spellcheck" => $spellcheck ], $nsfw, $country @@ -1261,9 +1363,75 @@ class brave{ public function video($get){ - $search = $get["s"]; - $country = $get["country"]; - $nsfw = $get["nsfw"]; + if($get["npt"]){ + + [$npt, $proxy] = $this->backend->get($get["npt"], "videos"); + + $npt = json_decode($npt, true); + $search = $npt["q"]; + $offset = $npt["offset"]; + $spellcheck = $npt["spellcheck"]; + $country = $npt["country"]; + $nsfw = $npt["nsfw"]; + + try{ + $html = + $this->get( + $proxy, + "https://search.brave.com/videos", + [ + "q" => $search, + "offset" => $offset, + "spellcheck" => $spellcheck + ], + $nsfw, + $country + ); + + }catch(Exception $error){ + + throw new Exception("Could not fetch search page"); + } + + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + if(strlen($search) > 2048){ + + throw new Exception("Search term is too long!"); + } + + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $spellcheck = $get["spellcheck"] == "yes" ? "1" : "0"; + + $proxy = $this->backend->get_ip(); + + try{ + $html = + $this->get( + $proxy, + "https://search.brave.com/videos", + [ + "q" => $search, + "spellcheck" => $spellcheck + ], + $nsfw, + $country + ); + + }catch(Exception $error){ + + throw new Exception("Could not fetch search page"); + } + } + + $this->fuckhtml->load($html); $out = [ "status" => "ok", @@ -1275,21 +1443,17 @@ class brave{ "reel" => [] ]; - try{ - $html = - $this->get( - "https://search.brave.com/videos", - [ - "q" => $search - ], - $nsfw, - $country - ); - - }catch(Exception $error){ - - throw new Exception("Could not fetch search page"); - } + // get npt + $out["npt"] = + $this->generatenextpagetoken( + $search, + $nsfw, + $country, + $spellcheck, + "videos", + $proxy + ); + /* $handle = fopen("scraper/brave-video.html", "r"); $html = fread($handle, filesize("scraper/brave-video.html")); @@ -1606,7 +1770,7 @@ class brave{ $data["table"][trim($html[0])] = trim($html[1]); } } - + /* private function getimagelinkfromstyle($thumb){ $thumb = @@ -1646,13 +1810,13 @@ class brave{ "url" => $url, "ratio" => "16:9" ]; - } + }*/ private function limitstrlen($text){ return explode("\n", wordwrap($text, 300, "\n"))[0]; } - + /* private function limitwhitespace($text){ return @@ -1661,7 +1825,7 @@ class brave{ " ", $text ); - } + }*/ private function titledots($title){ @@ -1678,6 +1842,52 @@ class brave{ return trim($title); } + private function generatenextpagetoken($q, $nsfw, $country, $spellcheck, $page, $proxy){ + + $nextpage = + $this->fuckhtml + ->getElementsByClassName("btn", "a"); + + if(count($nextpage) !== 0){ + + $nextpage = + $nextpage[count($nextpage) - 1]; + + if( + strtolower( + $this->fuckhtml + ->getTextContent( + $nextpage + ) + ) == "next" + ){ + + preg_match( + '/offset=([0-9]+)/', + $this->fuckhtml->getTextContent($nextpage["attributes"]["href"]), + $nextpage + ); + + return + $this->backend->store( + json_encode( + [ + "q" => $q, + "offset" => (int)$nextpage[1], + "nsfw" => $nsfw, + "country" => $country, + "spellcheck" => $spellcheck + ] + ), + $page, + $proxy + ); + } + } + + return null; + } + private function unshiturl($url){ // https://imgs.search.brave.com/XFnbR8Sl7ge82MBDEH7ju0UHImRovMVmQ2qnDvgNTuA/rs:fit:844:225:1/g:ce/aHR0cHM6Ly90c2U0/Lm1tLmJpbmcubmV0/L3RoP2lkPU9JUC54/UWotQXU5N2ozVndT/RDJnNG9BNVhnSGFF/SyZwaWQ9QXBp.jpeg diff --git a/scraper/ddg.php b/scraper/ddg.php index 1ce8e18..2d737ba 100644 --- a/scraper/ddg.php +++ b/scraper/ddg.php @@ -4,8 +4,11 @@ class ddg{ public function __construct(){ - include "lib/nextpage.php"; - $this->nextpage = new nextpage("ddg"); + include "lib/backend.php"; + $this->backend = new backend("ddg"); + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); } /* @@ -14,7 +17,7 @@ class ddg{ private const req_web = 0; private const req_xhr = 1; - private function get($url, $get = [], $reqtype = self::req_web){ + private function get($proxy, $url, $get = [], $reqtype = self::req_web){ $curlproc = curl_init(); @@ -28,7 +31,7 @@ class ddg{ switch($reqtype){ case self::req_web: $headers = - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0", + ["User-Agent: " . config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Encoding: gzip", "Accept-Language: en-US,en;q=0.5", @@ -43,7 +46,7 @@ class ddg{ case self::req_xhr: $headers = - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0", + ["User-Agent: " . config::USER_AGENT, "Accept: */*", "Accept-Encoding: gzip", "Accept-Language: en-US,en;q=0.5", @@ -57,6 +60,8 @@ class ddg{ break; } + $this->backend->assign_proxy($curlproc, $proxy); + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); @@ -69,7 +74,6 @@ class ddg{ $data = curl_exec($curlproc); if(curl_errno($curlproc)){ - throw new Exception(curl_error($curlproc)); } @@ -541,9 +545,11 @@ class ddg{ public function web($get){ + $proxy = null; + if($get["npt"]){ - $jsgrep = $this->nextpage->get($get["npt"], "web"); + [$jsgrep, $proxy] = $this->backend->get($get["npt"], "web"); $extendedsearch = false; $inithtml = ""; @@ -555,6 +561,7 @@ class ddg{ throw new Exception("Search term is empty!"); } + $proxy = $this->backend->get_ip(); $country = $get["country"]; $nsfw = $get["nsfw"]; $older = $get["older"]; @@ -614,9 +621,9 @@ class ddg{ /* Get html */ - // https://duckduckgo.com/?q=minecraft&kz=1&k1=-1&kp=-2 try{ $inithtml = $this->get( + $proxy, "https://duckduckgo.com/", $get_filters ); @@ -643,6 +650,7 @@ class ddg{ try{ $js = $this->get( + $proxy, "https://links.duckduckgo.com" . $jsgrep, [], ddg::req_xhr @@ -692,6 +700,7 @@ class ddg{ // get definition $wordnikjs = $this->get( + $proxy, "https://duckduckgo.com/js/spice/dictionary/definition/" . $wordnik, [], ddg::req_xhr @@ -725,6 +734,7 @@ class ddg{ $wordnikaudio_json = json_decode( $this->get( + $proxy, "https://duckduckgo.com/js/spice/dictionary/audio/" . $wordnik, [], ddg::req_xhr @@ -922,6 +932,7 @@ class ddg{ try{ $stackjs = $this->get( + $proxy, "https://duckduckgo.com" . $stack, [], ddg::req_xhr @@ -944,7 +955,7 @@ class ddg{ $out["answer"][] = [ "title" => $stackjson["Heading"], - "description" => $this->htmltoarray($stackjson["Abstract"]), + "description" => $this->stackoverflow_parse($stackjson["Abstract"]), "url" => str_replace(["http://", "ddg"], ["https://", ""], $stackjson["AbstractURL"]), "thumb" => null, "table" => [], @@ -973,6 +984,7 @@ class ddg{ try{ $lyricsjs = $this->get( + $proxy, "https://duckduckgo.com" . $lyrics, [], ddg::req_xhr @@ -1166,13 +1178,13 @@ class ddg{ if(isset($answers[$i]["data"]["AbstractText"]) && !empty($answers[$i]["data"]["AbstractText"])){ - $description = $this->htmltoarray($answers[$i]["data"]["AbstractText"]); + $description = $this->stackoverflow_parse($answers[$i]["data"]["AbstractText"]); }elseif(isset($answers[$i]["data"]["Abstract"]) && !empty($answers[$i]["data"]["Abstract"])){ - $description = $this->htmltoarray($answers[$i]["data"]["Abstract"]); + $description = $this->stackoverflow_parse($answers[$i]["data"]["Abstract"]); }elseif(isset($answers[$i]["data"]["Answer"]) && !empty($answers[$i]["data"]["Answer"])){ - $description = $this->htmltoarray($answers[$i]["data"]["Answer"]); + $description = $this->stackoverflow_parse($answers[$i]["data"]["Answer"]); }else{ $description = []; @@ -1310,6 +1322,7 @@ class ddg{ $description = []; $shitcoinjs = $this->get( + $proxy, "https://duckduckgo.com/js/spice/cryptocurrency/{$shitcoins[1]}/{$shitcoins[2]}/1", [], ddg::req_xhr @@ -1408,6 +1421,7 @@ class ddg{ try{ $currencyjs = $this->get( + $proxy, "https://duckduckgo.com/js/spice/currency/{$amount}/" . strtolower($currencies[1]) . "/" . strtolower($currencies[2]), [], ddg::req_xhr @@ -1607,7 +1621,7 @@ class ddg{ // store next page token if(isset($web[$i]["n"])){ - $out["npt"] = $this->nextpage->store($web[$i]["n"] . "&biaexp=b&eslexp=a&litexp=c&msvrtexp=b&wrap=1", "web"); + $out["npt"] = $this->backend->store($web[$i]["n"] . "&biaexp=b&eslexp=a&litexp=c&msvrtexp=b&wrap=1", "web", $proxy); continue; } @@ -1874,10 +1888,11 @@ class ddg{ if($get["npt"]){ - $npt = $this->nextpage->get($get["npt"], "images"); + [$npt, $proxy] = $this->backend->get($get["npt"], "images"); try{ $json = json_decode($this->get( + $proxy, "https://duckduckgo.com/i.js?" . $npt, [], ddg::req_xhr @@ -1895,6 +1910,7 @@ class ddg{ throw new Exception("Search term is empty!"); } + $proxy = $this->backend->get_ip(); $country = $get["country"]; $nsfw = $get["nsfw"]; $date = $get["date"]; @@ -1934,6 +1950,7 @@ class ddg{ try{ $html = $this->get( + $proxy, "https://duckduckgo.com", $get_filters, ddg::req_web @@ -1980,6 +1997,7 @@ class ddg{ try{ $json = json_decode($this->get( + $proxy, "https://duckduckgo.com/i.js", $js_params, ddg::req_xhr @@ -2005,10 +2023,11 @@ class ddg{ } $out["npt"] = - $this->nextpage->store( + $this->backend->store( explode("?", $json["next"])[1] . "&vqd=" . $vqd, - "images" + "images", + $proxy ); } @@ -2046,10 +2065,11 @@ class ddg{ if($get["npt"]){ - $npt = $this->nextpage->get($get["npt"], "videos"); + [$npt, $proxy] = $this->backend->get($get["npt"], "videos"); try{ $json = json_decode($this->get( + $proxy, "https://duckduckgo.com/v.js?" . $npt, [], @@ -2068,6 +2088,7 @@ class ddg{ throw new Exception("Search term is empty!"); } + $proxy = $this->backend->get_ip(); $country = $get["country"]; $nsfw = $get["nsfw"]; $date = $get["date"]; @@ -2099,6 +2120,7 @@ class ddg{ try{ $html = $this->get( + $proxy, "https://duckduckgo.com", $get_filters, ddg::req_web @@ -2123,6 +2145,7 @@ class ddg{ try{ $json = json_decode($this->get( + $proxy, "https://duckduckgo.com/v.js", [ "l" => "us-en", @@ -2155,9 +2178,10 @@ class ddg{ if(isset($json["next"])){ $out["npt"] = - $this->nextpage->store( + $this->backend->store( explode("?", $json["next"])[1], - "videos" + "videos", + $proxy ); } @@ -2213,11 +2237,12 @@ class ddg{ if($get["npt"]){ - $req = $this->nextpage->get($get["npt"], "news"); + [$req, $proxy] = $this->backend->get($get["npt"], "news"); try{ $json = json_decode($this->get( + $proxy, "https://duckduckgo.com/news.js?" . $req, [], @@ -2236,6 +2261,7 @@ class ddg{ throw new Exception("Search term is empty!"); } + $proxy = $this->backend->get_ip(); $country = $get["country"]; $nsfw = $get["nsfw"]; $date = $get["date"]; @@ -2261,6 +2287,7 @@ class ddg{ try{ $html = $this->get( + $proxy, "https://duckduckgo.com", $get_params, ddg::req_web @@ -2303,6 +2330,7 @@ class ddg{ } $json = json_decode($this->get( + $proxy, "https://duckduckgo.com/news.js", $js_params, ddg::req_xhr @@ -2323,9 +2351,10 @@ class ddg{ if(isset($json["next"])){ $out["npt"] = - $this->nextpage->store( + $this->backend->store( explode("?", $json["next"])[1], - "news" + "news", + $proxy ); } @@ -2415,192 +2444,193 @@ class ddg{ return "https://" . $parse["host"] . "/th?id=" . urlencode($parts["id"]); } - private function htmltoarray($html){ + private function appendtext($payload, &$text, &$index){ - $html = strip_tags($html, ["img", "pre", "code", "br", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "a"]); - - libxml_use_internal_errors(true); - $dom = new DOMDocument("1.0", "utf-8"); - $dom->loadHTML('<div>' . $html . '</div>'); - $xpath = new DOMXPath($dom); - $descendants = $xpath->query('//div/node()'); - - $images = $xpath->query('//div/node()/img'); - $imageiterator = 0; + if(trim($payload) == ""){ + + return; + } - if(count($descendants) === 0){ + if( + $index !== 0 && + $text[$index - 1]["type"] == "text" + ){ - return [ + $text[$index - 1]["value"] .= preg_replace('/ $/', " ", $payload); + }else{ + + $text[] = [ "type" => "text", - "value" => $this->unescapehtml($html) + "value" => preg_replace('/ $/', " ", $payload) ]; + $index++; } + } + + private function stackoverflow_parse($html){ - $array = []; - $previoustype = null; + $i = 0; + $answer = []; - foreach($descendants as $node){ - - // $node->nodeValue = iconv("UTF-8", "ISO-8859-1//TRANSLIT", $node->nodeValue); + $this->fuckhtml->load($html); + + $tags = $this->fuckhtml->getElementsByTagName("*"); + + if(count($tags) === 0){ - // get node type - switch($node->nodeName){ - case "#text": - $type = "text"; - break; - - case "pre": - $type = "code"; - break; - - case "code": - $type = "inline_code"; - break; - - case "h1": - case "h2": - case "h3": - case "h4": - case "h5": - case "h6": - $type = "title"; - break; - - case "blockquote": - $type = "quote"; - break; - - case "a": - $type = "link"; - break; - - case "img": - $type = "image"; - break; - } + return [ + [ + "type" => "text", + "value" => htmlspecialchars_decode($html) + ] + ]; + } + + foreach($tags as $snippet){ - // add node to array - switch($type){ + switch($snippet["tagName"]){ - case "text": - $value = preg_replace( - '/ {2,}/', - " ", - $this->limitnewlines($this->unescapehtml($node->textContent)) - ); + case "p": + $this->fuckhtml->load($snippet["innerHTML"]); - if( - $previoustype == "quote" || - $previoustype === null || - $previoustype == "image" || - $previoustype == "title" || - $previoustype == "code" - ){ - - $value = ltrim($value); - } + $codetags = + $this->fuckhtml + ->getElementsByTagName("*"); - if($value == ""){ - - $previoustype = $type; - continue 2; - } + $tmphtml = $snippet["innerHTML"]; - // merge with previous text node - if($previoustype == "text"){ + foreach($codetags as $tag){ - $array[count($array) - 1]["value"] = trim($array[count($array) - 1]["value"]) . "\n" . $this->bstoutf8($value); - }else{ + if(!isset($tag["outerHTML"])){ + + continue; + } - $array[] = [ - "type" => "text", - "value" => $this->bstoutf8($value) - ]; + $tmphtml = + explode( + $tag["outerHTML"], + $tmphtml, + 2 + ); + + $value = $this->fuckhtml->getTextContent($tmphtml[0], false, false); + $this->appendtext($value, $answer, $i); + + $type = null; + switch($tag["tagName"]){ + + case "code": $type = "inline_code"; break; + case "em": $type = "italic"; break; + case "blockquote": $type = "quote"; break; + default: $type = "text"; + } + + if($type !== null){ + $value = $this->fuckhtml->getTextContent($tag, false, false); + + if(trim($value) != ""){ + + $answer[] = [ + "type" => $type, + "value" => rtrim($value) + ]; + $i++; + } + } + + if(count($tmphtml) === 2){ + + $tmphtml = $tmphtml[1] . "\n"; + }else{ + + break; + } } - break; - - case "inline_code": - case "bold": - $array[] = [ - "type" => "inline_code", - "value" => $this->bstoutf8(trim($this->limitnewlines($this->unescapehtml($node->textContent)))) - ]; - break; - - case "link": - // check for link nested inside of image - if(strlen($node->childNodes->item(0)->textContent) !== 0){ + if(is_array($tmphtml)){ - $array[] = [ - "type" => "link", - "value" => $this->bstoutf8(trim($this->unescapehtml($node->textContent))), - "url" => $this->bstoutf8(preg_replace('/\/ddg$/', "", preg_replace('/^http:\/\//', "https://", $this->sanitizeurl($node->getAttribute("href"))))) - ]; - break; + $tmphtml = $tmphtml[0]; } - $type = "image"; - - if($previoustype == "text"){ + if(strlen($tmphtml) !== 0){ - $array[count($array) - 1]["value"] = rtrim($array[count($array) - 1]["value"]); + $value = $this->fuckhtml->getTextContent($tmphtml, true, false); + $this->appendtext($value, $answer, $i); } - - $array[] = [ - "type" => "image", - "url" => $this->bstoutf8(preg_replace('/^http:\/\//', "https://", preg_replace('/^\/\/images\.duckduckgo\.com\/iu\/\?u=/', "", $images->item($imageiterator)->getAttribute("src")))) - ]; - - $imageiterator++; - break; - case "image": - - if($previoustype == "text"){ - - $array[count($array) - 1]["value"] = rtrim($array[count($array) - 1]["value"]); - } - - $array[] = [ + case "img": + $answer[] = [ "type" => "image", - "url" => $this->bstoutf8(preg_replace('/^http:\/\//', "https://", preg_replace('/^\/\/images\.duckduckgo\.com\/iu\/\?u=/', "", $node->getAttribute("src")))) + "url" => + $this->fuckhtml + ->getTextContent( + $tag["attributes"]["src"] + ) ]; + $i++; break; - case "quote": - case "title": - case "code": - if($previoustype == "text"){ + case "pre": + switch($answer[$i - 1]["type"]){ - $array[count($array) - 1]["value"] = rtrim($array[count($array) - 1]["value"]); + case "text": + case "italic": + $answer[$i - 1]["value"] = rtrim($answer[$i - 1]["value"]); + break; } - // no break - - default: - $value = trim($this->limitnewlines($this->unescapehtml($node->textContent))); - if($type != "code"){ - - $value = preg_replace( - '/ {2,}/', - " ", - $value + $answer[] = + [ + "type" => "code", + "value" => + rtrim( + $this->fuckhtml + ->getTextContent( + $snippet, + true, + false + ) + ) + ]; + $i++; + + break; + + case "ol": + $o = 0; + + $this->fuckhtml->load($snippet); + $li = + $this->fuckhtml + ->getElementsByTagName("li"); + + foreach($li as $elem){ + $o++; + + $this->appendtext( + $o . ". " . + $this->fuckhtml + ->getTextContent( + $elem + ), + $answer, + $i ); } - - $array[] = [ - "type" => $type, - "value" => $this->bstoutf8($value) - ]; break; } + } + + if( + $i !== 0 && + $answer[$i - 1]["type"] == "text" + ){ - $previoustype = $type; + $answer[$i - 1]["value"] = rtrim($answer[$i - 1]["value"]); } - return $array; + return $answer; } private function bstoutf8($bs){ diff --git a/scraper/facebook.php b/scraper/facebook.php index 7bd576b..395a863 100644 --- a/scraper/facebook.php +++ b/scraper/facebook.php @@ -9,6 +9,9 @@ class facebook{ include "lib/nextpage.php"; $this->nextpage = new nextpage("fb"); + + include "lib/proxy_pool.php"; + $this->proxy = new proxy_pool("facebook"); } public function getfilters($page){ @@ -104,6 +107,8 @@ class facebook{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->proxy->assign_proxy($curlproc); $data = curl_exec($curlproc); diff --git a/scraper/ftm.php b/scraper/ftm.php index af39c12..0cdfbb3 100644 --- a/scraper/ftm.php +++ b/scraper/ftm.php @@ -4,8 +4,8 @@ class ftm{ public function __construct(){ - include "lib/nextpage.php"; - $this->nextpage = new nextpage("ftm"); + include "lib/backend.php"; + $this->backend = new backend("ftm"); } public function getfilters($page){ @@ -13,7 +13,7 @@ class ftm{ return []; } - private function get($url, $search, $offset){ + private function get($proxy, $url, $search, $offset){ $curlproc = curl_init(); @@ -29,7 +29,7 @@ class ftm{ curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + ["User-Agent: " . config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", @@ -56,6 +56,8 @@ class ftm{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); @@ -70,8 +72,6 @@ class ftm{ public function image($get){ - $search = $get["s"]; - $out = [ "status" => "ok", "npt" => null, @@ -80,16 +80,28 @@ class ftm{ if($get["npt"]){ - $count = (int)$this->nextpage->get($get["npt"], "images"); + [$data, $proxy] = $this->backend->get($get["npt"], "images"); + $data = json_decode($data, true); + + $count = $data["count"]; + $search = $data["search"]; }else{ + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + $count = 0; + $proxy = $this->backend->get_ip(); } try{ $json = json_decode( $this->get( + $proxy, "https://findthatmeme.com/api/v1/search", $search, $count @@ -134,14 +146,15 @@ class ftm{ ]; } - if($count === 50){ - - $out["npt"] = - $this->nextpage->store( - $count, - "images" - ); - } + $out["npt"] = + $this->backend->store( + json_encode([ + "count" => $count, + "search" => $search + ]), + "images", + $proxy + ); return $out; } diff --git a/scraper/google.php b/scraper/google.php index ca77231..055d12a 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -10,8 +10,8 @@ class google{ include "lib/fuckhtml.php"; $this->fuckhtml = new fuckhtml(); - include "lib/nextpage.php"; - $this->nextpage = new nextpage("google"); + include "lib/backend.php"; + $this->backend = new backend("google"); } public function getfilters($page){ @@ -727,7 +727,7 @@ class google{ } } - private function get($url, $get = []){ + private function get($proxy, $url, $get = []){ $headers = [ "User-Agent: Mozilla/5.0 (Linux; U; Android 2.3.3; pt-pt; LG-P500h-parrot Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1 MMS/LG-Android-MMS-V1.0/1.2", @@ -760,6 +760,8 @@ class google{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); @@ -771,7 +773,7 @@ class google{ curl_close($curlproc); return $data; } - + /* public function web($get){ $search = $get["s"]; @@ -877,9 +879,9 @@ class google{ if(count($title) !== 0){ - /* - Container is a web link - */ + // + // Container is a web link + // $web = [ "title" => $this->titledots( @@ -1051,9 +1053,9 @@ class google{ continue; } - /* - Parse rating object - */ + // + // Parse rating object + // if($is_rating >= -1){ @@ -1102,9 +1104,9 @@ class google{ continue; } - /* - Parse standalone text - */ + // + // Parse standalone text + // $additional_info[] = $innertext; } } @@ -1194,9 +1196,9 @@ class google{ $container_title == "people also search for" ){ - /* - Parse related searches - */ + // + // Parse related searches + // $as = $this->fuckhtml ->getElementsByTagName("a"); @@ -1212,9 +1214,9 @@ class google{ continue; } - /* - Parse image carousel - */ + // + // Parse image carousel + // $title_container = $this->fuckhtml ->getElementsByClassName( @@ -1239,9 +1241,9 @@ class google{ if($title_container == "imagesview all"){ - /* - Image carousel - */ + // + // Image carousel + // $pcitem = $this->fuckhtml ->getElementsByClassName( @@ -1316,9 +1318,9 @@ class google{ } } - /* - Get next page - */ + // + // Get next page + // $as = $this->fuckhtml ->getElementsByTagName("a"); @@ -1340,7 +1342,7 @@ class google{ } return $out; - } + }*/ public function image($get){ @@ -1348,17 +1350,22 @@ class google{ // generate parameters if($get["npt"]){ - $params = - json_decode( - $this->nextpage->get( - $get["npt"], - "images" - ), - true + [$params, $proxy] = + $this->backend->get( + $get["npt"], + "images" ); + + $params = json_decode($params, true); }else{ $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); $country = $get["country"]; $nsfw = $get["nsfw"]; $lang = $get["lang"]; @@ -1475,6 +1482,7 @@ class google{ try{ $html = $this->get( + $proxy, "https://www.google.com/search", $params ); @@ -1578,9 +1586,10 @@ class google{ $params["ijn"] = (int)$params["ijn"] + 1; $out["npt"] = - $this->nextpage->store( + $this->backend->store( json_encode($params), - "images" + "images", + $proxy ); }else{ @@ -1628,9 +1637,10 @@ class google{ $params["imgvl"] = $imgvl; $out["npt"] = - $this->nextpage->store( + $this->backend->store( json_encode($params), - "images" + "images", + $proxy ); } } diff --git a/scraper/imgur.php b/scraper/imgur.php index 4a16de7..23efe00 100644 --- a/scraper/imgur.php +++ b/scraper/imgur.php @@ -4,11 +4,11 @@ class imgur{ public function __construct(){ - include "lib/nextpage.php"; - $this->nextpage = new nextpage("imgur"); - include "lib/fuckhtml.php"; $this->fuckhtml = new fuckhtml(); + + include "lib/backend.php"; + $this->backend = new backend("imgur"); } public function getfilters($page){ @@ -57,7 +57,7 @@ class imgur{ ]; } - private function get($url, $get = []){ + private function get($proxy, $url, $get = []){ $curlproc = curl_init(); @@ -70,7 +70,7 @@ class imgur{ curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + ["User-Agent: " . config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", @@ -89,6 +89,8 @@ class imgur{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); @@ -105,15 +107,14 @@ class imgur{ if($get["npt"]){ - $filter = - json_decode( - $this->nextpage->get( - $get["npt"], - "images" - ), - true + [$filter, $proxy] = + $this->backend->get( + $get["npt"], + "images" ); + $filter = json_decode($filter, true); + $search = $filter["s"]; unset($filter["s"]); @@ -134,6 +135,12 @@ class imgur{ }else{ $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); $sort = $get["sort"]; $time = $get["time"]; $format = $get["format"]; @@ -165,6 +172,7 @@ class imgur{ try{ $html = $this->get( + $proxy, "https://imgur.com/search/$sort/$time/page/$page", $filter ); @@ -238,9 +246,10 @@ class imgur{ $filter["page"] = $page + 1; $out["npt"] = - $this->nextpage->store( + $this->backend->store( json_encode($filter), - "images" + "images", + $proxy ); } diff --git a/scraper/marginalia.php b/scraper/marginalia.php index c8ab09f..b790a97 100644 --- a/scraper/marginalia.php +++ b/scraper/marginalia.php @@ -3,7 +3,8 @@ class marginalia{ public function __construct(){ - $this->key = "public"; + include "lib/backend.php"; + $this->backend = new backend("marginalia"); } public function getfilters($page){ @@ -76,10 +77,10 @@ class marginalia{ } } - private function get($url, $get = []){ + private function get($proxy, $url, $get = []){ $headers = [ - "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + "User-Agent: " . config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", @@ -109,6 +110,8 @@ class marginalia{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); @@ -124,6 +127,11 @@ class marginalia{ public function web($get){ $search = [$get["s"]]; + if(strlen($get["s"]) === 0){ + + throw new Exception("Search term is empty!"); + } + $profile = $get["profile"]; $format = $get["format"]; $file = $get["file"]; @@ -184,7 +192,8 @@ class marginalia{ try{ $json = $this->get( - "https://api.marginalia.nu/{$this->key}/search/" . urlencode($search), + $this->backend->get_ip(), // no nextpage + "https://api.marginalia.nu/" . config::MARGINALIA_API_KEY . "/search/" . urlencode($search), $params ); }catch(Exception $error){ diff --git a/scraper/mojeek.php b/scraper/mojeek.php index e7e8abc..3d91c09 100644 --- a/scraper/mojeek.php +++ b/scraper/mojeek.php @@ -6,8 +6,8 @@ class mojeek{ include "lib/fuckhtml.php"; $this->fuckhtml = new fuckhtml(); - include "lib/nextpage.php"; - $this->nextpage = new nextpage("mojeek"); + include "lib/backend.php"; + $this->backend = new backend("mojeek"); } public function getfilters($page){ @@ -371,10 +371,10 @@ class mojeek{ } } - private function get($url, $get = []){ + private function get($proxy, $url, $get = []){ $headers = [ - "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + "User-Agent: " . config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", @@ -404,6 +404,8 @@ class mojeek{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); @@ -420,11 +422,12 @@ class mojeek{ if($get["npt"]){ - $token = $this->nextpage->get($get["npt"], "web"); + [$token, $proxy] = $this->backend->get($get["npt"], "web"); try{ $html = $this->get( + $proxy, "https://www.mojeek.com" . $token, [] ); @@ -485,9 +488,12 @@ class mojeek{ $params["si"] = $domain; } + $proxy = $this->backend->get_ip(); + try{ $html = $this->get( + $proxy, "https://www.mojeek.com/search", $params ); @@ -529,88 +535,90 @@ class mojeek{ return $out; } - $this->fuckhtml->load($results[0]); - /* - Get search results + Get all search result divs */ - $results = - $this->fuckhtml - ->getElementsByTagName("li"); - - foreach($results as $result){ - - $data = [ - "title" => null, - "description" => null, - "url" => null, - "date" => null, - "type" => "web", - "thumb" => [ - "url" => null, - "ratio" => null - ], - "sublink" => [], - "table" => [] - ]; - - $this->fuckhtml->load($result); + foreach($results as $container){ - $title = + $this->fuckhtml->load($container); + $results = $this->fuckhtml - ->getElementsByClassName("title", "a")[0]; + ->getElementsByTagName("li"); - $data["title"] = - html_entity_decode( + foreach($results as $result){ + + $data = [ + "title" => null, + "description" => null, + "url" => null, + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + + $this->fuckhtml->load($result); + + $title = $this->fuckhtml - ->getTextContent( - $title["innerHTML"] - ) - ); - - $data["url"] = - html_entity_decode( + ->getElementsByClassName("title", "a")[0]; + + $data["title"] = + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $title["innerHTML"] + ) + ); + + $data["url"] = + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $title["attributes"]["href"] + ) + ); + + $description = $this->fuckhtml - ->getTextContent( - $title["attributes"]["href"] - ) - ); - - $description = - $this->fuckhtml - ->getElementsByClassName( - "s", "p" - ); - - if(count($description) !== 0){ + ->getElementsByClassName( + "s", "p" + ); - $data["description"] = - $this->titledots( - html_entity_decode( - $this->fuckhtml - ->getTextContent( - $description[0] + if(count($description) !== 0){ + + $data["description"] = + $this->titledots( + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $description[0] + ) ) + ); + } + + $data["date"] = + explode( + " - ", + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName("i", "p")[1] ) ); + + $data["date"] = + strtotime( + $data["date"][count($data["date"]) - 1] + ); + + $out["web"][] = $data; } - - $data["date"] = - explode( - " - ", - $this->fuckhtml - ->getTextContent( - $this->fuckhtml - ->getElementsByClassName("i", "p")[1] - ) - ); - - $data["date"] = - strtotime( - $data["date"][count($data["date"]) - 1] - ); - - $out["web"][] = $data; } /* @@ -969,12 +977,13 @@ class mojeek{ if($a["innerHTML"] == "Next"){ - $out["npt"] = $this->nextpage->store( + $out["npt"] = $this->backend->store( $this->fuckhtml ->getTextContent( $a["attributes"]["href"] ), - "web" + "web", + $proxy ); } } @@ -1001,6 +1010,7 @@ class mojeek{ try{ $html = $this->get( + $this->backend->get_ip(), "https://www.mojeek.com/search", [ "q" => $search, @@ -1011,168 +1021,139 @@ class mojeek{ throw new Exception("Failed to get HTML"); } - /* $handle = fopen("scraper/mojeek.html", "r"); $html = fread($handle, filesize("scraper/mojeek.html")); - fclose($handle);*/ - - /* - Get big, standard and smaller nodes + fclose($handle); */ - foreach( - [ - "results-extended", - "results-standard" - ] - as $categoryname - ){ + + $this->fuckhtml->load($html); + + $articles = + $this->fuckhtml->getElementsByTagName("article"); + + foreach($articles as $article){ + + $this->fuckhtml->load($article); + + $data = [ + "title" => null, + "author" => null, + "description" => null, + "date" => null, + "thumb" => + [ + "url" => null, + "ratio" => null + ], + "url" => null + ]; + + $a = $this->fuckhtml->getElementsByTagName("a")[0]; + + $data["title"] = + $this->fuckhtml + ->getTextContent( + $a["attributes"]["title"] + ); + + $data["url"] = + $this->fuckhtml + ->getTextContent( + $a["attributes"]["href"] + ); + + $p = $this->fuckhtml->getElementsByTagName("p"); + + $data["description"] = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "s", + $p + )[0] + ) + ); - $this->fuckhtml->load($html); + if($data["description"] == ""){ + + $data["description"] = null; + } - $categories = + // get date from big node + $date = $this->fuckhtml ->getElementsByClassName( - $categoryname, - "ul" + "date", + $p ); - - foreach($categories as $category){ + + if(count($date) !== 0){ + + $data["date"] = + strtotime( + $this->fuckhtml + ->getTextContent( + $date[0] + ) + ); + } + + // grep date + author + $s = + $this->fuckhtml + ->getElementsByClassName( + "i", + $p + )[0]; + + $this->fuckhtml->load($s); + + $a = + $this->fuckhtml + ->getElementsByTagName("a"); + + if(count($a) !== 0){ - $this->fuckhtml->load($category); + // parse big node information + $data["author"] = + $this->fuckhtml + ->getTextContent( + $a[0]["innerHTML"] + ); + }else{ - $nodes = + // parse smaller nodes + $replace = $this->fuckhtml - ->getElementsByTagName("li"); + ->getElementsByTagName("time")[0]; - foreach($nodes as $node){ - - $data = [ - "title" => null, - "author" => null, - "description" => null, - "date" => null, - "thumb" => - [ - "url" => null, - "ratio" => null - ], - "url" => null - ]; - - /* - Parse the results - */ - $this->fuckhtml->load($node); - - // get title + url - $a = - $this->fuckhtml - ->getElementsByTagName("a")[0]; - - $data["title"] = - $this->fuckhtml - ->getTextContent( - $a["attributes"]["title"] - ); - - $data["url"] = + $data["date"] = + strtotime( $this->fuckhtml ->getTextContent( - $a["attributes"]["href"] - ); - - // get image - $image = - $this->fuckhtml - ->getElementsByTagName("img"); - - if(count($image) !== 0){ - - $data["thumb"] = [ - "url" => - urldecode( - str_replace( - "/image?img=", - "", - $this->fuckhtml - ->getTextContent( - $image[0]["attributes"]["src"] - ) - ) - ), - "ratio" => "16:9" - ]; - } - - // get description - $description = - $this->fuckhtml - ->getElementsByClassName("s", "p"); - - if(count($description) !== 0){ - - $data["description"] = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $description[0] - ) - ); - } - - // get date + time - $date = - $this->fuckhtml - ->getElementsByClassName( - "date", - "p" - ); - - $i = - $this->fuckhtml - ->getElementsByClassName("i", "p"); - - if(count($date) !== 0){ - - // we're inside a big node - $data["date"] = strtotime($date[0]["innerHTML"]); - - if(count($i) !== 0){ - - $this->fuckhtml->load($i[0]); - - $a = - $this->fuckhtml - ->getElementsByTagName("a"); - - if(count($a) !== 0){ - - $data["author"] = - $this->fuckhtml - ->getTextContent($a[0]); - } - } - }else{ - - // we're inside a small node - if(count($i) !== 0){ - - $i = - explode( - " - ", - $this->fuckhtml - ->getTextContent($i[0]) - ); - - $data["date"] = strtotime(array_pop($i)); - $data["author"] = implode(" - ", $i); - } - } - - $out["news"][] = $data; - } + $replace + ) + ); + + $s["innerHTML"] = + str_replace( + $replace["outerHTML"], + "", + $s["innerHTML"] + ); + + $data["author"] = + preg_replace( + '/ • $/', + "", + $s["innerHTML"] + ); } + + $out["news"][] = $data; } return $out; diff --git a/scraper/pinterest.php b/scraper/pinterest.php index 2bb5b71..37473a1 100644 --- a/scraper/pinterest.php +++ b/scraper/pinterest.php @@ -6,6 +6,9 @@ class pinterest{ include "lib/nextpage.php"; $this->nextpage = new nextpage("pinterest"); + + include "lib/proxy_pool.php"; + $this->proxy = new proxy_pool("pinterest"); } public function getfilters($page){ @@ -44,6 +47,8 @@ class pinterest{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->proxy->assign_proxy($curlproc); $data = curl_exec($curlproc); diff --git a/scraper/sc.php b/scraper/sc.php index 1f49f95..16d3931 100644 --- a/scraper/sc.php +++ b/scraper/sc.php @@ -4,10 +4,8 @@ class sc{ public function __construct(){ - include "lib/nextpage.php"; - $this->nextpage = new nextpage("sc"); - $this->client_id = "ArYppSEotE3YiXCO4Nsgid2LLqJutiww"; - $this->user_id = "766585-580597-163310-929698"; + include "lib/backend.php"; + $this->backend = new backend("sc"); } public function getfilters($page){ @@ -27,7 +25,7 @@ class sc{ ]; } - private function get($url, $get = []){ + private function get($proxy, $url, $get = []){ $curlproc = curl_init(); @@ -40,7 +38,7 @@ class sc{ curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0", + ["User-Agent: " . config::USER_AGENT, "Accept: application/json, text/javascript, */*; q=0.01", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", @@ -58,6 +56,8 @@ class sc{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); @@ -74,7 +74,7 @@ class sc{ if($get["npt"]){ - $params = $this->nextpage->get($get["npt"], "music"); + [$params, $proxy] = $this->backend->get($get["npt"], "music"); $params = json_decode($params, true); $url = $params["url"]; @@ -101,7 +101,13 @@ class sc{ // https://api-v2.soundcloud.com/search/playlists_without_albums?q=freddie%20dredd&variant_ids=&facet=genre&user_id=630591-269800-703400-765403&client_id=iMxZgT5mfGstBj8GWJbYMvpzelS8ne0E&limit=20&offset=0&linked_partitioning=1&app_version=1693487844&app_locale=en $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + $type = $get["type"]; + $proxy = $this->backend->get_ip(); switch($type){ @@ -111,8 +117,8 @@ class sc{ "q" => $search, "variant_ids" => "", "facet" => "model", - "user_id" => $this->user_id, - "client_id" => $this->client_id, + "user_id" => config::SC_USER_ID, + "client_id" => config::SC_CLIENT_TOKEN, "limit" => 20, "offset" => 0, "linked_partitioning" => 1, @@ -127,8 +133,8 @@ class sc{ "q" => $search, "variant_ids" => "", "facet_genre" => "", - "user_id" => $this->user_id, - "client_id" => $this->client_id, + "user_id" => config::SC_USER_ID, + "client_id" => config::SC_CLIENT_TOKEN, "limit" => 20, "offset" => 0, "linked_partitioning" => 1, @@ -143,8 +149,8 @@ class sc{ "q" => $search, "variant_ids" => "", "facet" => "place", - "user_id" => $this->user_id, - "client_id" => $this->client_id, + "user_id" => config::SC_USER_ID, + "client_id" => config::SC_CLIENT_TOKEN, "limit" => 20, "offset" => 0, "linked_partitioning" => 1, @@ -159,8 +165,8 @@ class sc{ "q" => $search, "variant_ids" => "", "facet" => "genre", - "user_id" => $this->user_id, - "client_id" => $this->client_id, + "user_id" => config::SC_USER_ID, + "client_id" => config::SC_CLIENT_TOKEN, "limit" => 20, "offset" => 0, "linked_partitioning" => 1, @@ -175,8 +181,8 @@ class sc{ "q" => $search, "variant_ids" => "", "facet" => "genre", - "user_id" => $this->user_id, - "client_id" => $this->client_id, + "user_id" => config::SC_USER_ID, + "client_id" => config::SC_CLIENT_TOKEN, "limit" => 20, "offset" => 0, "linked_partitioning" => 1, @@ -192,8 +198,8 @@ class sc{ "variant_ids" => "", "filter.content_tier" => "SUB_HIGH_TIER", "facet" => "genre", - "user_id" => $this->user_id, - "client_id" => $this->client_id, + "user_id" => config::SC_USER_ID, + "client_id" => config::SC_CLIENT_TOKEN, "limit" => 20, "offset" => 0, "linked_partitioning" => 1, @@ -206,7 +212,7 @@ class sc{ try{ - $json = $this->get($url, $params); + $json = $this->get($proxy, $url, $params); }catch(Exception $error){ @@ -244,9 +250,10 @@ class sc{ $params["url"] = $url; // we will remove this later $out["npt"] = - $this->nextpage->store( + $this->backend->store( json_encode($params), - "music" + "music", + $proxy ); } @@ -342,7 +349,7 @@ class sc{ "endpoint" => "audio_sc", "url" => $item["media"]["transcodings"][0]["url"] . - "?client_id=" . $this->client_id . + "?client_id=" . config::SC_CLIENT_TOKEN . "&track_authorization=" . $item["track_authorization"] ]; diff --git a/scraper/wiby.php b/scraper/wiby.php index a1daf57..e8351bc 100644 --- a/scraper/wiby.php +++ b/scraper/wiby.php @@ -4,8 +4,8 @@ class wiby{ public function __construct(){ - include "lib/nextpage.php"; - $this->nextpage = new nextpage("wiby"); + include "lib/backend.php"; + $this->backend = new backend("wiby"); } public function getfilters($page){ @@ -36,7 +36,7 @@ class wiby{ ]; } - private function get($url, $get = [], $nsfw){ + private function get($proxy, $url, $get = [], $nsfw){ $curlproc = curl_init(); @@ -45,11 +45,13 @@ class wiby{ $url .= "?" . $get; } + print_r([$proxy, $url]); + curl_setopt($curlproc, CURLOPT_URL, $url); curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + ["User-Agent: " . config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", @@ -69,6 +71,8 @@ class wiby{ curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + $this->backend->assign_proxy($curlproc, $proxy); + $data = curl_exec($curlproc); if(curl_errno($curlproc)){ @@ -84,11 +88,8 @@ class wiby{ if($get["npt"]){ - $q = - json_decode( - $this->nextpage->get($get["npt"], "web"), - true - ); + [$q, $proxy] = $this->backend->get($get["npt"], "web"); + $q = json_decode($q, true); $nsfw = $q["nsfw"]; unset($q["nsfw"]); @@ -100,6 +101,7 @@ class wiby{ throw new Exception("Search term is empty!"); } + $proxy = $this->backend->get_ip(); $date = $get["date"]; $nsfw = $get["nsfw"] == "yes" ? "0" : "1"; @@ -150,6 +152,7 @@ class wiby{ try{ $html = $this->get( + $proxy, "https://wiby.me/", $q, $nsfw @@ -171,13 +174,14 @@ class wiby{ }else{ $nextpage = - $this->nextpage->store( + $this->backend->store( json_encode([ "q" => $q["q"], "p" => (int)$nextpage[1], "nsfw" => $nsfw ]), - "web" + "web", + $proxy ); } diff --git a/scraper/yandex.php b/scraper/yandex.php index 65abe73..7335edc 100644 --- a/scraper/yandex.php +++ b/scraper/yandex.php @@ -10,11 +10,11 @@ class yandex{ include "lib/fuckhtml.php"; $this->fuckhtml = new fuckhtml(); - include "lib/nextpage.php"; - $this->nextpage = new nextpage("yandex"); + include "lib/backend.php"; + // backend included in the scraper functions } - private function get($url, $get = [], $nsfw){ + private function get($proxy, $url, $get = [], $nsfw){ $curlproc = curl_init(); @@ -32,7 +32,7 @@ class yandex{ } $headers = - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0", + ["User-Agent: " . config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Encoding: gzip", "Accept-Language: en-US,en;q=0.5", @@ -54,6 +54,8 @@ class yandex{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); @@ -207,6 +209,8 @@ class yandex{ public function web($get){ + $this->backend = new backend("yandex_w"); + // has captcha // https://yandex.com/search/touch/?text=lol&app_platform=android&appsearch_header=1&ui=webmobileapp.yandex&app_version=23070603&app_id=ru.yandex.searchplugin&search_source=yandexcom_touch_native&clid=2218567 @@ -215,10 +219,11 @@ class yandex{ if($get["npt"]){ - $npt = $this->nextpage->get($get["npt"], "web"); + [$npt, $proxy] = $this->backend->get($get["npt"], "web"); $html = $this->get( + $proxy, "https://yandex.com" . $npt, [], "yes" @@ -226,6 +231,12 @@ class yandex{ }else{ $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); $lang = $get["lang"]; $older = $get["older"]; $newer = $get["newer"]; @@ -269,6 +280,7 @@ class yandex{ try{ $html = $this->get( + $proxy, "https://yandex.com/search/site/", $params, "yes" @@ -313,7 +325,7 @@ class yandex{ if(count($npt) !== 0){ $out["npt"] = - $this->nextpage->store( + $this->backend->store( $this->fuckhtml ->getTextContent( $npt @@ -321,7 +333,8 @@ class yandex{ ["attributes"] ["href"] ), - "web" + "web", + $proxy ); } @@ -386,17 +399,18 @@ class yandex{ public function image($get){ + $this->backend = new backend("yandex_i"); + if($get["npt"]){ - $request = - json_decode( - $this->nextpage->get( - $get["npt"], - "images" - ), - true + [$request, $proxy] = + $this->backend->get( + $get["npt"], + "images" ); + $request = json_decode($request, true); + $nsfw = $request["nsfw"]; unset($request["nsfw"]); }else{ @@ -407,6 +421,7 @@ class yandex{ throw new Exception("Search term is empty!"); } + $proxy = $this->backend->get_ip(); $nsfw = $get["nsfw"]; $time = $get["time"]; $size = $get["size"]; @@ -611,9 +626,11 @@ class yandex{ try{ $json = $this->get( + $proxy, "https://yandex.com/images/search", $request, - $nsfw + $nsfw, + "yandex_i" ); }catch(Exception $err){ @@ -676,7 +693,12 @@ class yandex{ $request["p"] = 1; } - $out["npt"] = $this->nextpage->store(json_encode($request), "images"); + $out["npt"] = + $this->backend->store( + json_encode($request), + "images", + $proxy + ); } // get search results @@ -744,21 +766,29 @@ class yandex{ public function video($get){ + $this->backend = new backend("yandex_v"); + if($get["npt"]){ - $params = - json_decode( - $this->nextpage->get( - $get["npt"], - "web" - ), - true + [$params, $proxy] = + $this->backend->get( + $get["npt"], + "video" ); + $params = json_decode($params, true); + $nsfw = $params["nsfw"]; unset($params["nsfw"]); }else{ + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); $nsfw = $get["nsfw"]; $time = $get["time"]; $duration = $get["duration"]; @@ -865,9 +895,11 @@ class yandex{ try{ $json = $this->get( + $proxy, "https://yandex.com/video/search", $params, - $nsfw + $nsfw, + "yandex_v" ); }catch(Exception $error){ @@ -926,9 +958,10 @@ class yandex{ $params["p"] = "1"; $params["nsfw"] = $nsfw; $out["npt"] = - $this->nextpage->store( + $this->backend->store( json_encode($params), - "web" + "video", + $proxy ); } diff --git a/scraper/yep.php b/scraper/yep.php index 8ff4a57..7a73635 100644 --- a/scraper/yep.php +++ b/scraper/yep.php @@ -4,8 +4,8 @@ class yep{ public function __construct(){ - include "lib/nextpage.php"; - $this->nextpage = new nextpage("yep"); + include "lib/backend.php"; + $this->backend = new backend("yep"); } public function getfilters($page){ @@ -238,7 +238,7 @@ class yep{ ]; } - private function get($url, $get = []){ + private function get($proxy, $url, $get = []){ $curlproc = curl_init(); @@ -251,7 +251,7 @@ class yep{ curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + ["User-Agent: " . config::USER_AGENT, "Accept: */*", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", @@ -269,6 +269,8 @@ class yep{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); @@ -284,6 +286,11 @@ class yep{ public function image($get){ $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + $country = $get["country"]; $nsfw = $get["nsfw"]; @@ -305,6 +312,7 @@ class yep{ $json = json_decode( $this->get( + $this->backend->get_ip(), // no nextpage! "https://api.yep.com/fs/2/search", [ "client" => "web", diff --git a/scraper/youtube.php b/scraper/youtube.php index 83a68ba..526b026 100644 --- a/scraper/youtube.php +++ b/scraper/youtube.php @@ -8,8 +8,8 @@ class youtube{ public function __construct(){ - include "lib/nextpage.php"; - $this->nextpage = new nextpage("yt"); + include "lib/backend.php"; + $this->backend = new backend("yt"); } public function getfilters($page){ @@ -340,7 +340,7 @@ class youtube{ const req_web = 0; const req_xhr = 1; - private function get($url, $get = [], $reqtype = self::req_web, $continuation = null){ + private function get($proxy, $url, $get = [], $reqtype = self::req_web, $continuation = null){ $curlproc = curl_init(); @@ -354,7 +354,7 @@ class youtube{ switch($reqtype){ case self::req_web: $headers = - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", + ["User-Agent: " . config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", @@ -370,7 +370,7 @@ class youtube{ case self::req_xhr: $headers = - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:110.0) Gecko/20100101 Firefox/110.0", + ["User-Agent: " . config::USER_AGENT, "Accept: */*", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", @@ -397,6 +397,8 @@ class youtube{ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); $data = curl_exec($curlproc); @@ -430,17 +432,17 @@ class youtube{ $json = fread($handle, filesize("nextpage.json")); fclose($handle);*/ - $npt = - json_decode( - $this->nextpage->get( - $get["npt"], - "videos" - ), - true + [$npt, $proxy] = + $this->backend->get( + $get["npt"], + "videos" ); + $npt = json_decode($npt, true); + try{ $json = $this->get( + $proxy, "https://www.youtube.com/youtubei/v1/search", [ "key" => $npt["key"], @@ -507,6 +509,7 @@ class youtube{ throw new Exception("Search term is empty!"); } + $proxy = $this->backend->get_ip(); $date = $get["date"]; $type = $get["type"]; $duration = $get["duration"]; @@ -537,6 +540,7 @@ class youtube{ try{ $json = $this->get( + $proxy, "https://www.youtube.com/results", $get ); @@ -942,7 +946,14 @@ class youtube{ if($this->out["npt"] !== null){ - $this->out["npt"] = $this->nextpage->store(json_encode($this->out["npt"]), "videos"); + $this->out["npt"] = + $this->backend->store( + json_encode( + $this->out["npt"] + ), + "videos", + $proxy + ); } return $this->out; |