diff options
Diffstat (limited to 'scraper')
-rw-r--r-- | scraper/greppr.php | 429 | ||||
-rw-r--r-- | scraper/sc.php | 17 |
2 files changed, 440 insertions, 6 deletions
diff --git a/scraper/greppr.php b/scraper/greppr.php new file mode 100644 index 0000000..402c3d2 --- /dev/null +++ b/scraper/greppr.php @@ -0,0 +1,429 @@ +<?php + +class greppr{ + + public function __construct(){ + + include "lib/backend.php"; + $this->backend = new backend("greppr"); + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); + } + + public function getfilters($page){ + + return []; + } + + private function get($proxy, $url, $get = [], $cookie = false){ + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + + if($cookie === false){ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1"] + ); + }else{ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "Cookie: PHPSESSID=" . $cookie, + "DNT: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1"] + ); + } + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); + + $headers = []; + + curl_setopt( + $curlproc, + CURLOPT_HEADERFUNCTION, + function($curlproc, $header) use (&$headers){ + + $len = strlen($header); + $header = explode(':', $header, 2); + + if(count($header) < 2){ + + // ignore invalid headers + return $len; + } + + $headers[strtolower(trim($header[0]))] = trim($header[1]); + + return $len; + } + ); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + + return [ + "headers" => $headers, + "data" => $data + ]; + } + + public function web($get, $first_attempt = true){ + + if($get["npt"]){ + + [$q, $proxy] = $this->backend->get($get["npt"], "web"); + + $q = json_decode($q, true); + + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); + } + + // get token + // token[0] = static token that changes once a day + // token[1] = dynamic token that changes on every request + // token[1] = PHPSESSID cookie + $tokens = apcu_fetch("greppr_token"); + + if( + $tokens === false || + $first_attempt === false // force token fetch + ){ + + // we haven't gotten the token yet, get it + try{ + + $response = + $this->get( + $proxy, + "https://greppr.org", + [] + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search tokens"); + } + + $tokens = $this->parse_token($response); + + if($tokens === false){ + + throw new Exception("Failed to grep search tokens"); + } + } + + try{ + + if($get["npt"]){ + + $params = [ + $tokens[0] => $q["q"], + "s" => $q["s"], + "l" => 30, + "n" => $tokens[1] + ]; + }else{ + + $params = [ + $tokens[0] => $search, + "n" => $tokens[1] + ]; + } + + $searchresults = $this->get( + $proxy, + "https://greppr.org/search", + $params, + $tokens[2] + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + + if(strlen($searchresults["data"]) === 0){ + + // redirected to main page, which means we got old token + // generate a new one + + // ... unless we just tried to do that + if($first_attempt === false){ + + throw new Exception("Failed to get a new search token"); + } + + $this->get($get, false); + } + + // refresh the token with new data (this also triggers fuckhtml load) + $this->parse_token($searchresults, $tokens[2]); + + // response object + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + // get results for later + $results = + $this->fuckhtml + ->getElementsByClassName( + "result", + "div" + ); + + // check for next page + $next_elem = + $this->fuckhtml + ->getElementsByClassName( + "pagination", + "ul" + ); + + if(count($next_elem) !== 0){ + + $this->fuckhtml->load($next_elem[0]); + + $as = + $this->fuckhtml + ->getElementsByClassName( + "page-link", + "a" + ); + + $break = false; + foreach($as as $a){ + + if($break === true){ + + parse_str( + $this->fuckhtml + ->getTextContent( + $a["attributes"]["href"] + ), + $values + ); + + $values = array_values($values); + + $out["npt"] = + $this->backend->store( + json_encode( + [ + "q" => $values[0], + "s" => $values[1] + ] + ), + "web", + $proxy + ); + break; + } + + if($a["attributes"]["href"] == "#"){ + + $break = true; + } + } + } + + // scrape results + foreach($results as $result){ + + $this->fuckhtml->load($result); + + $a = + $this->fuckhtml + ->getElementsByTagName( + "a" + )[0]; + + $description = + $this->fuckhtml + ->getElementsByFuzzyAttributeValue( + "style", + "color:#777777;", + "p" + ); + + if(count($description) === 0){ + + $description = null; + }else{ + + $description = + $this->fuckhtml + ->getTextContent( + $description[0] + ); + } + + $date = + $this->fuckhtml + ->getElementsByTagName( + "p" + ); + + $date = + strtotime( + explode( + "Added:", + $this->fuckhtml + ->getTextContent( + $date[count($date) - 1]["innerHTML"] + ) + )[1] + ); + + $out["web"][] = [ + "title" => + $this->fuckhtml + ->getTextContent( + $a["innerHTML"] + ), + "description" => $description, + "url" => + $this->fuckhtml + ->getTextContent( + $a["attributes"]["href"] + ), + "date" => $date, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + } + + return $out; + } + + private function parse_token($response, $cookie = false){ + + $this->fuckhtml->load($response["data"]); + + $scripts = + $this->fuckhtml + ->getElementsByTagName("script"); + + $found = false; + foreach($scripts as $script){ + + preg_match( + '/window\.location ?= ?\'\/search\?([^=]+).*&n=([0-9]+)/', + $script["innerHTML"], + $tokens + ); + + if(isset($tokens[1])){ + + $found = true; + break; + } + } + + if($found === false){ + + return false; + } + + $tokens = [ + $tokens[1], + $tokens[2] + ]; + + if($cookie !== false){ + + // we already specified a cookie, so use the one we have already + $tokens[] = $cookie; + apcu_store("greppr_token", $tokens); + + return $tokens; + } + + if(!isset($response["headers"]["set-cookie"])){ + + // server didn't send a cookie + return false; + } + + // get cookie + preg_match( + '/PHPSESSID=([^;]+)/', + $response["headers"]["set-cookie"], + $cookie + ); + + if(!isset($cookie[1])){ + + // server sent an unexpected cookie + return false; + } + + $tokens[] = $cookie[1]; + apcu_store("greppr_token", $tokens); + + return $tokens; + } +} diff --git a/scraper/sc.php b/scraper/sc.php index e2e7385..2b847c7 100644 --- a/scraper/sc.php +++ b/scraper/sc.php @@ -398,12 +398,17 @@ class sc{ if($token === false){ - $js = - $this->get( - $proxy, - "https://a-v2.sndcdn.com/assets/1-c3e4038d.js", - [] - ); + try{ + $js = + $this->get( + $proxy, + "https://a-v2.sndcdn.com/assets/1-c3e4038d.js", + [] + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search token"); + } preg_match( '/client_id=([^"]+)/', |