summary refs log tree commit diff
path: root/scraper
diff options
context:
space:
mode:
Diffstat (limited to 'scraper')
-rw-r--r-- scraper/greppr.php 429
-rw-r--r-- scraper/sc.php 17
2 files changed, 440 insertions, 6 deletions
diff --git a/scraper/greppr.php b/scraper/greppr.php
new file mode 100644
index 0000000..402c3d2
--- /dev/null
+++ b/scraper/greppr.php
@@ -0,0 +1,429 @@
+<?php
+
+class greppr{
+
+ /**
+  * Wire up the scraper's collaborators.
+  *
+  * Loads the backend helper (instantiated with the "greppr" identifier)
+  * and the fuckhtml HTML parser, and stores both on the instance as
+  * $this->backend and $this->fuckhtml.
+  */
+ public function __construct(){
+
+     // include_once instead of include: these files presumably declare
+     // the backend / fuckhtml classes, so pulling them in a second time
+     // (a second instance, or another scraper loaded in the same
+     // request) would be a fatal "cannot declare class" error
+     include_once "lib/backend.php";
+     $this->backend = new backend("greppr");
+
+     include_once "lib/fuckhtml.php";
+     $this->fuckhtml = new fuckhtml();
+ }
+
+ /**
+  * List the search filters this scraper exposes for a page type.
+  *
+  * @param mixed $page Requested page type; it is not consulted.
+  * @return array Always an empty array.
+  */
+ public function getfilters($page){
+
+     $filters = [];
+
+     return $filters;
+ }
+
+ /**
+  * Perform a GET request through the given proxy and return the
+  * response headers together with the body.
+  *
+  * @param mixed        $proxy  Proxy descriptor, handed as-is to
+  *                             $this->backend->assign_proxy().
+  * @param string       $url    Base URL to fetch.
+  * @param array        $get    Query parameters; appended to $url when
+  *                             non-empty.
+  * @param string|false $cookie PHPSESSID value to send, or false to send
+  *                             no Cookie header.
+  * @return array ["headers" => [lowercased name => value], "data" => body]
+  * @throws Exception carrying the cURL error message when the transfer fails.
+  */
+ private function get($proxy, $url, $get = [], $cookie = false){
+
+     $curlproc = curl_init();
+
+     if($get !== []){
+
+         $url .= "?" . http_build_query($get);
+     }
+
+     curl_setopt($curlproc, CURLOPT_URL, $url);
+
+     curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
+
+     // one header list for both cases; the Cookie line is only inserted
+     // when a session id was supplied, at the same position as before
+     $request_headers = [
+         "User-Agent: " . config::USER_AGENT,
+         "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+         "Accept-Language: en-US,en;q=0.5",
+         "Accept-Encoding: gzip"
+     ];
+
+     if($cookie !== false){
+
+         $request_headers[] = "Cookie: PHPSESSID=" . $cookie;
+     }
+
+     $request_headers =
+         array_merge(
+             $request_headers,
+             [
+                 "DNT: 1",
+                 "Connection: keep-alive",
+                 "Upgrade-Insecure-Requests: 1",
+                 "Sec-Fetch-Dest: document",
+                 "Sec-Fetch-Mode: navigate",
+                 "Sec-Fetch-Site: none",
+                 "Sec-Fetch-User: ?1"
+             ]
+         );
+
+     curl_setopt($curlproc, CURLOPT_HTTPHEADER, $request_headers);
+
+     curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
+     curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
+     curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
+     curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
+     curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+
+     $this->backend->assign_proxy($curlproc, $proxy);
+
+     $headers = [];
+
+     curl_setopt(
+         $curlproc,
+         CURLOPT_HEADERFUNCTION,
+         function($curlproc, $header) use (&$headers){
+
+             // cURL requires the callback to report the bytes consumed
+             $len = strlen($header);
+             $parts = explode(':', $header, 2);
+
+             if(count($parts) < 2){
+
+                 // ignore invalid headers
+                 return $len;
+             }
+
+             $name = strtolower(trim($parts[0]));
+             $value = trim($parts[1]);
+
+             if(
+                 $name === "set-cookie" &&
+                 isset($headers[$name])
+             ){
+
+                 // a server may send several Set-Cookie lines; keep all
+                 // of them so PHPSESSID is not lost to a later cookie.
+                 // "; " keeps a /NAME=([^;]+)/ style match bounded
+                 $headers[$name] .= "; " . $value;
+             }else{
+
+                 $headers[$name] = $value;
+             }
+
+             return $len;
+         }
+     );
+
+     $data = curl_exec($curlproc);
+
+     if(curl_errno($curlproc)){
+
+         // grab the message first, then release the handle before throwing
+         $error = curl_error($curlproc);
+         curl_close($curlproc);
+
+         throw new Exception($error);
+     }
+
+     curl_close($curlproc);
+
+     return [
+         "headers" => $headers,
+         "data" => $data
+     ];
+ }
+
+ /**
+  * Run a web search against greppr.org and return the normalised
+  * response object.
+  *
+  * @param array $get           Request parameters: "s" is the search
+  *                             term, "npt" an optional next-page token
+  *                             previously issued through
+  *                             $this->backend->store().
+  * @param bool  $first_attempt When false, cached search tokens are
+  *                             ignored and no retry is made.
+  * @return array Response object; "web" holds the scraped results and
+  *               "npt" the next-page token, or null when there is none.
+  * @throws Exception when the search term is empty, the next-page
+  *                   payload is unusable, or tokens / result page
+  *                   cannot be fetched.
+  */
+ public function web($get, $first_attempt = true){
+
+     $is_next_page = !empty($get["npt"]);
+
+     if($is_next_page){
+
+         [$q, $proxy] = $this->backend->get($get["npt"], "web");
+
+         $q = json_decode($q, true);
+
+         if(!isset($q["q"], $q["s"])){
+
+             throw new Exception("Invalid next page token payload");
+         }
+
+     }else{
+
+         $search = $get["s"];
+         if(strlen($search) === 0){
+
+             throw new Exception("Search term is empty!");
+         }
+
+         $proxy = $this->backend->get_ip();
+     }
+
+     // The retry happens in this loop rather than by calling web()
+     // again, so the next-page token is only resolved once.
+     // NOTE(review): backend->get() may invalidate the token on read
+     // - confirm against lib/backend.php
+     $force_refresh = ($first_attempt === false);
+
+     while(true){
+
+         // get token
+         // token[0] = static token that changes once a day
+         // token[1] = dynamic token that changes on every request
+         // token[2] = PHPSESSID cookie
+         $tokens =
+             $force_refresh ?
+             false :
+             apcu_fetch("greppr_token");
+
+         if($tokens === false){
+
+             // no usable cached token, scrape one off the main page
+             try{
+
+                 $response =
+                     $this->get(
+                         $proxy,
+                         "https://greppr.org",
+                         []
+                     );
+             }catch(Exception $error){
+
+                 throw new Exception("Failed to fetch search tokens");
+             }
+
+             $tokens = $this->parse_token($response);
+
+             if($tokens === false){
+
+                 throw new Exception("Failed to grep search tokens");
+             }
+         }
+
+         if($is_next_page){
+
+             $params = [
+                 $tokens[0] => $q["q"],
+                 "s" => $q["s"],
+                 "l" => 30,
+                 "n" => $tokens[1]
+             ];
+         }else{
+
+             $params = [
+                 $tokens[0] => $search,
+                 "n" => $tokens[1]
+             ];
+         }
+
+         try{
+
+             $searchresults = $this->get(
+                 $proxy,
+                 "https://greppr.org/search",
+                 $params,
+                 $tokens[2]
+             );
+         }catch(Exception $error){
+
+             throw new Exception("Failed to fetch search page");
+         }
+
+         if(strlen($searchresults["data"]) !== 0){
+
+             break;
+         }
+
+         // redirected to main page, which means we got old token
+         // generate a new one
+
+         // ... unless we just tried to do that
+         if($force_refresh){
+
+             throw new Exception("Failed to get a new search token");
+         }
+
+         $force_refresh = true;
+     }
+
+     // refresh the token with new data (this also triggers fuckhtml load)
+     $this->parse_token($searchresults, $tokens[2]);
+
+     // response object
+     $out = [
+         "status" => "ok",
+         "spelling" => [
+             "type" => "no_correction",
+             "using" => null,
+             "correction" => null
+         ],
+         "npt" => null,
+         "answer" => [],
+         "web" => [],
+         "image" => [],
+         "video" => [],
+         "news" => [],
+         "related" => []
+     ];
+
+     // collect the result nodes first: looking at the pagination below
+     // loads a different fragment into the parser
+     $results =
+         $this->fuckhtml
+         ->getElementsByClassName(
+             "result",
+             "div"
+         );
+
+     // check for next page
+     $out["npt"] = $this->web_next_page($proxy);
+
+     // scrape results
+     foreach($results as $result){
+
+         $entry = $this->web_result($result);
+
+         if($entry !== null){
+
+             $out["web"][] = $entry;
+         }
+     }
+
+     return $out;
+ }
+
+ /**
+  * Find the pagination link that follows the current page (the one
+  * whose href is "#") in the loaded document and store its query and
+  * offset as a next-page token.
+  *
+  * @param mixed $proxy Proxy to associate with the stored token.
+  * @return mixed Token returned by $this->backend->store(), or null when
+  *               there is no usable next-page link.
+  */
+ private function web_next_page($proxy){
+
+     $pagination =
+         $this->fuckhtml
+         ->getElementsByClassName(
+             "pagination",
+             "ul"
+         );
+
+     if(count($pagination) === 0){
+
+         return null;
+     }
+
+     $this->fuckhtml->load($pagination[0]);
+
+     $links =
+         $this->fuckhtml
+         ->getElementsByClassName(
+             "page-link",
+             "a"
+         );
+
+     $passed_current = false;
+     foreach($links as $link){
+
+         // anchors without an href are treated as having an empty one
+         $href = $link["attributes"]["href"] ?? "";
+
+         if($passed_current === true){
+
+             parse_str(
+                 $this->fuckhtml
+                 ->getTextContent(
+                     $href
+                 ),
+                 $values
+             );
+
+             // positional: first value is the query, second the offset
+             $values = array_values($values);
+
+             if(count($values) < 2){
+
+                 // link does not carry what we need, no next page
+                 return null;
+             }
+
+             return
+                 $this->backend->store(
+                     json_encode(
+                         [
+                             "q" => $values[0],
+                             "s" => $values[1]
+                         ]
+                     ),
+                     "web",
+                     $proxy
+                 );
+         }
+
+         if($href === "#"){
+
+             $passed_current = true;
+         }
+     }
+
+     return null;
+ }
+
+ /**
+  * Convert one "result" node into a web result entry.
+  *
+  * @param mixed $result Result node as returned by fuckhtml.
+  * @return array|null The entry, or null when the node has no link.
+  */
+ private function web_result($result){
+
+     $this->fuckhtml->load($result);
+
+     $anchors =
+         $this->fuckhtml
+         ->getElementsByTagName(
+             "a"
+         );
+
+     if(
+         count($anchors) === 0 ||
+         !isset($anchors[0]["attributes"]["href"])
+     ){
+
+         // nothing to link to, not a usable result
+         return null;
+     }
+
+     $a = $anchors[0];
+
+     $description =
+         $this->fuckhtml
+         ->getElementsByFuzzyAttributeValue(
+             "style",
+             "color:#777777;",
+             "p"
+         );
+
+     if(count($description) === 0){
+
+         $description = null;
+     }else{
+
+         $description =
+             $this->fuckhtml
+             ->getTextContent(
+                 $description[0]
+             );
+     }
+
+     // the date is read from the last paragraph, after "Added:"
+     $date = null;
+
+     $paragraphs =
+         $this->fuckhtml
+         ->getElementsByTagName(
+             "p"
+         );
+
+     if(count($paragraphs) !== 0){
+
+         $parts =
+             explode(
+                 "Added:",
+                 $this->fuckhtml
+                 ->getTextContent(
+                     $paragraphs[count($paragraphs) - 1]["innerHTML"]
+                 )
+             );
+
+         if(isset($parts[1])){
+
+             $timestamp = strtotime($parts[1]);
+
+             // strtotime() yields false on unparsable input; report
+             // "no date" instead of leaking false to the caller
+             if($timestamp !== false){
+
+                 $date = $timestamp;
+             }
+         }
+     }
+
+     return [
+         "title" =>
+             $this->fuckhtml
+             ->getTextContent(
+                 $a["innerHTML"]
+             ),
+         "description" => $description,
+         "url" =>
+             $this->fuckhtml
+             ->getTextContent(
+                 $a["attributes"]["href"]
+             ),
+         "date" => $date,
+         "type" => "web",
+         "thumb" => [
+             "url" => null,
+             "ratio" => null
+         ],
+         "sublink" => [],
+         "table" => []
+     ];
+ }
+
+ /**
+  * Extract the search tokens from a greppr page and cache them.
+  *
+  * Loads the response body into fuckhtml, looks through the <script>
+  * tags for the "window.location = '/search?...&n=...'" redirect and
+  * stores [parameter name, "n" value, PHPSESSID] in APCu under
+  * "greppr_token".
+  *
+  * @param array        $response Result of get(): "data" and "headers".
+  * @param string|false $cookie   Session id to reuse; when false it is
+  *                               read from the response's set-cookie
+  *                               header instead.
+  * @return array|false The three tokens, or false when the redirect or
+  *                     the session cookie could not be found.
+  */
+ private function parse_token($response, $cookie = false){
+
+     $this->fuckhtml->load($response["data"]);
+
+     $script_tags =
+         $this->fuckhtml
+         ->getElementsByTagName("script");
+
+     // stays null unless one of the scripts holds the redirect
+     $redirect = null;
+
+     foreach($script_tags as $script_tag){
+
+         preg_match(
+             '/window\.location ?= ?\'\/search\?([^=]+).*&n=([0-9]+)/',
+             $script_tag["innerHTML"],
+             $match
+         );
+
+         if(isset($match[1])){
+
+             $redirect = $match;
+             break;
+         }
+     }
+
+     if($redirect === null){
+
+         return false;
+     }
+
+     if($cookie === false){
+
+         // no session id supplied by the caller, take the server's
+         if(!isset($response["headers"]["set-cookie"])){
+
+             // server didn't send a cookie
+             return false;
+         }
+
+         preg_match(
+             '/PHPSESSID=([^;]+)/',
+             $response["headers"]["set-cookie"],
+             $session
+         );
+
+         if(!isset($session[1])){
+
+             // server sent an unexpected cookie
+             return false;
+         }
+
+         $cookie = $session[1];
+     }
+
+     $tokens = [
+         $redirect[1],
+         $redirect[2],
+         $cookie
+     ];
+
+     apcu_store("greppr_token", $tokens);
+
+     return $tokens;
+ }
+}
diff --git a/scraper/sc.php b/scraper/sc.php
index e2e7385..2b847c7 100644
--- a/scraper/sc.php
+++ b/scraper/sc.php
@@ -398,12 +398,17 @@ class sc{
if($token === false){
- $js =
- $this->get(
- $proxy,
- "https://a-v2.sndcdn.com/assets/1-c3e4038d.js",
- []
- );
+ try{
+ $js =
+ $this->get(
+ $proxy,
+ "https://a-v2.sndcdn.com/assets/1-c3e4038d.js",
+ []
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch search token");
+ }
preg_match(
'/client_id=([^"]+)/',