summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorlolcat <will@lolcat.ca>2024-05-16 17:22:49 -0400
committerlolcat <will@lolcat.ca>2024-05-16 17:22:49 -0400
commit0d98d7839d1c3da75b95ef29ce12ef54a2a20094 (patch)
treec51d5a0dcfc99d91211b65ed9692974305a72c90
parentf8d46df1e858401d93c5fa885777113994a03c86 (diff)
added greppr support also btw im not dead
-rw-r--r--README.md11
-rw-r--r--api/v1/ac.php12
-rw-r--r--api/v1/images.php2
-rw-r--r--api/v1/music.php2
-rw-r--r--api/v1/news.php2
-rw-r--r--api/v1/videos.php2
-rw-r--r--api/v1/web.php2
-rw-r--r--data/config.php3
-rw-r--r--lib/backend.php6
-rw-r--r--lib/frontend.php1
-rw-r--r--lib/fuckhtml.php4
-rw-r--r--scraper/greppr.php429
-rw-r--r--scraper/sc.php17
-rw-r--r--settings.php4
14 files changed, 469 insertions, 28 deletions
diff --git a/README.md b/README.md
index 38ebe28..6cc82a7 100644
--- a/README.md
+++ b/README.md
@@ -36,11 +36,12 @@ tl;dr the best way to actually browse for shit.
| Brave | Brave | DuckDuckGo | Brave | | DuckDuckGo |
| Yandex | Yandex | Brave | Google | | Yandex |
| Google | Google | Yandex | Qwant | | Google |
-| Qwant | Qwant | Google | Mojeek | | Yep |
-| Yep | Pinterest | Qwant | | | Marginalia |
-| Crowdview | Yep | | | | YouTube |
-| Mwmbl | Imgur | | | | Soundcloud |
-| Mojeek | FindThatMeme | | | | |
+| Qwant | Qwant | Google | Mojeek | | Qwant |
+| Yep | Yep | Qwant | | | Yep |
+| Greppr | Imgur | | | | Marginalia |
+| Crowdview | FindThatMeme | | | | YouTube |
+| Mwmbl | | | | | Soundcloud |
+| Mojeek | | | | | |
| Marginalia | | | | | |
| wiby | | | | | |
| Curlie | | | | | |
diff --git a/api/v1/ac.php b/api/v1/ac.php
index 9d9f534..236dc7b 100644
--- a/api/v1/ac.php
+++ b/api/v1/ac.php
@@ -100,7 +100,7 @@ class autocomplete{
$_GET["s"],
$json
],
- JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
+ JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE
);
break;
@@ -135,7 +135,7 @@ class autocomplete{
$_GET["s"],
$json
],
- JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
+ JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE
);
break;
@@ -154,7 +154,7 @@ class autocomplete{
$_GET["s"],
$json
],
- JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
+ JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE
);
break;
@@ -167,7 +167,7 @@ class autocomplete{
$_GET["s"],
$json[1] // ensure it contains valid key 0
],
- JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
+ JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE
);
break;
}
@@ -221,7 +221,7 @@ class autocomplete{
echo json_encode(
["error" => $error],
- JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
+ JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE
);
die();
}
@@ -233,7 +233,7 @@ class autocomplete{
$_GET["s"],
[]
],
- JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
+ JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE
);
die();
}
diff --git a/api/v1/images.php b/api/v1/images.php
index 348dda7..de2c5a9 100644
--- a/api/v1/images.php
+++ b/api/v1/images.php
@@ -30,7 +30,7 @@ $get = $frontend->parsegetfilters($_GET, $filters);
try{
echo json_encode(
$scraper->image($get),
- JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
+ JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE
);
}catch(Exception $e){
diff --git a/api/v1/music.php b/api/v1/music.php
index a1359eb..58985e3 100644
--- a/api/v1/music.php
+++ b/api/v1/music.php
@@ -30,7 +30,7 @@ $get = $frontend->parsegetfilters($_GET, $filters);
try{
echo json_encode(
$scraper->music($get),
- JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
+ JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE
);
}catch(Exception $e){
diff --git a/api/v1/news.php b/api/v1/news.php
index ca11b13..ab38781 100644
--- a/api/v1/news.php
+++ b/api/v1/news.php
@@ -30,7 +30,7 @@ $get = $frontend->parsegetfilters($_GET, $filters);
try{
echo json_encode(
$scraper->news($get),
- JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
+ JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE
);
}catch(Exception $e){
diff --git a/api/v1/videos.php b/api/v1/videos.php
index c0a7507..1d23780 100644
--- a/api/v1/videos.php
+++ b/api/v1/videos.php
@@ -30,7 +30,7 @@ $get = $frontend->parsegetfilters($_GET, $filters);
try{
echo json_encode(
$scraper->video($get),
- JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
+ JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE
);
}catch(Exception $e){
diff --git a/api/v1/web.php b/api/v1/web.php
index df5cec1..6a9c030 100644
--- a/api/v1/web.php
+++ b/api/v1/web.php
@@ -43,7 +43,7 @@ try{
echo
json_encode(
$scraper->web($get),
- JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
+ JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE
);
}catch(Exception $e){
diff --git a/data/config.php b/data/config.php
index 42a968a..13be0f4 100644
--- a/data/config.php
+++ b/data/config.php
@@ -43,7 +43,7 @@ class config{
// If this regex expression matches on the user agent, it blocks the request
// Not useful at all against a targetted attack
- const HEADER_REGEX = '/bot|wget|curl|python-requests|scrapy|go-http-client|ruby|yahoo|spider/i';
+ const HEADER_REGEX = '/bot|wget|curl|python-requests|scrapy|go-http-client|ruby|yahoo|spider|qwant/i';
// Block clients who present any of the following headers in their request (SPECIFY IN !!lowercase!!)
// Eg: ["x-forwarded-for", "x-via", "forwarded-for", "via"];
@@ -128,6 +128,7 @@ class config{
const PROXY_PINTEREST = false;
const PROXY_SEZNAM = false;
const PROXY_NAVER = false;
+ const PROXY_GREPPR = false;
const PROXY_CROWDVIEW = false;
const PROXY_MWMBL = false;
const PROXY_FTM = false; // findthatmeme
diff --git a/lib/backend.php b/lib/backend.php
index 7631ff3..cfb04a9 100644
--- a/lib/backend.php
+++ b/lib/backend.php
@@ -36,7 +36,7 @@ class backend{
}
// this function is also called directly on nextpage
- public function assign_proxy(&$curlproc, $ip){
+ public function assign_proxy(&$curlproc, string $ip){
// parse proxy line
[
@@ -91,7 +91,7 @@ class backend{
/*
Next page stuff
*/
- public function store($payload, $page, $proxy){
+ public function store(string $payload, string $page, string $proxy){
$key = sodium_crypto_secretbox_keygen();
$nonce = random_bytes(SODIUM_CRYPTO_SECRETBOX_NONCEBYTES);
@@ -120,7 +120,7 @@ class backend{
rtrim(strtr(base64_encode($key), '+/', '-_'), '=');
}
- public function get($npt, $page){
+ public function get(string $npt, string $page){
$page = $page[0];
$explode = explode(".", $npt, 2);
diff --git a/lib/frontend.php b/lib/frontend.php
index a48b722..1c3eb09 100644
--- a/lib/frontend.php
+++ b/lib/frontend.php
@@ -925,6 +925,7 @@ class frontend{
"google" => "Google",
"qwant" => "Qwant",
"yep" => "Yep",
+ "greppr" => "Greppr",
"crowdview" => "Crowdview",
"mwmbl" => "Mwmbl",
"mojeek" => "Mojeek",
diff --git a/lib/fuckhtml.php b/lib/fuckhtml.php
index 6895fbf..f3a6efe 100644
--- a/lib/fuckhtml.php
+++ b/lib/fuckhtml.php
@@ -321,11 +321,11 @@ class fuckhtml{
throw new Exception("(getTextContent) Supplied array doesn't contain an innerHTML index");
}
+
$html = $html["innerHTML"];
}
- $html =
- preg_split('/\n|<\/?br>/i', $html);
+ $html = preg_split('/\n|<\/?br>/i', $html);
$out = "";
for($i=0; $i<count($html); $i++){
diff --git a/scraper/greppr.php b/scraper/greppr.php
new file mode 100644
index 0000000..402c3d2
--- /dev/null
+++ b/scraper/greppr.php
@@ -0,0 +1,429 @@
+<?php
+
class greppr{

	// Project-local helpers, assigned in the constructor.
	// Declared explicitly: dynamic properties are deprecated as of PHP 8.2.
	public $backend;  // proxy/next-page-token backend, keyed "greppr"
	public $fuckhtml; // lightweight HTML parser

	public function __construct(){

		include "lib/backend.php";
		$this->backend = new backend("greppr");

		include "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
	}

	/**
	 * Greppr exposes no search filters; always returns an empty set.
	 */
	public function getfilters($page){

		return [];
	}

	/**
	 * Perform an HTTP GET through the assigned proxy.
	 *
	 * @param string       $proxy  proxy line understood by backend::assign_proxy
	 * @param string       $url    base URL (query string appended from $get)
	 * @param array        $get    query parameters
	 * @param string|false $cookie PHPSESSID value, or false to send no cookie
	 *
	 * @return array{headers: array, data: string} lowercased response headers + body
	 * @throws Exception on any curl transport error
	 */
	private function get($proxy, $url, $get = [], $cookie = false){

		$curlproc = curl_init();

		if($get !== []){
			$get = http_build_query($get);
			$url .= "?" . $get;
		}

		curl_setopt($curlproc, CURLOPT_URL, $url);

		// empty string = accept every encoding curl supports, decode automatically
		curl_setopt($curlproc, CURLOPT_ENCODING, "");

		if($cookie === false){

			curl_setopt($curlproc, CURLOPT_HTTPHEADER,
				["User-Agent: " . config::USER_AGENT,
				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
				"Accept-Language: en-US,en;q=0.5",
				"Accept-Encoding: gzip",
				"DNT: 1",
				"Connection: keep-alive",
				"Upgrade-Insecure-Requests: 1",
				"Sec-Fetch-Dest: document",
				"Sec-Fetch-Mode: navigate",
				"Sec-Fetch-Site: none",
				"Sec-Fetch-User: ?1"]
			);
		}else{

			// same headers, plus the session cookie greppr ties its tokens to
			curl_setopt($curlproc, CURLOPT_HTTPHEADER,
				["User-Agent: " . config::USER_AGENT,
				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
				"Accept-Language: en-US,en;q=0.5",
				"Accept-Encoding: gzip",
				"Cookie: PHPSESSID=" . $cookie,
				"DNT: 1",
				"Connection: keep-alive",
				"Upgrade-Insecure-Requests: 1",
				"Sec-Fetch-Dest: document",
				"Sec-Fetch-Mode: navigate",
				"Sec-Fetch-Site: none",
				"Sec-Fetch-User: ?1"]
			);
		}

		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

		$this->backend->assign_proxy($curlproc, $proxy);

		// collect response headers; we need set-cookie to capture PHPSESSID
		$headers = [];

		curl_setopt(
			$curlproc,
			CURLOPT_HEADERFUNCTION,
			function($curlproc, $header) use (&$headers){

				$len = strlen($header);
				$header = explode(':', $header, 2);

				if(count($header) < 2){

					// ignore invalid headers
					return $len;
				}

				$headers[strtolower(trim($header[0]))] = trim($header[1]);

				return $len;
			}
		);

		$data = curl_exec($curlproc);

		if(curl_errno($curlproc)){

			throw new Exception(curl_error($curlproc));
		}

		curl_close($curlproc);

		return [
			"headers" => $headers,
			"data" => $data
		];
	}

	/**
	 * Run a web search against greppr.org.
	 *
	 * @param array $get           parsed GET parameters ("s" = query, "npt" = next page token)
	 * @param bool  $first_attempt internal: false when retrying after a token refresh,
	 *                             prevents infinite retry loops
	 *
	 * @return array standard scraper response object
	 * @throws Exception when tokens or the search page cannot be fetched
	 */
	public function web($get, $first_attempt = true){

		if($get["npt"]){

			// restore query + pagination state stashed by a previous request
			[$q, $proxy] = $this->backend->get($get["npt"], "web");

			$q = json_decode($q, true);

		}else{

			$search = $get["s"];
			if(strlen($search) === 0){

				throw new Exception("Search term is empty!");
			}

			$proxy = $this->backend->get_ip();
		}

		// get token
		// token[0] = static token that changes once a day
		// token[1] = dynamic token that changes on every request
		// token[2] = PHPSESSID cookie
		$tokens = apcu_fetch("greppr_token");

		if(
			$tokens === false ||
			$first_attempt === false // force token fetch
		){

			// we haven't gotten the token yet, get it
			try{

				$response =
					$this->get(
						$proxy,
						"https://greppr.org",
						[]
					);
			}catch(Exception $error){

				throw new Exception("Failed to fetch search tokens");
			}

			$tokens = $this->parse_token($response);

			if($tokens === false){

				throw new Exception("Failed to grep search tokens");
			}
		}

		try{

			if($get["npt"]){

				$params = [
					$tokens[0] => $q["q"],
					"s" => $q["s"],
					"l" => 30,
					"n" => $tokens[1]
				];
			}else{

				$params = [
					$tokens[0] => $search,
					"n" => $tokens[1]
				];
			}

			$searchresults = $this->get(
				$proxy,
				"https://greppr.org/search",
				$params,
				$tokens[2]
			);
		}catch(Exception $error){

			throw new Exception("Failed to fetch search page");
		}

		if(strlen($searchresults["data"]) === 0){

			// redirected to main page, which means we got old token
			// generate a new one

			// ... unless we just tried to do that
			if($first_attempt === false){

				throw new Exception("Failed to get a new search token");
			}

			// BUGFIX: was $this->get($get, false), which passed the GET array
			// as a proxy line to the HTTP helper and discarded the result.
			// Retry the whole search once with a forced token refresh.
			return $this->web($get, false);
		}

		// refresh the token with new data (this also triggers fuckhtml load)
		$this->parse_token($searchresults, $tokens[2]);

		// response object
		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null,
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];

		// get results for later
		$results =
			$this->fuckhtml
			->getElementsByClassName(
				"result",
				"div"
			);

		// check for next page
		$next_elem =
			$this->fuckhtml
			->getElementsByClassName(
				"pagination",
				"ul"
			);

		if(count($next_elem) !== 0){

			$this->fuckhtml->load($next_elem[0]);

			$as =
				$this->fuckhtml
				->getElementsByClassName(
					"page-link",
					"a"
				);

			// the link right after the "#" (current page) entry is the next page;
			// stash its q/s parameters so the npt can reconstruct the request
			$break = false;
			foreach($as as $a){

				if($break === true){

					parse_str(
						$this->fuckhtml
						->getTextContent(
							$a["attributes"]["href"]
						),
						$values
					);

					$values = array_values($values);

					$out["npt"] =
						$this->backend->store(
							json_encode(
								[
									"q" => $values[0],
									"s" => $values[1]
								]
							),
							"web",
							$proxy
						);
					break;
				}

				if($a["attributes"]["href"] == "#"){

					$break = true;
				}
			}
		}

		// scrape results
		foreach($results as $result){

			$this->fuckhtml->load($result);

			// first anchor holds both the title and the target URL
			$a =
				$this->fuckhtml
				->getElementsByTagName(
					"a"
				)[0];

			$description =
				$this->fuckhtml
				->getElementsByFuzzyAttributeValue(
					"style",
					"color:#777777;",
					"p"
				);

			if(count($description) === 0){

				$description = null;
			}else{

				$description =
					$this->fuckhtml
					->getTextContent(
						$description[0]
					);
			}

			// last <p> in the result reads "Added: <date>"
			$date =
				$this->fuckhtml
				->getElementsByTagName(
					"p"
				);

			$date =
				strtotime(
					explode(
						"Added:",
						$this->fuckhtml
						->getTextContent(
							$date[count($date) - 1]["innerHTML"]
						)
					)[1]
				);

			$out["web"][] = [
				"title" =>
					$this->fuckhtml
					->getTextContent(
						$a["innerHTML"]
					),
				"description" => $description,
				"url" =>
					$this->fuckhtml
					->getTextContent(
						$a["attributes"]["href"]
					),
				"date" => $date,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => [],
				"table" => []
			];
		}

		return $out;
	}

	/**
	 * Extract the search tokens from a greppr page and cache them in APCu.
	 *
	 * The homepage embeds a script of the form
	 *   window.location = '/search?<static-token>=...&n=<dynamic-token>'
	 * from which we pull token[0] (static) and token[1] (dynamic); token[2]
	 * is the PHPSESSID cookie the server tied them to.
	 *
	 * @param array        $response ["headers" => ..., "data" => ...] from get()
	 * @param string|false $cookie   reuse this PHPSESSID instead of reading set-cookie
	 *
	 * @return array|false [static, dynamic, cookie] on success, false when the
	 *                     tokens or cookie cannot be found
	 */
	private function parse_token($response, $cookie = false){

		$this->fuckhtml->load($response["data"]);

		$scripts =
			$this->fuckhtml
			->getElementsByTagName("script");

		$found = false;
		foreach($scripts as $script){

			preg_match(
				'/window\.location ?= ?\'\/search\?([^=]+).*&n=([0-9]+)/',
				$script["innerHTML"],
				$tokens
			);

			if(isset($tokens[1])){

				$found = true;
				break;
			}
		}

		if($found === false){

			return false;
		}

		$tokens = [
			$tokens[1],
			$tokens[2]
		];

		if($cookie !== false){

			// we already specified a cookie, so use the one we have already
			$tokens[] = $cookie;
			apcu_store("greppr_token", $tokens);

			return $tokens;
		}

		if(!isset($response["headers"]["set-cookie"])){

			// server didn't send a cookie
			return false;
		}

		// get cookie
		preg_match(
			'/PHPSESSID=([^;]+)/',
			$response["headers"]["set-cookie"],
			$cookie
		);

		if(!isset($cookie[1])){

			// server sent an unexpected cookie
			return false;
		}

		$tokens[] = $cookie[1];
		apcu_store("greppr_token", $tokens);

		return $tokens;
	}
}
diff --git a/scraper/sc.php b/scraper/sc.php
index e2e7385..2b847c7 100644
--- a/scraper/sc.php
+++ b/scraper/sc.php
@@ -398,12 +398,17 @@ class sc{
if($token === false){
- $js =
- $this->get(
- $proxy,
- "https://a-v2.sndcdn.com/assets/1-c3e4038d.js",
- []
- );
+ try{
+ $js =
+ $this->get(
+ $proxy,
+ "https://a-v2.sndcdn.com/assets/1-c3e4038d.js",
+ []
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch search token");
+ }
preg_match(
'/client_id=([^"]+)/',
diff --git a/settings.php b/settings.php
index 662189c..33185e9 100644
--- a/settings.php
+++ b/settings.php
@@ -130,6 +130,10 @@ $settings = [
"text" => "Yep"
],
[
+ "value" => "greppr",
+ "text" => "Greppr"
+ ],
+ [
"value" => "crowdview",
"text" => "Crowdview"
],