summaryrefslogtreecommitdiff
path: root/scraper/marginalia.php
diff options
context:
space:
mode:
authorlolcat <will@lolcat.ca>2023-07-22 14:41:14 -0400
committerlolcat <will@lolcat.ca>2023-07-22 14:41:14 -0400
commitbca265aea67ec62499aaa113a6490ce9ec7fe730 (patch)
tree3f05ec5ea542e41b474947e180034f42e99648e9 /scraper/marginalia.php
still missing things on google scraper
Diffstat (limited to 'scraper/marginalia.php')
-rw-r--r--scraper/marginalia.php242
1 files changed, 242 insertions, 0 deletions
diff --git a/scraper/marginalia.php b/scraper/marginalia.php
new file mode 100644
index 0000000..c8ab09f
--- /dev/null
+++ b/scraper/marginalia.php
@@ -0,0 +1,242 @@
+<?php
+
+/**
+ * Scraper for the Marginalia search API at api.marginalia.nu.
+ *
+ * Describes the filters offered for the "web" page and converts API
+ * responses into the result array assembled in web().
+ */
+class marginalia{
+
+    /**
+     * API key placed in the request path. Declared here because creating
+     * properties dynamically is deprecated as of PHP 8.2.
+     */
+    public $key;
+
+    public function __construct(){
+
+        $this->key = "public";
+    }
+
+    /**
+     * Return the filter definitions for a search page.
+     *
+     * @param string $page Page identifier; only "web" is handled.
+     * @return array|null Filter name => ["display" => ..., "option" => ...],
+     *                    or null when $page is not handled.
+     */
+    public function getfilters($page){
+
+        switch($page){
+
+            case "web":
+                return [
+                    "profile" => [
+                        "display" => "Profile",
+                        "option" => [
+                            "any" => "Default",
+                            "modern" => "Modern"
+                        ]
+                    ],
+                    "format" => [
+                        "display" => "Format",
+                        "option" => [
+                            "any" => "Any",
+                            "html5" => "html5",
+                            "xhtml" => "xhtml",
+                            "html123" => "html123"
+                        ]
+                    ],
+                    "file" => [
+                        "display" => "File",
+                        "option" => [
+                            "any" => "Any",
+                            "nomedia" => "Deny media",
+                            "media" => "Contains media",
+                            "audio" => "Contains audio",
+                            "video" => "Contains video",
+                            "archive" => "Contains archive",
+                            "document" => "Contains document"
+                        ]
+                    ],
+                    "javascript" => [
+                        "display" => "Javascript",
+                        "option" => [
+                            "any" => "Allow JS",
+                            "deny" => "Deny JS",
+                            "require" => "Require JS"
+                        ]
+                    ],
+                    "trackers" => [
+                        "display" => "Trackers",
+                        "option" => [
+                            "any" => "Allow trackers",
+                            "deny" => "Deny trackers",
+                            "require" => "Require trackers"
+                        ]
+                    ],
+                    "cookies" => [
+                        "display" => "Cookies",
+                        "option" => [
+                            "any" => "Allow cookies",
+                            "deny" => "Deny cookies",
+                            "require" => "Require cookies"
+                        ]
+                    ],
+                    "affiliate" => [
+                        "display" => "Affiliate links in body",
+                        "option" => [
+                            "any" => "Allow affiliate links",
+                            "deny" => "Deny affiliate links",
+                            "require" => "Require affiliate links"
+                        ]
+                    ]
+                ];
+        }
+
+        // unhandled page: same null the implicit fall-through produced
+        return null;
+    }
+
+    /**
+     * Perform an HTTP GET request with curl and return the response body.
+     *
+     * @param string $url Target URL without a query string.
+     * @param array $get Query parameters appended to $url when not empty.
+     * @return string Response body.
+     * @throws Exception Carrying the curl error message when the transfer fails.
+     */
+    private function get($url, $get = []){
+
+        $headers = [
+            "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0",
+            "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+            "Accept-Language: en-US,en;q=0.5",
+            "Accept-Encoding: gzip",
+            "DNT: 1",
+            "Connection: keep-alive",
+            "Upgrade-Insecure-Requests: 1",
+            "Sec-Fetch-Dest: document",
+            "Sec-Fetch-Mode: navigate",
+            "Sec-Fetch-Site: none",
+            "Sec-Fetch-User: ?1"
+        ];
+
+        if($get !== []){
+
+            $url .= "?" . http_build_query($get);
+        }
+
+        $curlproc = curl_init();
+
+        curl_setopt($curlproc, CURLOPT_URL, $url);
+
+        curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
+        curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
+
+        curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
+        curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
+        curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
+        curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
+        curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+
+        $data = curl_exec($curlproc);
+
+        // capture the error state before closing, so the handle is
+        // released on the failure path as well as on success
+        $errno = curl_errno($curlproc);
+        $error = curl_error($curlproc);
+        curl_close($curlproc);
+
+        if($errno !== 0 || $data === false){
+
+            throw new Exception($error);
+        }
+
+        return $data;
+    }
+
+    /**
+     * Read one request parameter, falling back when it is absent.
+     *
+     * @param array $get Request parameters.
+     * @param string $name Parameter name.
+     * @param mixed $default Value returned when $name is not set.
+     * @return mixed
+     */
+    private function param($get, $name, $default = "any"){
+
+        return isset($get[$name]) ? $get[$name] : $default;
+    }
+
+    /**
+     * Build the search string: the user's terms followed by one term
+     * per active filter, separated by spaces.
+     *
+     * @param array $get Request parameters, see web().
+     * @return string
+     */
+    private function buildquery($get){
+
+        $search = [$this->param($get, "s", "")];
+
+        // search term contributed by each allow/deny/require filter
+        $terms = [
+            "javascript" => "js:true",
+            "trackers" => "special:tracking",
+            "cookies" => "special:cookies",
+            "affiliate" => "special:affiliate"
+        ];
+
+        foreach($terms as $filter => $term){
+
+            $value = $this->param($get, $filter);
+
+            if($value === "any"){ continue; }
+
+            // "deny" excludes the term, any other value requires it
+            $search[] = $value === "deny" ? "-" . $term : $term;
+        }
+
+        $format = $this->param($get, "format");
+
+        if($format !== "any"){
+
+            $search[] = "format:$format";
+        }
+
+        $file = $this->param($get, "file");
+
+        switch($file){
+
+            case "any": break;
+            case "nomedia": $search[] = "-special:media"; break;
+            case "media": $search[] = "special:media"; break;
+
+            default:
+                $search[] = "file:$file";
+        }
+
+        return implode(" ", $search);
+    }
+
+    /**
+     * Run a web search and return the results in the scraper output format.
+     *
+     * @param array $get Request parameters: "s" (search terms) plus the
+     *                   filters listed by getfilters("web"). A filter that
+     *                   is not set is treated as "any".
+     * @return array Result set; the matches are listed under "web".
+     * @throws Exception When the request fails, when the API answers
+     *                   "Slow down", or when the body is not usable JSON.
+     */
+    public function web($get){
+
+        $search = $this->buildquery($get);
+
+        $params = [
+            "count" => 20
+        ];
+
+        if($this->param($get, "profile") === "modern"){
+
+            $params["index"] = 1;
+        }
+
+        try{
+            $json =
+                $this->get(
+                    "https://api.marginalia.nu/{$this->key}/search/" . urlencode($search),
+                    $params
+                );
+        }catch(Exception $error){
+
+            // keep the transport error reachable through getPrevious()
+            throw new Exception("Failed to get JSON", 0, $error);
+        }
+
+        if($json === "Slow down"){
+
+            throw new Exception("The API key used is rate limited. Please try again in a few minutes.");
+        }
+
+        $json = json_decode($json, true);
+
+        // json_decode() yields null on malformed input; stop here instead
+        // of iterating over a missing result list
+        if(
+            !is_array($json) ||
+            !isset($json["results"]) ||
+            !is_array($json["results"])
+        ){
+
+            throw new Exception("Failed to decode JSON");
+        }
+
+        $out = [
+            "status" => "ok",
+            "spelling" => [
+                "type" => "no_correction",
+                "using" => null,
+                "correction" => null
+            ],
+            "npt" => null,
+            "answer" => [],
+            "web" => [],
+            "image" => [],
+            "video" => [],
+            "news" => [],
+            "related" => []
+        ];
+
+        foreach($json["results"] as $result){
+
+            $out["web"][] = [
+                "title" => $result["title"],
+                // descriptions are flattened onto a single line
+                "description" => str_replace("\n", " ", $result["description"]),
+                "url" => $result["url"],
+                "date" => null,
+                "type" => "web",
+                "thumb" => [
+                    "url" => null,
+                    "ratio" => null
+                ],
+                "sublink" => [],
+                "table" => []
+            ];
+        }
+
+        return $out;
+    }
+}
+