diff options
author | lolcat <will@lolcat.ca> | 2023-07-22 14:41:14 -0400 |
---|---|---|
committer | lolcat <will@lolcat.ca> | 2023-07-22 14:41:14 -0400 |
commit | bca265aea67ec62499aaa113a6490ce9ec7fe730 (patch) | |
tree | 3f05ec5ea542e41b474947e180034f42e99648e9 /scraper/marginalia.php |
still missing things on google scraper
Diffstat (limited to 'scraper/marginalia.php')
-rw-r--r-- | scraper/marginalia.php | 242 |
1 files changed, 242 insertions, 0 deletions
/**
 * Scraper for the Marginalia search API (api.marginalia.nu).
 *
 * Exposes the filters offered for the "web" page and turns a set of
 * request parameters into a search query, returning the API results as an
 * array with "web", "image", "video", "news" (...) sections.
 */
class marginalia{
	
	/**
	 * API key inserted into the request path. The constructor sets it to
	 * "public". Declared explicitly because creating properties dynamically
	 * is deprecated as of PHP 8.2.
	 */
	public $key;
	
	public function __construct(){
		
		$this->key = "public";
	}
	
	/**
	 * Describe the filters available for a result page.
	 *
	 * @param string $page Page identifier; only "web" has filters.
	 * @return array Map of filter name => ["display" => label,
	 *               "option" => [value => label]]. Empty for any page
	 *               other than "web".
	 */
	public function getfilters($page){
		
		switch($page){
			
			case "web":
				return [
					"profile" => [
						"display" => "Profile",
						"option" => [
							"any" => "Default",
							"modern" => "Modern"
						]
					],
					"format" => [
						"display" => "Format",
						"option" => [
							"any" => "Any",
							"html5" => "html5",
							"xhtml" => "xhtml",
							"html123" => "html123"
						]
					],
					"file" => [
						"display" => "File",
						"option" => [
							"any" => "Any",
							"nomedia" => "Deny media",
							"media" => "Contains media",
							"audio" => "Contains audio",
							"video" => "Contains video",
							"archive" => "Contains archive",
							"document" => "Contains document"
						]
					],
					"javascript" => [
						"display" => "Javascript",
						"option" => [
							"any" => "Allow JS",
							"deny" => "Deny JS",
							"require" => "Require JS"
						]
					],
					"trackers" => [
						"display" => "Trackers",
						"option" => [
							"any" => "Allow trackers",
							"deny" => "Deny trackers",
							"require" => "Require trackers"
						]
					],
					"cookies" => [
						"display" => "Cookies",
						"option" => [
							"any" => "Allow cookies",
							"deny" => "Deny cookies",
							"require" => "Require cookies"
						]
					],
					"affiliate" => [
						"display" => "Affiliate links in body",
						"option" => [
							"any" => "Allow affiliate links",
							"deny" => "Deny affiliate links",
							"require" => "Require affiliate links"
						]
					]
				];
			
			default:
				// unknown pages have no filters; return an empty list
				// instead of falling off the switch with null
				return [];
		}
	}
	
	/**
	 * Perform an HTTP GET request with browser-like headers.
	 *
	 * @param string $url Target URL without query string.
	 * @param array  $get Query parameters; appended to $url when non-empty.
	 * @return string Response body as returned by curl_exec().
	 * @throws Exception With the curl error message when the transfer fails.
	 */
	private function get($url, $get = []){
		
		// NOTE(review): "rv:107.0" does not match "Firefox/110.0" in the
		// User-Agent below; left untouched, confirm which one is intended.
		$headers = [
			"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0",
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"DNT: 1",
			"Connection: keep-alive",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: none",
			"Sec-Fetch-User: ?1"
		];
		
		$curlproc = curl_init();
		
		if($get !== []){
			$get = http_build_query($get);
			$url .= "?" . $get;
		}
		
		curl_setopt($curlproc, CURLOPT_URL, $url);
		
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		
		$data = curl_exec($curlproc);
		
		if(curl_errno($curlproc)){
			
			// read the message before closing, and close before throwing
			// so the handle is released on the error path as well
			$message = curl_error($curlproc);
			curl_close($curlproc);
			
			throw new Exception($message);
		}
		
		curl_close($curlproc);
		return $data;
	}
	
	/**
	 * Read a key from an array, falling back to a default when it is unset.
	 *
	 * @param array  $source  Array to read from.
	 * @param string $name    Key to look up.
	 * @param mixed  $default Value returned when the key is missing or null.
	 * @return mixed
	 */
	private function param($source, $name, $default){
		
		return isset($source[$name]) ? $source[$name] : $default;
	}
	
	/**
	 * Build the search string sent to the API from the request parameters.
	 *
	 * The user query ("s") comes first, followed by one operator per
	 * active filter, all joined with single spaces.
	 *
	 * @param array $get Request parameters (see web()).
	 * @return string
	 */
	private function buildsearch($get){
		
		$search = [$this->param($get, "s", "")];
		
		// operator emitted when the filter is required; prefixed with "-"
		// when the filter is set to "deny"
		$operators = [
			"javascript" => "js:true",
			"trackers" => "special:tracking",
			"cookies" => "special:cookies",
			"affiliate" => "special:affiliate"
		];
		
		foreach($operators as $key => $operator){
			
			$value = $this->param($get, $key, "any");
			
			if($value == "any"){ continue; }
			
			if($value == "deny"){
				$operator = "-" . $operator;
			}
			
			$search[] = $operator;
		}
		
		$format = $this->param($get, "format", "any");
		
		if($format != "any"){
			
			$search[] = "format:$format";
		}
		
		$file = $this->param($get, "file", "any");
		
		switch($file){
			
			case "any": break;
			case "nomedia": $search[] = "-special:media"; break;
			case "media": $search[] = "special:media"; break;
			
			default:
				$search[] = "file:$file";
		}
		
		return implode(" ", $search);
	}
	
	/**
	 * Run a web search against the API.
	 *
	 * @param array $get Request parameters: "s" (query), "profile",
	 *                   "format", "file", "javascript", "trackers",
	 *                   "cookies", "affiliate". Missing filters are
	 *                   treated as "any"; a missing query as "".
	 * @return array Result set with "status", "spelling", "npt", "answer",
	 *               "web", "image", "video", "news" and "related" keys;
	 *               only "web" is populated.
	 * @throws Exception When the request fails, the API reports rate
	 *                   limiting, or the response is not valid JSON.
	 */
	public function web($get){
		
		$search = $this->buildsearch($get);
		
		$params = [
			"count" => 20
		];
		
		if($this->param($get, "profile", "any") == "modern"){
			
			$params["index"] = 1;
		}
		
		try{
			$json =
				$this->get(
					"https://api.marginalia.nu/{$this->key}/search/" . urlencode($search),
					$params
				);
		}catch(Exception $error){
			
			// keep the transport error reachable through getPrevious()
			throw new Exception("Failed to get JSON", 0, $error);
		}
		
		if($json === "Slow down"){
			
			throw new Exception("The API key used is rate limited. Please try again in a few minutes.");
		}
		
		$json = json_decode($json, true);
		
		if(!is_array($json)){
			
			// json_decode() yields null on malformed input
			throw new Exception("Failed to decode JSON");
		}
		
		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null,
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];
		
		$results = $this->param($json, "results", []);
		
		if(!is_array($results)){
			
			$results = [];
		}
		
		foreach($results as $result){
			
			$out["web"][] = [
				"title" => $this->param($result, "title", null),
				"description" => str_replace("\n", " ", $this->param($result, "description", "")),
				"url" => $this->param($result, "url", null),
				"date" => null,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => [],
				"table" => []
			];
		}
		
		return $out;
	}
}