From 130358a9e0504a55cf3f86b2d7035feb7f4e84de Mon Sep 17 00:00:00 2001 From: lolcat Date: Sun, 21 Apr 2024 19:31:56 -0400 Subject: v8 --- scraper/marginalia.php | 344 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 243 insertions(+), 101 deletions(-) (limited to 'scraper/marginalia.php') diff --git a/scraper/marginalia.php b/scraper/marginalia.php index b790a97..2a2c1e6 100644 --- a/scraper/marginalia.php +++ b/scraper/marginalia.php @@ -3,78 +3,103 @@ class marginalia{ public function __construct(){ + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); + include "lib/backend.php"; $this->backend = new backend("marginalia"); } public function getfilters($page){ - switch($page){ + if(config::MARGINALIA_API_KEY === null){ - case "web": - return [ - "profile" => [ - "display" => "Profile", - "option" => [ - "any" => "Default", - "modern" => "Modern" - ] - ], - "format" => [ - "display" => "Format", - "option" => [ - "any" => "Any", - "html5" => "html5", - "xhtml" => "xhtml", - "html123" => "html123" - ] - ], - "file" => [ - "display" => "File", - "option" => [ - "any" => "Any", - "nomedia" => "Deny media", - "media" => "Contains media", - "audio" => "Contains audio", - "video" => "Contains video", - "archive" => "Contains archive", - "document" => "Contains document" - ] - ], - "javascript" => [ - "display" => "Javascript", - "option" => [ - "any" => "Allow JS", - "deny" => "Deny JS", - "require" => "Require JS" - ] - ], - "trackers" => [ - "display" => "Trackers", - "option" => [ - "any" => "Allow trackers", - "deny" => "Deny trackers", - "require" => "Require trackers" - ] - ], - "cookies" => [ - "display" => "Cookies", - "option" => [ - "any" => "Allow cookies", - "deny" => "Deny cookies", - "require" => "Require cookies" - ] - ], - "affiliate" => [ - "display" => "Affiliate links in body", - "option" => [ - "any" => "Allow affiliate links", - "deny" => "Deny affiliate links", - "require" => "Require affiliate links" - ] + $base = [ + 
"adtech" => [ + "display" => "Reduce adtech", + "option" => [ + "no" => "No", + "yes" => "Yes" ] - ]; + ], + "recent" => [ + "display" => "Recent results", + "option" => [ + "no" => "No", + "yes" => "Yes" + ] + ], + "intitle" => [ + "display" => "Search in title", + "option" => [ + "no" => "No", + "yes" => "Yes" + ] + ] + ]; + }else{ + + $base = []; } + + return array_merge( + $base, + [ + "format" => [ + "display" => "Format", + "option" => [ + "any" => "Any format", + "html5" => "html5", + "xhtml" => "xhtml", + "html123" => "html123" + ] + ], + "file" => [ + "display" => "Filetype", + "option" => [ + "any" => "Any filetype", + "nomedia" => "Deny media", + "media" => "Contains media", + "audio" => "Contains audio", + "video" => "Contains video", + "archive" => "Contains archive", + "document" => "Contains document" + ] + ], + "javascript" => [ + "display" => "Javascript", + "option" => [ + "any" => "Allow JS", + "deny" => "Deny JS", + "require" => "Require JS" + ] + ], + "trackers" => [ + "display" => "Trackers", + "option" => [ + "any" => "Allow trackers", + "deny" => "Deny trackers", + "require" => "Require trackers" + ] + ], + "cookies" => [ + "display" => "Cookies", + "option" => [ + "any" => "Allow cookies", + "deny" => "Deny cookies", + "require" => "Require cookies" + ] + ], + "affiliate" => [ + "display" => "Affiliate links in body", + "option" => [ + "any" => "Allow affiliate links", + "deny" => "Deny affiliate links", + "require" => "Require affiliate links" + ] + ] + ] + ); } private function get($proxy, $url, $get = []){ @@ -132,7 +157,6 @@ class marginalia{ throw new Exception("Search term is empty!"); } - $profile = $get["profile"]; $format = $get["format"]; $file = $get["file"]; @@ -180,38 +204,6 @@ class marginalia{ $search = implode(" ", $search); - $params = [ - "count" => 20 - ]; - - if($profile == "modern"){ - - $params["index"] = 1; - } - - try{ - $json = - $this->get( - $this->backend->get_ip(), // no nextpage - "https://api.marginalia.nu/" . 
config::MARGINALIA_API_KEY . "/search/" . urlencode($search), - $params - ); - }catch(Exception $error){ - - throw new Exception("Failed to get JSON"); - } - - if($json == "Slow down"){ - - throw new Exception("The API key used is rate limited. Please try again in a few minutes."); - } - - $json = json_decode($json, true); - /* - $handle = fopen("scraper/marginalia.json", "r"); - $json = json_decode(fread($handle, filesize("scraper/marginalia.json")), true); - fclose($handle);*/ - $out = [ "status" => "ok", "spelling" => [ @@ -228,19 +220,169 @@ class marginalia{ "related" => [] ]; - foreach($json["results"] as $result){ + if(config::MARGINALIA_API_KEY !== null){ + + try{ + $json = + $this->get( + $this->backend->get_ip(), // no nextpage + "https://api.marginalia.nu/" . config::MARGINALIA_API_KEY . "/search/" . urlencode($search), + [ + "count" => 20 + ] + ); + }catch(Exception $error){ + + throw new Exception("Failed to get JSON"); + } + + if($json == "Slow down"){ + + throw new Exception("The API key used is rate limited. Please try again in a few minutes."); + } + + $json = json_decode($json, true); + + foreach($json["results"] as $result){ + + $out["web"][] = [ + "title" => $result["title"], + "description" => str_replace("\n", " ", $result["description"]), + "url" => $result["url"], + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + } + + return $out; + } + + // no more cloudflare!! 
Parse html by default + $params = [ + "query" => $search + ]; + + foreach(["adtech", "recent", "intitle"] as $v){ + + if($get[$v] == "yes"){ + + switch($v){ + + case "adtech": $params["adtech"] = "reduce"; break; + case "recent": $params["recent"] = "recent"; break; + case "intitle": $params["searchTitle"] = "title"; break; /* "Search in title" filter */ + } + } + } + + try{ + $html = + $this->get( + $this->backend->get_ip(), + "https://search.marginalia.nu/search", + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to get HTML"); + } + + $this->fuckhtml->load($html); + + $sections = + $this->fuckhtml + ->getElementsByClassName( + "card search-result", + "section" + ); + + foreach($sections as $section){ + + $this->fuckhtml->load($section); + + $title = + $this->fuckhtml + ->getElementsByClassName( + "title", + "a" + )[0]; + + $description = + $this->fuckhtml + ->getElementsByClassName( + "description", + "p" + ); + + if(count($description) !== 0){ + + $description = + $this->fuckhtml + ->getTextContent( + $description[0] + ); + }else{ + + $description = null; + } + + $sublinks = []; + $sublink_html = + $this->fuckhtml + ->getElementsByClassName("additional-results"); + + if(count($sublink_html) !== 0){ + + $this->fuckhtml->load($sublink_html[0]); + + $links = + $this->fuckhtml + ->getElementsByTagName("a"); + + foreach($links as $link){ + + $sublinks[] = [ + "title" => + $this->fuckhtml + ->getTextContent( + $link + ), + "date" => null, + "description" => null, + "url" => + $this->fuckhtml + ->getTextContent( + $link["attributes"]["href"] + ) + ]; + } + } $out["web"][] = [ - "title" => $result["title"], - "description" => str_replace("\n", " ", $result["description"]), - "url" => $result["url"], + "title" => + $this->fuckhtml + ->getTextContent( + $title + ), + "description" => $description, + "url" => + $this->fuckhtml + ->getTextContent( + $title["attributes"]["href"] + ), "date" => null, "type" => "web", "thumb" => [ "url" => null, "ratio" => null ], - "sublink" => [], 
+ "sublink" => $sublinks, "table" => [] ]; } -- cgit v1.2.3