summaryrefslogtreecommitdiff
path: root/scraper/marginalia.php
diff options
context:
space:
mode:
authorlolcat <will@lolcat.ca>2024-04-21 19:31:56 -0400
committerlolcat <will@lolcat.ca>2024-04-21 19:31:56 -0400
commit130358a9e0504a55cf3f86b2d7035feb7f4e84de (patch)
tree81f59790f7ead0b393a0e0b25caa082216245fcd /scraper/marginalia.php
parent9e18327df69542e07fad2ef471a3ebdbe9b08ae8 (diff)
v8
Diffstat (limited to 'scraper/marginalia.php')
-rw-r--r--scraper/marginalia.php344
1 files changed, 243 insertions, 101 deletions
diff --git a/scraper/marginalia.php b/scraper/marginalia.php
index b790a97..2a2c1e6 100644
--- a/scraper/marginalia.php
+++ b/scraper/marginalia.php
@@ -3,78 +3,103 @@
class marginalia{
public function __construct(){
+ include "lib/fuckhtml.php";
+ $this->fuckhtml = new fuckhtml();
+
include "lib/backend.php";
$this->backend = new backend("marginalia");
}
public function getfilters($page){
- switch($page){
+ if(config::MARGINALIA_API_KEY === null){
- case "web":
- return [
- "profile" => [
- "display" => "Profile",
- "option" => [
- "any" => "Default",
- "modern" => "Modern"
- ]
- ],
- "format" => [
- "display" => "Format",
- "option" => [
- "any" => "Any",
- "html5" => "html5",
- "xhtml" => "xhtml",
- "html123" => "html123"
- ]
- ],
- "file" => [
- "display" => "File",
- "option" => [
- "any" => "Any",
- "nomedia" => "Deny media",
- "media" => "Contains media",
- "audio" => "Contains audio",
- "video" => "Contains video",
- "archive" => "Contains archive",
- "document" => "Contains document"
- ]
- ],
- "javascript" => [
- "display" => "Javascript",
- "option" => [
- "any" => "Allow JS",
- "deny" => "Deny JS",
- "require" => "Require JS"
- ]
- ],
- "trackers" => [
- "display" => "Trackers",
- "option" => [
- "any" => "Allow trackers",
- "deny" => "Deny trackers",
- "require" => "Require trackers"
- ]
- ],
- "cookies" => [
- "display" => "Cookies",
- "option" => [
- "any" => "Allow cookies",
- "deny" => "Deny cookies",
- "require" => "Require cookies"
- ]
- ],
- "affiliate" => [
- "display" => "Affiliate links in body",
- "option" => [
- "any" => "Allow affiliate links",
- "deny" => "Deny affiliate links",
- "require" => "Require affiliate links"
- ]
+ $base = [
+ "adtech" => [
+ "display" => "Reduce adtech",
+ "option" => [
+ "no" => "No",
+ "yes" => "Yes"
]
- ];
+ ],
+ "recent" => [
+ "display" => "Recent results",
+ "option" => [
+ "no" => "No",
+ "yes" => "Yes"
+ ]
+ ],
+ "intitle" => [
+ "display" => "Search in title",
+ "option" => [
+ "no" => "No",
+ "yes" => "Yes"
+ ]
+ ]
+ ];
+ }else{
+
+ $base = [];
}
+
+ return array_merge(
+ $base,
+ [
+ "format" => [
+ "display" => "Format",
+ "option" => [
+ "any" => "Any format",
+ "html5" => "html5",
+ "xhtml" => "xhtml",
+ "html123" => "html123"
+ ]
+ ],
+ "file" => [
+ "display" => "Filetype",
+ "option" => [
+ "any" => "Any filetype",
+ "nomedia" => "Deny media",
+ "media" => "Contains media",
+ "audio" => "Contains audio",
+ "video" => "Contains video",
+ "archive" => "Contains archive",
+ "document" => "Contains document"
+ ]
+ ],
+ "javascript" => [
+ "display" => "Javascript",
+ "option" => [
+ "any" => "Allow JS",
+ "deny" => "Deny JS",
+ "require" => "Require JS"
+ ]
+ ],
+ "trackers" => [
+ "display" => "Trackers",
+ "option" => [
+ "any" => "Allow trackers",
+ "deny" => "Deny trackers",
+ "require" => "Require trackers"
+ ]
+ ],
+ "cookies" => [
+ "display" => "Cookies",
+ "option" => [
+ "any" => "Allow cookies",
+ "deny" => "Deny cookies",
+ "require" => "Require cookies"
+ ]
+ ],
+ "affiliate" => [
+ "display" => "Affiliate links in body",
+ "option" => [
+ "any" => "Allow affiliate links",
+ "deny" => "Deny affiliate links",
+ "require" => "Require affiliate links"
+ ]
+ ]
+ ]
+ );
}
private function get($proxy, $url, $get = []){
@@ -132,7 +157,6 @@ class marginalia{
throw new Exception("Search term is empty!");
}
- $profile = $get["profile"];
$format = $get["format"];
$file = $get["file"];
@@ -180,38 +204,6 @@ class marginalia{
$search = implode(" ", $search);
- $params = [
- "count" => 20
- ];
-
- if($profile == "modern"){
-
- $params["index"] = 1;
- }
-
- try{
- $json =
- $this->get(
- $this->backend->get_ip(), // no nextpage
- "https://api.marginalia.nu/" . config::MARGINALIA_API_KEY . "/search/" . urlencode($search),
- $params
- );
- }catch(Exception $error){
-
- throw new Exception("Failed to get JSON");
- }
-
- if($json == "Slow down"){
-
- throw new Exception("The API key used is rate limited. Please try again in a few minutes.");
- }
-
- $json = json_decode($json, true);
- /*
- $handle = fopen("scraper/marginalia.json", "r");
- $json = json_decode(fread($handle, filesize("scraper/marginalia.json")), true);
- fclose($handle);*/
-
$out = [
"status" => "ok",
"spelling" => [
@@ -228,19 +220,169 @@ class marginalia{
"related" => []
];
- foreach($json["results"] as $result){
+ if(config::MARGINALIA_API_KEY !== null){
+
+ try{
+ $json =
+ $this->get(
+ $this->backend->get_ip(), // no nextpage
+ "https://api.marginalia.nu/" . config::MARGINALIA_API_KEY . "/search/" . urlencode($search),
+ [
+ "count" => 20
+ ]
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to get JSON");
+ }
+
+ if($json == "Slow down"){
+
+ throw new Exception("The API key used is rate limited. Please try again in a few minutes.");
+ }
+
+ $json = json_decode($json, true);
+
+ foreach($json["results"] as $result){
+
+ $out["web"][] = [
+ "title" => $result["title"],
+ "description" => str_replace("\n", " ", $result["description"]),
+ "url" => $result["url"],
+ "date" => null,
+ "type" => "web",
+ "thumb" => [
+ "url" => null,
+ "ratio" => null
+ ],
+ "sublink" => [],
+ "table" => []
+ ];
+ }
+
+ return $out;
+ }
+
+ // no more cloudflare!! Parse html by default
+ $params = [
+ "query" => $search
+ ];
+
+ foreach(["adtech", "recent", "intitle"] as $v){ // translate each enabled yes/no filter into the query parameter sent with $params
+
+ if($get[$v] == "yes"){
+
+ switch($v){
+
+ case "adtech": $params["adtech"] = "reduce"; break;
+ case "recent": $params["recent"] = "recent"; break;
+ case "intitle": $params["searchTitle"] = "title"; break; // label was a duplicate, unreachable "adtech", so the "intitle" filter never set searchTitle
+ }
+ }
+ }
+
+ try{
+ $html =
+ $this->get(
+ $this->backend->get_ip(),
+ "https://search.marginalia.nu/search",
+ $params
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to get HTML");
+ }
+
+ $this->fuckhtml->load($html);
+
+ $sections =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "card search-result",
+ "section"
+ );
+
+ foreach($sections as $section){
+
+ $this->fuckhtml->load($section);
+
+ $title =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "title",
+ "a"
+ )[0];
+
+ $description =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "description",
+ "p"
+ );
+
+ if(count($description) !== 0){
+
+ $description =
+ $this->fuckhtml
+ ->getTextContent(
+ $description[0]
+ );
+ }else{
+
+ $description = null;
+ }
+
+ $sublinks = [];
+ $sublink_html =
+ $this->fuckhtml
+ ->getElementsByClassName("additional-results");
+
+ if(count($sublink_html) !== 0){
+
+ $this->fuckhtml->load($sublink_html[0]);
+
+ $links =
+ $this->fuckhtml
+ ->getElementsByTagName("a");
+
+ foreach($links as $link){
+
+ $sublinks[] = [
+ "title" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $link
+ ),
+ "date" => null,
+ "description" => null,
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $link["attributes"]["href"]
+ )
+ ];
+ }
+ }
$out["web"][] = [
- "title" => $result["title"],
- "description" => str_replace("\n", " ", $result["description"]),
- "url" => $result["url"],
+ "title" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $title
+ ),
+ "description" => $description,
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $title["attributes"]["href"]
+ ),
"date" => null,
"type" => "web",
"thumb" => [
"url" => null,
"ratio" => null
],
- "sublink" => [],
+ "sublink" => $sublinks,
"table" => []
];
}