From a20d4de1e4d1a00c0d152136e5d7cd81481ace7b Mon Sep 17 00:00:00 2001
From: lolcat
Date: Mon, 26 Feb 2024 11:31:52 -0500
Subject: added mwmbl scraper

---
Review notes (ignored by git-am):
 * Line structure of this patch was restored from a whitespace-collapsed copy;
   indentation of context lines is conventional and may need
   `git apply --ignore-whitespace`.
 * The opening of scraper/mwmbl.php (`<?php` up to the first `$this->`) was
   missing from the received copy and has been reconstructed - please verify
   against the tree. Blob ids on the `index` lines still refer to the
   originally submitted content.
 * titledots(): rtrim() with a multibyte mask corrupted titles ending in
   characters such as "æ" or "р"; it now strips whole "…" sequences only.
 * web(): the caught exception is chained instead of being discarded.

 data/config.php    |   1 +
 docs/tor.md        |   2 +
 lib/frontend.php   |   6 ++
 scraper/google.php |   4 +-
 scraper/mwmbl.php  | 206 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 settings.php       |   4 ++
 6 files changed, 220 insertions(+), 3 deletions(-)
 create mode 100644 scraper/mwmbl.php

diff --git a/data/config.php b/data/config.php
index 3e0120d..fd9071e 100644
--- a/data/config.php
+++ b/data/config.php
@@ -104,6 +104,7 @@ class config{
 	const PROXY_PINTEREST = false;
 	const PROXY_SEZNAM = false;
 	const PROXY_NAVER = false;
+	const PROXY_MWMBL = false;
 	const PROXY_FTM = false; // findthatmeme
 	const PROXY_IMGUR = false;
 	const PROXY_YANDEX_W = false; // yandex web
diff --git a/docs/tor.md b/docs/tor.md
index 15521af..b29ac3d 100644
--- a/docs/tor.md
+++ b/docs/tor.md
@@ -12,3 +12,5 @@ This guide assumes that there is already a configured webserver sitting on port
 5. Restart the tor service using `service tor restart`
 6. Wait for a while...
 7. Run `cat /var/lib/tor/4get/hostname`. That is your onion address!
+
+# Specify your own tor address
diff --git a/lib/frontend.php b/lib/frontend.php
index 738ad83..7e3b6fb 100644
--- a/lib/frontend.php
+++ b/lib/frontend.php
@@ -902,6 +902,7 @@ class frontend{
 					"yandex" => "Yandex",
 					"google" => "Google",
 					"yep" => "Yep",
+					"mwmbl" => "Mwmbl",
 					"mojeek" => "Mojeek",
 					"marginalia" => "Marginalia",
 					"wiby" => "wiby",
@@ -1018,6 +1019,11 @@ class frontend{
 				$lib = new facebook();
 				break;*/
 
+			case "mwmbl":
+				include "scraper/mwmbl.php";
+				$lib = new mwmbl();
+				break;
+
 			case "mojeek":
 				include "scraper/mojeek.php";
 				$lib = new mojeek();
diff --git a/scraper/google.php b/scraper/google.php
index 37adf47..b0e4ded 100644
--- a/scraper/google.php
+++ b/scraper/google.php
@@ -538,8 +538,7 @@ class google{
 			$url .= "?" . $get;
 		}
 
-		//curl_setopt($curlproc, CURLOPT_URL, $url);
-		curl_setopt($curlproc, CURLOPT_URL, "https://ifconfig.co");
+		curl_setopt($curlproc, CURLOPT_URL, $url);
 
 		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
 		curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
@@ -560,7 +559,6 @@ class google{
 		}
 
 		curl_close($curlproc);
-		echo $data;
 		return $data;
 	}
 
diff --git a/scraper/mwmbl.php b/scraper/mwmbl.php
new file mode 100644
index 0000000..671ec78
--- /dev/null
+++ b/scraper/mwmbl.php
@@ -0,0 +1,206 @@
+<?php
+
+/**
+ * Scraper for the Mwmbl search engine.
+ */
+class mwmbl{
+
+	/**
+	 * Set up the backend("mwmbl") and fuckhtml helpers used by the methods below.
+	 */
+	public function __construct(){
+
+		include "lib/backend.php";
+		$this->backend = new backend("mwmbl");
+
+		include "lib/fuckhtml.php";
+		$this->fuckhtml = new fuckhtml();
+	}
+
+	/**
+	 * Mwmbl exposes no search filters.
+	 *
+	 * @param mixed $page Unused.
+	 * @return array Always empty.
+	 */
+	public function getfilters($page){
+
+		return [];
+	}
+
+	/**
+	 * Send a GET request through curl and return the response body.
+	 *
+	 * @param mixed $proxy Passed on to backend::assign_proxy().
+	 * @param string $url Target URL without a query string.
+	 * @param array $get Query parameters, appended to $url when not empty.
+	 * @return string Response body.
+	 * @throws Exception Carrying the curl error text when the transfer fails.
+	 */
+	private function get($proxy, $url, $get = []){
+
+		$curlproc = curl_init();
+
+		if($get !== []){
+			$get = http_build_query($get);
+			$url .= "?" . $get;
+		}
+
+		curl_setopt($curlproc, CURLOPT_URL, $url);
+
+		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
+		curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+			["User-Agent: " . config::USER_AGENT,
+			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+			"Accept-Language: en-US,en;q=0.5",
+			"Accept-Encoding: gzip",
+			"DNT: 1",
+			"Connection: keep-alive",
+			"Upgrade-Insecure-Requests: 1",
+			"Sec-Fetch-Dest: document",
+			"Sec-Fetch-Mode: navigate",
+			"Sec-Fetch-Site: none",
+			"Sec-Fetch-User: ?1"]
+		);
+
+		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
+		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
+		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
+		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
+		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+
+		$this->backend->assign_proxy($curlproc, $proxy);
+
+		$data = curl_exec($curlproc);
+
+		if(curl_errno($curlproc)){
+
+			throw new Exception(curl_error($curlproc));
+		}
+
+		curl_close($curlproc);
+		return $data;
+	}
+
+	/**
+	 * Run a web search and turn every "result" list item into a result entry.
+	 *
+	 * @param array $get Request parameters; "s" holds the search term.
+	 * @return array Result set; only the "web" list gets populated here.
+	 * @throws Exception When the search term is empty or the page fetch fails.
+	 */
+	public function web($get){
+
+		$search = $get["s"];
+		if(strlen($search) === 0){
+
+			throw new Exception("Search term is empty!");
+		}
+
+		try{
+			$html = $this->get(
+				$this->backend->get_ip(), // no next page!
+				"https://mwmbl.org/app/home/",
+				[
+					"q" => $search
+				]
+			);
+		}catch(Exception $error){
+
+			// keep the underlying curl error reachable through getPrevious()
+			throw new Exception("Failed to fetch HTML", 0, $error);
+		}
+
+		$out = [
+			"status" => "ok",
+			"spelling" => [
+				"type" => "no_correction",
+				"using" => null,
+				"correction" => null
+			],
+			"npt" => null,
+			"answer" => [],
+			"web" => [],
+			"image" => [],
+			"video" => [],
+			"news" => [],
+			"related" => []
+		];
+
+		$this->fuckhtml->load($html);
+
+		$results =
+			$this->fuckhtml
+			->getElementsByClassName(
+				"result",
+				"li"
+			);
+
+		foreach($results as $result){
+
+			$this->fuckhtml->load($result);
+
+			$p =
+				$this->fuckhtml
+				->getElementsByTagName("p");
+
+			$out["web"][] = [
+				"title" =>
+					$this->titledots(
+						$this->fuckhtml
+						->getTextContent(
+							$this->fuckhtml
+							->getElementsByClassName(
+								"title",
+								$p
+							)[0]
+						)
+					),
+				"description" =>
+					$this->titledots(
+						$this->fuckhtml
+						->getTextContent(
+							$this->fuckhtml
+							->getElementsByClassName(
+								"extract",
+								$p
+							)[0]
+						)
+					),
+				"url" =>
+					$this->fuckhtml
+					->getTextContent(
+						$this->fuckhtml
+						->getElementsByTagName("a")
+						[0]
+						["attributes"]
+						["href"]
+					),
+				"date" => null,
+				"type" => "web",
+				"thumb" => [
+					"url" => null,
+					"ratio" => null
+				],
+				"sublink" => [],
+				"table" => []
+			];
+		}
+
+		return $out;
+	}
+
+	/**
+	 * Strip trailing "…" characters from a title or description.
+	 *
+	 * @param string $title Text to clean up.
+	 * @return string $title without any trailing "…".
+	 */
+	private function titledots($title){
+
+		// rtrim($title, "…") treats "…" as a mask of its three bytes and
+		// would also chop the last byte off characters such as "æ" or "р".
+		// Match the whole character sequence instead (byte mode, no /u).
+		return preg_replace('/(?:…)+\z/', "", $title);
+	}
+}
diff --git a/settings.php b/settings.php
index 5572b19..49ba166 100644
--- a/settings.php
+++ b/settings.php
@@ -125,6 +125,10 @@ $settings = [
 					"value" => "yep",
 					"text" => "Yep"
 				],
+				[
+					"value" => "mwmbl",
+					"text" => "Mwmbl"
+				],
 				[
 					"value" => "mojeek",
 					"text" => "Mojeek"
-- 
cgit v1.2.3