summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorlolcat <will@lolcat.ca>2024-02-26 11:31:52 -0500
committerlolcat <will@lolcat.ca>2024-02-26 11:31:52 -0500
commita20d4de1e4d1a00c0d152136e5d7cd81481ace7b (patch)
tree185e231969aa116136a8e8314aa726687e94a0a2
parent8944ca68944929f1da49a0bfb310675a6287fdc3 (diff)
added mwmbl scraper
-rw-r--r--data/config.php1
-rw-r--r--docs/tor.md2
-rw-r--r--lib/frontend.php6
-rw-r--r--scraper/google.php4
-rw-r--r--scraper/mwmbl.php168
-rw-r--r--settings.php4
6 files changed, 182 insertions, 3 deletions
diff --git a/data/config.php b/data/config.php
index 3e0120d..fd9071e 100644
--- a/data/config.php
+++ b/data/config.php
@@ -104,6 +104,7 @@ class config{
const PROXY_PINTEREST = false;
const PROXY_SEZNAM = false;
const PROXY_NAVER = false;
+ const PROXY_MWMBL = false;
const PROXY_FTM = false; // findthatmeme
const PROXY_IMGUR = false;
const PROXY_YANDEX_W = false; // yandex web
diff --git a/docs/tor.md b/docs/tor.md
index 15521af..b29ac3d 100644
--- a/docs/tor.md
+++ b/docs/tor.md
@@ -12,3 +12,5 @@ This guide assumes that there is already a configured webserver sitting on port
5. Restart the tor service using `service tor restart`
6. Wait for a while...
7. Run `cat /var/lib/tor/4get/hostname`. That is your onion address!
+
+# Specify your own tor address
diff --git a/lib/frontend.php b/lib/frontend.php
index 738ad83..7e3b6fb 100644
--- a/lib/frontend.php
+++ b/lib/frontend.php
@@ -902,6 +902,7 @@ class frontend{
"yandex" => "Yandex",
"google" => "Google",
"yep" => "Yep",
+ "mwmbl" => "Mwmbl",
"mojeek" => "Mojeek",
"marginalia" => "Marginalia",
"wiby" => "wiby",
@@ -1018,6 +1019,11 @@ class frontend{
$lib = new facebook();
break;*/
+ case "mwmbl":
+ include "scraper/mwmbl.php";
+ $lib = new mwmbl();
+ break;
+
case "mojeek":
include "scraper/mojeek.php";
$lib = new mojeek();
diff --git a/scraper/google.php b/scraper/google.php
index 37adf47..b0e4ded 100644
--- a/scraper/google.php
+++ b/scraper/google.php
@@ -538,8 +538,7 @@ class google{
$url .= "?" . $get;
}
- //curl_setopt($curlproc, CURLOPT_URL, $url);
- curl_setopt($curlproc, CURLOPT_URL, "https://ifconfig.co");
+ curl_setopt($curlproc, CURLOPT_URL, $url);
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
@@ -560,7 +559,6 @@ class google{
}
curl_close($curlproc);
- echo $data;
return $data;
}
diff --git a/scraper/mwmbl.php b/scraper/mwmbl.php
new file mode 100644
index 0000000..671ec78
--- /dev/null
+++ b/scraper/mwmbl.php
@@ -0,0 +1,168 @@
+<?php
+
/**
 * Scraper for the Mwmbl search engine (https://mwmbl.org).
 *
 * Fetches the HTML search page, parses every <li class="result"> entry and
 * returns the hits in the result-array layout built in web().
 */
class mwmbl{
	
	/**
	 * Helper that supplies the outgoing IP/proxy (get_ip) and applies it to
	 * a cURL handle (assign_proxy). Declared explicitly (and kept public, as
	 * the former dynamic property was) to avoid the PHP 8.2 dynamic-property
	 * deprecation.
	 */
	public $backend;
	
	/**
	 * HTML parser instance used by web(). Declared for the same reason as
	 * $backend.
	 */
	public $fuckhtml;
	
	/**
	 * Loads the backend and HTML parser libraries and instantiates them.
	 */
	public function __construct(){
		
		include "lib/backend.php";
		$this->backend = new backend("mwmbl");
		
		include "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
	}
	
	/**
	 * Returns the filters available for a page type.
	 *
	 * @param mixed $page Requested page type (unused).
	 * @return array Always empty: this scraper exposes no filters.
	 */
	public function getfilters($page){
		
		return [];
	}
	
	/**
	 * Performs a GET request and returns the response body.
	 *
	 * @param mixed  $proxy Proxy descriptor handed to backend::assign_proxy().
	 * @param string $url   Base URL, without query string.
	 * @param array  $get   Query parameters; appended to $url when non-empty.
	 * @return string|bool  Body as returned by curl_exec().
	 * @throws Exception    With the cURL error message when the transfer fails.
	 */
	private function get($proxy, $url, $get = []){
		
		if($get !== []){
			
			$url .= "?" . http_build_query($get);
		}
		
		$curlproc = curl_init();
		
		// try/finally guarantees the handle is released on every exit path,
		// including the error throw below and anything assign_proxy() throws.
		try{
			curl_setopt($curlproc, CURLOPT_URL, $url);
			
			curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
			curl_setopt($curlproc, CURLOPT_HTTPHEADER,
				["User-Agent: " . config::USER_AGENT,
				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
				"Accept-Language: en-US,en;q=0.5",
				"Accept-Encoding: gzip",
				"DNT: 1",
				"Connection: keep-alive",
				"Upgrade-Insecure-Requests: 1",
				"Sec-Fetch-Dest: document",
				"Sec-Fetch-Mode: navigate",
				"Sec-Fetch-Site: none",
				"Sec-Fetch-User: ?1"]
			);
			
			curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
			curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
			curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
			curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
			curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
			
			$this->backend->assign_proxy($curlproc, $proxy);
			
			$data = curl_exec($curlproc);
			
			if(curl_errno($curlproc)){
				
				throw new Exception(curl_error($curlproc));
			}
			
			return $data;
			
		}finally{
			
			curl_close($curlproc);
		}
	}
	
	/**
	 * Runs a web search.
	 *
	 * @param array $get Request parameters; "s" holds the search term.
	 * @return array Result set with keys status, spelling, npt, answer, web,
	 *               image, video, news and related. Only "web" is populated;
	 *               "npt" is always null because no next page is requested.
	 * @throws Exception When the search term is empty or the page cannot be
	 *                   fetched (the underlying error is attached as previous).
	 */
	public function web($get){
		
		$search = $get["s"];
		if(strlen($search) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		try{
			$html = $this->get(
				$this->backend->get_ip(), // no next page!
				"https://mwmbl.org/app/home/",
				[
					"q" => $search
				]
			);
		}catch(Exception $error){
			
			// same message as before; keep the root cause for debugging
			throw new Exception("Failed to fetch HTML", 0, $error);
		}
		
		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null,
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];
		
		$this->fuckhtml->load($html);
		
		$results =
			$this->fuckhtml
			->getElementsByClassName(
				"result",
				"li"
			);
		
		foreach($results as $result){
			
			$this->fuckhtml->load($result);
			
			$p =
				$this->fuckhtml
				->getElementsByTagName("p");
			
			// title: mandatory, skip malformed entries instead of
			// indexing a missing element
			$titles =
				$this->fuckhtml
				->getElementsByClassName(
					"title",
					$p
				);
			
			if(!isset($titles[0])){
				
				continue;
			}
			
			$title =
				$this->titledots(
					$this->fuckhtml
					->getTextContent($titles[0])
				);
			
			// description: optional
			// NOTE(review): null mirrors the other "no value" fields of the
			// record; confirm the frontend accepts a null description.
			$extracts =
				$this->fuckhtml
				->getElementsByClassName(
					"extract",
					$p
				);
			
			$description = null;
			if(isset($extracts[0])){
				
				$description =
					$this->titledots(
						$this->fuckhtml
						->getTextContent($extracts[0])
					);
			}
			
			// url: mandatory, a result without a link is useless
			$links =
				$this->fuckhtml
				->getElementsByTagName("a");
			
			if(!isset($links[0]["attributes"]["href"])){
				
				continue;
			}
			
			$out["web"][] = [
				"title" => $title,
				"description" => $description,
				"url" =>
					$this->fuckhtml
					->getTextContent(
						$links[0]["attributes"]["href"]
					),
				"date" => null,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => [],
				"table" => []
			];
		}
		
		return $out;
	}
	
	/**
	 * Strips trailing "…" (U+2026) characters from a string.
	 *
	 * rtrim() cannot be used here: its character mask is byte-wise, so a
	 * "…" mask removes any trailing 0xE2/0x80/0xA6 byte and corrupts text
	 * ending in e.g. "æ" (C3 A6) or "À" (C3 80). The pattern below matches
	 * the complete three-byte sequence only, anchored at the very end.
	 *
	 * @param string $title Text to clean.
	 * @return string Text without trailing ellipsis characters.
	 */
	private function titledots($title){
		
		$title = (string)$title;
		
		$trimmed = preg_replace('/(?:\xE2\x80\xA6)+\z/', "", $title);
		
		// preg_replace() yields null on an engine error; fall back to input
		return $trimmed === null ? $title : $trimmed;
	}
}
diff --git a/settings.php b/settings.php
index 5572b19..49ba166 100644
--- a/settings.php
+++ b/settings.php
@@ -126,6 +126,10 @@ $settings = [
"text" => "Yep"
],
[
+ "value" => "mwmbl",
+ "text" => "Mwmbl"
+ ],
+ [
"value" => "mojeek",
"text" => "Mojeek"
],