summary | refs | log | tree | commit | diff
path: root/scraper/mojeek.php
diff options
context:
space:
mode:
author lolcat <will@lolcat.ca> 2023-07-22 14:41:14 -0400
committer lolcat <will@lolcat.ca> 2023-07-22 14:41:14 -0400
commit bca265aea67ec62499aaa113a6490ce9ec7fe730 (patch)
tree 3f05ec5ea542e41b474947e180034f42e99648e9 /scraper/mojeek.php
still missing things on google scraper
Diffstat (limited to 'scraper/mojeek.php')
-rw-r--r-- scraper/mojeek.php 1182
1 files changed, 1182 insertions, 0 deletions
diff --git a/scraper/mojeek.php b/scraper/mojeek.php
new file mode 100644
index 0000000..a0b5016
--- /dev/null
+++ b/scraper/mojeek.php
@@ -0,0 +1,1182 @@
+<?php
+
+class mojeek{
+ /**
+  * HTML parser instance used by web() and news().
+  * Declared explicitly because creating properties dynamically is
+  * deprecated as of PHP 8.2; kept public to match the visibility the
+  * dynamically created property had.
+  */
+ public $fuckhtml;
+
+ /**
+  * Next-page token store, created with the "mojeek" scraper name.
+  * Public for the same backward-compatibility reason as $fuckhtml.
+  */
+ public $nextpage;
+
+ /**
+  * Load the helper libraries and create the parser and token store.
+  */
+ public function __construct(){
+
+     // include_once: a plain include would redeclare the class (fatal error)
+     // if this constructor runs more than once in the same request
+     include_once "lib/fuckhtml.php";
+     $this->fuckhtml = new fuckhtml();
+
+     include_once "lib/nextpage.php";
+     $this->nextpage = new nextpage("mojeek");
+ }
+
+ /**
+  * Describe the search filters available for a result page.
+  *
+  * Each filter is keyed by the parameter name web() reads from its input
+  * and holds a "display" label plus an "option" map of value => label.
+  *
+  * @param string $page Page identifier ("web" or "news").
+  * @return array Filter definitions; an empty array for "news" and for
+  *               any page this scraper does not know about.
+  */
+ public function getfilters($page){
+
+     switch($page){
+
+         case "web":
+             return [
+                 "focus" => [
+                     "display" => "Focus",
+                     "option" => [
+                         "any" => "No focus",
+                         "blogs" => "Blogs",
+                         "Dictionary" => "Dictionary",
+                         "Recipes" => "Recipes",
+                         "Time" => "Time",
+                         "Weather" => "Weather"
+                     ]
+                 ],
+                 "lang" => [
+                     "display" => "Language",
+                     "option" => [
+                         "any" => "Any language",
+                         "af" => "Afrikaans",
+                         "sq" => "Albanian",
+                         "an" => "Aragonese",
+                         "ay" => "Aymara",
+                         "bi" => "Bislama",
+                         "br" => "Breton",
+                         "ca" => "Catalan",
+                         "kw" => "Cornish",
+                         "co" => "Corsican",
+                         "hr" => "Croatian",
+                         "da" => "Danish",
+                         "nl" => "Dutch",
+                         "dz" => "Dzongkha",
+                         "en" => "English",
+                         "fj" => "Fijian",
+                         "fi" => "Finnish",
+                         "fr" => "French",
+                         "gd" => "Gaelic",
+                         "gl" => "Galician",
+                         "de" => "German",
+                         "ht" => "Haitian",
+                         "io" => "Ido",
+                         "id" => "Indonesian",
+                         "ia" => "Interlingua",
+                         "ie" => "Interlingue",
+                         "ga" => "Irish",
+                         "it" => "Italian",
+                         "rw" => "Kinyarwanda",
+                         "la" => "Latin",
+                         "li" => "Limburgish",
+                         "lb" => "Luxembourgish",
+                         "no" => "Norwegian",
+                         "nb" => "Norwegian Bokmål",
+                         "nn" => "Norwegian Nynorsk",
+                         "oc" => "Occitan (post 1500)",
+                         "pl" => "Polish",
+                         "pt" => "Portuguese",
+                         "rm" => "Romansh",
+                         "rn" => "Rundi",
+                         "sg" => "Sango",
+                         "so" => "Somali",
+                         "es" => "Spanish",
+                         "sw" => "Swahili",
+                         "ss" => "Swati",
+                         "sv" => "Swedish",
+                         "ty" => "Tahitian",
+                         "to" => "Tonga (Tonga Islands)",
+                         "ts" => "Tsonga",
+                         "vo" => "Volapük",
+                         "wa" => "Walloon",
+                         "cy" => "Welsh",
+                         "xh" => "Xhosa",
+                         "zu" => "Zulu"
+                     ]
+                 ],
+                 "country" => [
+                     "display" => "Country",
+                     "option" => [
+                         "any" => "No location bias",
+                         "af" => "Afghanistan",
+                         "ax" => "Åland Islands",
+                         "al" => "Albania",
+                         "dz" => "Algeria",
+                         "as" => "American Samoa",
+                         "ad" => "Andorra",
+                         "ao" => "Angola",
+                         "ai" => "Anguilla",
+                         "aq" => "Antarctica",
+                         "ag" => "Antigua and Barbuda",
+                         "ar" => "Argentina",
+                         "am" => "Armenia",
+                         "aw" => "Aruba",
+                         "au" => "Australia",
+                         "at" => "Austria",
+                         "az" => "Azerbaijan",
+                         "bs" => "Bahamas",
+                         "bh" => "Bahrain",
+                         "bd" => "Bangladesh",
+                         "bb" => "Barbados",
+                         "by" => "Belarus",
+                         "be" => "Belgium",
+                         "bz" => "Belize",
+                         "bj" => "Benin",
+                         "bm" => "Bermuda",
+                         "bt" => "Bhutan",
+                         "bo" => "Bolivia (Plurinational State of)",
+                         "bq" => "Bonaire, Sint Eustatius and Saba",
+                         "ba" => "Bosnia and Herzegovina",
+                         "bw" => "Botswana",
+                         "bv" => "Bouvet Island",
+                         "br" => "Brazil",
+                         "io" => "British Indian Ocean Territory",
+                         "bn" => "Brunei Darussalam",
+                         "bg" => "Bulgaria",
+                         "bf" => "Burkina Faso",
+                         "bi" => "Burundi",
+                         "cv" => "Cabo Verde",
+                         "kh" => "Cambodia",
+                         "cm" => "Cameroon",
+                         "ca" => "Canada",
+                         "ky" => "Cayman Islands",
+                         "cf" => "Central African Republic",
+                         "td" => "Chad",
+                         "cl" => "Chile",
+                         "cn" => "China",
+                         "cx" => "Christmas Island",
+                         "cc" => "Cocos (Keeling) Islands",
+                         "co" => "Colombia",
+                         "km" => "Comoros",
+                         "cg" => "Congo",
+                         "cd" => "Congo (Democratic Republic of the)",
+                         "ck" => "Cook Islands",
+                         "cr" => "Costa Rica",
+                         "ci" => "Côte d'Ivoire",
+                         "hr" => "Croatia",
+                         "cu" => "Cuba",
+                         "cw" => "Curaçao",
+                         "cy" => "Cyprus",
+                         "cz" => "Czechia",
+                         "dk" => "Denmark",
+                         "dj" => "Djibouti",
+                         "dm" => "Dominica",
+                         "do" => "Dominican Republic",
+                         "ec" => "Ecuador",
+                         "eg" => "Egypt",
+                         "sv" => "El Salvador",
+                         "gq" => "Equatorial Guinea",
+                         "er" => "Eritrea",
+                         "ee" => "Estonia",
+                         "et" => "Ethiopia",
+                         "fk" => "Falkland Islands (Malvinas)",
+                         "fo" => "Faroe Islands",
+                         "fj" => "Fiji",
+                         "fi" => "Finland",
+                         "fr" => "France",
+                         "gf" => "French Guiana",
+                         "pf" => "French Polynesia",
+                         "tf" => "French Southern Territories",
+                         "ga" => "Gabon",
+                         "gm" => "Gambia",
+                         "ge" => "Georgia",
+                         "de" => "Germany",
+                         "gh" => "Ghana",
+                         "gi" => "Gibraltar",
+                         "gr" => "Greece",
+                         "gl" => "Greenland",
+                         "gd" => "Grenada",
+                         "gp" => "Guadeloupe",
+                         "gu" => "Guam",
+                         "gt" => "Guatemala",
+                         "gg" => "Guernsey",
+                         "gn" => "Guinea",
+                         "gw" => "Guinea-Bissau",
+                         "gy" => "Guyana",
+                         "ht" => "Haiti",
+                         "hm" => "Heard Island and McDonald Islands",
+                         "va" => "Holy See",
+                         "hn" => "Honduras",
+                         "hk" => "Hong Kong",
+                         "hu" => "Hungary",
+                         "is" => "Iceland",
+                         "in" => "India",
+                         "id" => "Indonesia",
+                         "ir" => "Iran (Islamic Republic of)",
+                         "iq" => "Iraq",
+                         "ie" => "Ireland",
+                         "im" => "Isle of Man",
+                         "il" => "Israel",
+                         "it" => "Italy",
+                         "jm" => "Jamaica",
+                         "jp" => "Japan",
+                         "je" => "Jersey",
+                         "jo" => "Jordan",
+                         "kz" => "Kazakhstan",
+                         "ke" => "Kenya",
+                         "ki" => "Kiribati",
+                         "kp" => "Korea (Democratic People's Republic of)",
+                         "kr" => "Korea (Republic of)",
+                         "kw" => "Kuwait",
+                         "kg" => "Kyrgyzstan",
+                         "la" => "Lao People's Democratic Republic",
+                         "lv" => "Latvia",
+                         "lb" => "Lebanon",
+                         "ls" => "Lesotho",
+                         "lr" => "Liberia",
+                         "ly" => "Libya",
+                         "li" => "Liechtenstein",
+                         "lt" => "Lithuania",
+                         "lu" => "Luxembourg",
+                         "mo" => "Macao",
+                         "mk" => "Macedonia (the former Yugoslav Republic of)",
+                         "mg" => "Madagascar",
+                         "mw" => "Malawi",
+                         "my" => "Malaysia",
+                         "mv" => "Maldives",
+                         "ml" => "Mali",
+                         "mt" => "Malta",
+                         "mh" => "Marshall Islands",
+                         "mq" => "Martinique",
+                         "mr" => "Mauritania",
+                         "mu" => "Mauritius",
+                         "yt" => "Mayotte",
+                         "mx" => "Mexico",
+                         "fm" => "Micronesia (Federated States of)",
+                         "md" => "Moldova (Republic of)",
+                         "mc" => "Monaco",
+                         "mn" => "Mongolia",
+                         "me" => "Montenegro",
+                         "ms" => "Montserrat",
+                         "ma" => "Morocco",
+                         "mz" => "Mozambique",
+                         "mm" => "Myanmar",
+                         "na" => "Namibia",
+                         "nr" => "Nauru",
+                         "np" => "Nepal",
+                         "nl" => "Netherlands",
+                         "nc" => "New Caledonia",
+                         "nz" => "New Zealand",
+                         "ni" => "Nicaragua",
+                         "ne" => "Niger",
+                         "ng" => "Nigeria",
+                         "nu" => "Niue",
+                         "nf" => "Norfolk Island",
+                         "mp" => "Northern Mariana Islands",
+                         "no" => "Norway",
+                         "om" => "Oman",
+                         "pk" => "Pakistan",
+                         "pw" => "Palau",
+                         "ps" => "Palestine, State of",
+                         "pa" => "Panama",
+                         "pg" => "Papua New Guinea",
+                         "py" => "Paraguay",
+                         "pe" => "Peru",
+                         "ph" => "Philippines",
+                         "pn" => "Pitcairn",
+                         "pl" => "Poland",
+                         "pt" => "Portugal",
+                         "pr" => "Puerto Rico",
+                         "qa" => "Qatar",
+                         "re" => "Réunion",
+                         "ro" => "Romania",
+                         "ru" => "Russian Federation",
+                         "rw" => "Rwanda",
+                         "bl" => "Saint Barthélemy",
+                         "sh" => "Saint Helena, Ascension and Tristan da Cunha",
+                         "kn" => "Saint Kitts and Nevis",
+                         "lc" => "Saint Lucia",
+                         "mf" => "Saint Martin (French part)",
+                         "pm" => "Saint Pierre and Miquelon",
+                         "vc" => "Saint Vincent and the Grenadines",
+                         "ws" => "Samoa",
+                         "sm" => "San Marino",
+                         "st" => "Sao Tome and Principe",
+                         "sa" => "Saudi Arabia",
+                         "sn" => "Senegal",
+                         "rs" => "Serbia",
+                         "sc" => "Seychelles",
+                         "sl" => "Sierra Leone",
+                         "sg" => "Singapore",
+                         "sx" => "Sint Maarten (Dutch part)",
+                         "sk" => "Slovakia",
+                         "si" => "Slovenia",
+                         "sb" => "Solomon Islands",
+                         "so" => "Somalia",
+                         "za" => "South Africa",
+                         "gs" => "South Georgia and South Sandwich Islands",
+                         "ss" => "South Sudan",
+                         "es" => "Spain",
+                         "lk" => "Sri Lanka",
+                         "sd" => "Sudan",
+                         "sr" => "Suriname",
+                         "sj" => "Svalbard and Jan Mayen",
+                         "sz" => "Swaziland",
+                         "se" => "Sweden",
+                         "ch" => "Switzerland",
+                         "sy" => "Syrian Arab Republic",
+                         "tw" => "Taiwan",
+                         "tj" => "Tajikistan",
+                         "tz" => "Tanzania, United Republic of",
+                         "th" => "Thailand",
+                         "tl" => "Timor-Leste",
+                         "tg" => "Togo",
+                         "tk" => "Tokelau",
+                         "to" => "Tonga",
+                         "tt" => "Trinidad and Tobago",
+                         "tn" => "Tunisia",
+                         "tr" => "Turkey",
+                         "tm" => "Turkmenistan",
+                         "tc" => "Turks and Caicos Islands",
+                         "tv" => "Tuvalu",
+                         "ug" => "Uganda",
+                         "ua" => "Ukraine",
+                         "ae" => "United Arab Emirates",
+                         "gb" => "United Kingdom",
+                         "us" => "United States of America",
+                         "um" => "United States Minor Outlying Islands",
+                         "uy" => "Uruguay",
+                         "uz" => "Uzbekistan",
+                         "vu" => "Vanuatu",
+                         "ve" => "Venezuela (Bolivarian Republic of)",
+                         "vn" => "Viet Nam",
+                         "vg" => "Virgin Islands (British)",
+                         "vi" => "Virgin Islands (U.S.)",
+                         "wf" => "Wallis and Futuna",
+                         "eh" => "Western Sahara",
+                         "ye" => "Yemen",
+                         "zm" => "Zambia",
+                         "zw" => "Zimbabwe"
+                     ]
+                 ],
+                 "region" => [
+                     "display" => "Region",
+                     "option" => [
+                         "any" => "Any region",
+                         "eu" => "European Union",
+                         "de" => "Germany",
+                         "fr" => "France",
+                         "uk" => "United Kingdom"
+                     ]
+                 ],
+                 "domain" => [
+                     "display" => "Results per domain",
+                     "option" => [
+                         "1" => "1 result",
+                         "2" => "2 results",
+                         "3" => "3 results",
+                         "4" => "4 results",
+                         "5" => "5 results",
+                         "10" => "10 results",
+                         "0" => "Unlimited",
+                     ]
+                 ]
+             ];
+
+         case "news":
+             return [];
+
+         default:
+             // unknown page: report "no filters" instead of falling out of
+             // the switch and implicitly returning null
+             return [];
+     }
+ }
+
+ /**
+  * Perform an HTTP GET request and return the response body.
+  *
+  * @param string $url Absolute URL to request.
+  * @param array  $get Query parameters. When non-empty they are encoded
+  *                    with http_build_query() and appended after a "?".
+  * @return string Response body.
+  * @throws Exception With the cURL error message when the transfer fails.
+  */
+ private function get($url, $get = []){
+
+     // NOTE(review): the User-Agent advertises rv:107.0 together with
+     // Firefox/110.0; genuine Firefox keeps both numbers equal. Left
+     // unchanged here, confirm whether the mismatch is intentional.
+     $headers = [
+         "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0",
+         "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+         "Accept-Language: en-US,en;q=0.5",
+         "Accept-Encoding: gzip",
+         "DNT: 1",
+         "Connection: keep-alive",
+         "Upgrade-Insecure-Requests: 1",
+         "Sec-Fetch-Dest: document",
+         "Sec-Fetch-Mode: navigate",
+         "Sec-Fetch-Site: none",
+         "Sec-Fetch-User: ?1"
+     ];
+
+     if($get !== []){
+
+         $url .= "?" . http_build_query($get);
+     }
+
+     $curlproc = curl_init();
+
+     curl_setopt($curlproc, CURLOPT_URL, $url);
+
+     curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
+     curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
+
+     curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
+     curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
+     curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
+     curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
+     curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+
+     $data = curl_exec($curlproc);
+
+     // read the error state before closing, then always release the
+     // handle: previously the handle was left open when throwing
+     $error = null;
+
+     if(curl_errno($curlproc)){
+
+         $error = curl_error($curlproc);
+     }elseif($data === false){
+
+         $error = "cURL request failed";
+     }
+
+     curl_close($curlproc);
+
+     if($error !== null){
+
+         throw new Exception($error);
+     }
+
+     return $data;
+ }
+
+ /**
+  * Run a Mojeek web search and parse the result page.
+  *
+  * @param array $get Request parameters. "npt" is a next-page token; when
+  *                   it is empty the keys "s" (search terms), "lang",
+  *                   "country", "region", "domain" and "focus" are read
+  *                   (see getfilters("web") for their values).
+  * @return array Result set with the keys "status", "spelling", "npt",
+  *               "answer", "web", "image", "video", "news" and "related".
+  *               Only "web", "answer", "news" and "npt" are filled in here.
+  * @throws Exception When the search term is empty or the page cannot be
+  *                   downloaded.
+  */
+ public function web($get){
+
+     if(!empty($get["npt"])){
+
+         // the stored token is the href of the previous page's "Next" link
+         $token = $this->nextpage->get($get["npt"], "web");
+
+         try{
+             $html = $this->get("https://www.mojeek.com" . $token, []);
+         }catch(Exception $error){
+
+             throw new Exception("Failed to get HTML");
+         }
+
+     }else{
+         $search = $get["s"];
+         $lang = $get["lang"];
+         $country = $get["country"];
+         $region = $get["region"];
+         $domain = $get["domain"];
+         $focus = $get["focus"];
+
+         if(strlen($search) === 0){
+
+             throw new Exception("Search term is empty!");
+         }
+
+         $params = [
+             "q" => $search,
+             "t" => 20, // number of results/page
+             "tn" => 7, // number of news results/page
+             "date" => 1, // show date
+             "tlen" => 128, // max length of title
+             "dlen" => 511, // max length of description
+             "arc" => ($country == "any" ? "none" : $country) // location. don't use autodetect!
+         ];
+
+         switch($focus){
+
+             case "any": break;
+
+             case "blogs":
+                 $params["fmt"] = "sst";
+                 $params["sst"] = "1";
+                 break;
+
+             default:
+                 $params["foc_t"] = $focus;
+                 break;
+         }
+
+         if($lang != "any"){
+
+             $params["lb"] = $lang;
+         }
+
+         if($region != "any"){
+
+             $params["reg"] = $region;
+         }
+
+         // "1" is not sent explicitly
+         if($domain != "1"){
+
+             $params["si"] = $domain;
+         }
+
+         try{
+             $html = $this->get("https://www.mojeek.com/search", $params);
+         }catch(Exception $error){
+
+             throw new Exception("Failed to get HTML");
+         }
+     }
+
+     $out = [
+         "status" => "ok",
+         "spelling" => [
+             "type" => "no_correction",
+             "using" => null,
+             "correction" => null
+         ],
+         "npt" => null,
+         "answer" => [],
+         "web" => [],
+         "image" => [],
+         "video" => [],
+         "news" => [],
+         "related" => []
+     ];
+
+     $this->fuckhtml->load($html);
+
+     $results =
+         $this->fuckhtml
+         ->getElementsByClassName("results-standard", "ul");
+
+     if(count($results) === 0){
+
+         // no result list on the page: nothing else is parsed
+         return $out;
+     }
+
+     $this->fuckhtml->load($results[0]);
+
+     /*
+         Get search results
+     */
+     $results =
+         $this->fuckhtml
+         ->getElementsByTagName("li");
+
+     foreach($results as $result){
+
+         $data = [
+             "title" => null,
+             "description" => null,
+             "url" => null,
+             "date" => null,
+             "type" => "web",
+             "thumb" => [
+                 "url" => null,
+                 "ratio" => null
+             ],
+             "sublink" => [],
+             "table" => []
+         ];
+
+         $this->fuckhtml->load($result);
+
+         $title =
+             $this->fuckhtml
+             ->getElementsByClassName("title", "a");
+
+         if(count($title) === 0){
+
+             // list item without a title link: not a result we can use
+             continue;
+         }
+
+         $title = $title[0];
+
+         $data["title"] =
+             html_entity_decode(
+                 $this->fuckhtml
+                 ->getTextContent($title["innerHTML"])
+             );
+
+         $data["url"] =
+             html_entity_decode(
+                 $this->fuckhtml
+                 ->getTextContent($title["attributes"]["href"])
+             );
+
+         $description =
+             $this->fuckhtml
+             ->getElementsByClassName("s", "p");
+
+         if(count($description) !== 0){
+
+             $data["description"] =
+                 $this->titledots(
+                     html_entity_decode(
+                         $this->fuckhtml
+                         ->getTextContent($description[0])
+                     )
+                 );
+         }
+
+         // the date is the last " - " separated field of the second
+         // <p class="i"> element, when that element exists
+         $info =
+             $this->fuckhtml
+             ->getElementsByClassName("i", "p");
+
+         if(isset($info[1])){
+
+             $fields =
+                 explode(
+                     " - ",
+                     $this->fuckhtml
+                     ->getTextContent($info[1])
+                 );
+
+             $timestamp = strtotime($fields[count($fields) - 1]);
+
+             // strtotime() returns false for text that is not a date;
+             // keep the null default in that case
+             if($timestamp !== false){
+
+                 $data["date"] = $timestamp;
+             }
+         }
+
+         $out["web"][] = $data;
+     }
+
+     /*
+         Get instant answers
+     */
+     $this->fuckhtml->load($html);
+
+     $infoboxes =
+         $this->fuckhtml
+         ->getElementsByClassName(
+             "infobox infobox-top",
+             "div"
+         );
+
+     foreach($infoboxes as $infobox){
+
+         $answer = [
+             "title" => null,
+             "description" => [],
+             "url" => null,
+             "thumb" => null,
+             "table" => [],
+             "sublink" => []
+         ];
+
+         // <hr> splits the box into: [0] title + short definition,
+         // [1] thumbnail + description, [2] source link
+         $infobox_html =
+             explode(
+                 "<hr>",
+                 $infobox["innerHTML"]
+             );
+
+         $this->fuckhtml->load($infobox_html[0]);
+
+         // title
+         $heading =
+             $this->fuckhtml
+             ->getElementsByTagName("h1");
+
+         if(count($heading) !== 0){
+
+             $answer["title"] =
+                 $this->fuckhtml
+                 ->getTextContent($heading[0]);
+         }
+
+         // short definition
+         $definition =
+             $this->fuckhtml
+             ->getElementsByTagName("p");
+
+         if(count($definition) !== 0){
+
+             $answer["description"][] = [
+                 "type" => "quote",
+                 "value" =>
+                     $this->fuckhtml
+                     ->getTextContent($definition[0])
+             ];
+         }
+
+         $ps = [];
+
+         if(isset($infobox_html[1])){
+
+             // get thumbnail, if it exists
+             $this->fuckhtml->load($infobox_html[1]);
+
+             $thumb =
+                 $this->fuckhtml
+                 ->getElementsByClassName("float-right", "img");
+
+             if(count($thumb) !== 0){
+
+                 preg_match(
+                     '/\/image\?img=([^&]+)/i',
+                     $thumb[0]["attributes"]["src"],
+                     $thumb
+                 );
+
+                 if(count($thumb) === 2){
+
+                     // the image URL travels as a query parameter, so
+                     // decode it the same way news() does
+                     $answer["thumb"] =
+                         urldecode(
+                             $this->fuckhtml
+                             ->getTextContent($thumb[1])
+                         );
+                 }
+             }
+
+             // get description
+             $ps =
+                 $this->fuckhtml
+                 ->getElementsByTagName("p");
+         }
+
+         foreach($ps as $p){
+
+             $this->fuckhtml->load($p);
+
+             if(
+                 preg_match(
+                     '/^\s*<strong>/i',
+                     $p["innerHTML"]
+                 )
+             ){
+
+                 /*
+                     Parse table: "<strong>Label:</strong> value"
+                 */
+                 $strong =
+                     $this->fuckhtml
+                     ->getElementsByTagName("strong")[0];
+
+                 $p["innerHTML"] =
+                     str_replace($strong["innerHTML"], "", $p["innerHTML"]);
+
+                 // label: capitalised, without the trailing colon
+                 $strong =
+                     preg_replace(
+                         '/:$/',
+                         "",
+                         ucfirst(
+                             $this->fuckhtml
+                             ->getTextContent($strong)
+                         )
+                     );
+
+                 $answer["table"][trim($strong)] =
+                     trim(
+                         $this->fuckhtml
+                         ->getTextContent($p)
+                     );
+
+                 continue;
+             }
+
+             $as =
+                 $this->fuckhtml
+                 ->getElementsByClassName("svg-icon");
+
+             if(count($as) !== 0){
+
+                 /*
+                     Parse websites
+                 */
+                 foreach($as as $a){
+
+                     // the text after the first space of the class
+                     // attribute names the website
+                     $classes = explode(" ", $a["attributes"]["class"], 2);
+                     $site = isset($classes[1]) ? $classes[1] : "";
+
+                     $answer["sublink"][ucfirst($site)] =
+                         $this->fuckhtml
+                         ->getTextContent($a["attributes"]["href"]);
+                 }
+
+                 continue;
+             }
+
+             /*
+                 Parse text content: walk the child tags in order and
+                 emit the text found in front of each one
+             */
+             $tags =
+                 $this->fuckhtml
+                 ->getElementsByTagName("*");
+
+             $i = 0;
+             foreach($tags as $tag){
+
+                 $c = count($answer["description"]);
+
+                 // remove tag from innerHTML
+                 $p["innerHTML"] =
+                     explode($tag["outerHTML"], $p["innerHTML"], 2);
+
+                 if(count($p["innerHTML"]) === 2){
+
+                     // first fragment of a paragraph that follows a link
+                     // entry: start it on a new paragraph
+                     if(
+                         $i === 0 &&
+                         $c !== 0 &&
+                         $answer["description"][$c - 1]["type"] == "link"
+                     ){
+
+                         $append = "\n\n";
+                     }else{
+
+                         $append = "";
+                     }
+
+                     if($p["innerHTML"][0] != ""){
+                         $answer["description"][] = [
+                             "type" => "text",
+                             "value" => $append . trim($p["innerHTML"][0])
+                         ];
+                     }
+
+                     $p["innerHTML"] = $p["innerHTML"][1];
+                 }else{
+
+                     $p["innerHTML"] = $p["innerHTML"][0];
+                 }
+
+                 switch($tag["tagName"]){
+
+                     case "a":
+
+                         $value =
+                             $this->fuckhtml
+                             ->getTextContent($tag);
+
+                         // the "Wikipedia" link is not emitted as a link entry
+                         if(strtolower($value) == "wikipedia"){
+
+                             if($c !== 0){
+                                 $answer["description"][$c - 1]["value"] =
+                                     rtrim($answer["description"][$c - 1]["value"]);
+                             }
+                             break;
+                         }
+
+                         $answer["description"][] = [
+                             "type" => "link",
+                             "url" =>
+                                 $this->fuckhtml
+                                 ->getTextContent($tag["attributes"]["href"]),
+                             "value" => $value
+                         ];
+                         break;
+                 }
+
+                 $i++;
+             }
+         }
+
+         // get URL
+         if(isset($infobox_html[2])){
+
+             $this->fuckhtml->load($infobox_html[2]);
+
+             $source =
+                 $this->fuckhtml
+                 ->getElementsByTagName("a");
+
+             if(isset($source[0]["attributes"]["href"])){
+
+                 $answer["url"] =
+                     $this->fuckhtml
+                     ->getTextContent($source[0]["attributes"]["href"]);
+             }
+         }
+
+         // append answer
+         $out["answer"][] = $answer;
+     }
+
+     /*
+         Get news
+     */
+     $this->fuckhtml->load($html);
+
+     $news =
+         $this->fuckhtml
+         ->getElementsByClassName(
+             "results news-results",
+             "div"
+         );
+
+     if(count($news) !== 0){
+
+         $this->fuckhtml->load($news[0]);
+
+         $lis =
+             $this->fuckhtml
+             ->getElementsByTagName("li");
+
+         foreach($lis as $li){
+
+             $this->fuckhtml->load($li);
+
+             $a =
+                 $this->fuckhtml
+                 ->getElementsByClassName(
+                     "ob",
+                     "a"
+                 );
+
+             if(count($a) === 0){
+
+                 continue;
+             }
+
+             $a = $a[0];
+
+             // the date is whatever follows the first " - " in the
+             // first <span>; stays null when absent or unparsable
+             $date = null;
+
+             $span =
+                 $this->fuckhtml
+                 ->getElementsByTagName("span");
+
+             if(count($span) !== 0){
+
+                 $fields =
+                     explode(
+                         " - ",
+                         $this->fuckhtml
+                         ->getTextContent($span[0]),
+                         2
+                     );
+
+                 if(isset($fields[1])){
+
+                     $timestamp = strtotime($fields[1]);
+
+                     if($timestamp !== false){
+
+                         $date = $timestamp;
+                     }
+                 }
+             }
+
+             $out["news"][] = [
+                 "title" =>
+                     html_entity_decode(
+                         $this->fuckhtml
+                         ->getTextContent($a)
+                     ),
+                 "description" => null,
+                 "date" => $date,
+                 "thumb" => [
+                     "url" => null,
+                     "ratio" => null
+                 ],
+                 "url" =>
+                     $this->fuckhtml
+                     ->getTextContent($a["attributes"]["href"])
+             ];
+         }
+     }
+
+     /*
+         Get next page
+     */
+     $this->fuckhtml->load($html);
+
+     $pagination =
+         $this->fuckhtml
+         ->getElementsByClassName("pagination");
+
+     // count() never returns false, so the previous "!== false" test was
+     // always true and indexed an empty array on pages without pagination
+     if(count($pagination) !== 0){
+
+         $this->fuckhtml->load($pagination[0]);
+         $as =
+             $this->fuckhtml
+             ->getElementsByTagName("a");
+
+         foreach($as as $a){
+
+             if($a["innerHTML"] == "Next"){
+
+                 $out["npt"] = $this->nextpage->store(
+                     $this->fuckhtml
+                     ->getTextContent($a["attributes"]["href"]),
+                     "web"
+                 );
+             }
+         }
+     }
+
+     return $out;
+ }
+
+ /**
+  * Run a Mojeek news search and parse the result page.
+  *
+  * @param array $get Request parameters; only "s" (search terms) is read.
+  * @return array Result set with the keys "status", "npt" (always null
+  *               here, no pagination is parsed) and "news".
+  * @throws Exception When the search term is empty or the page cannot be
+  *                   downloaded.
+  */
+ public function news($get){
+
+     $search = $get["s"];
+
+     if(strlen($search) === 0){
+
+         throw new Exception("Search term is empty!");
+     }
+
+     $out = [
+         "status" => "ok",
+         "npt" => null,
+         "news" => []
+     ];
+
+     try{
+         $html =
+             $this->get(
+                 "https://www.mojeek.com/search",
+                 [
+                     "q" => $search,
+                     "fmt" => "news"
+                 ]
+             );
+     }catch(Exception $error){
+
+         throw new Exception("Failed to get HTML");
+     }
+
+     /*
+         Get the nodes of the extended and the standard result lists
+     */
+     foreach(
+         [
+             "results-extended",
+             "results-standard"
+         ]
+         as $categoryname
+     ){
+
+         // the parser was pointed at a single node by the previous pass,
+         // so reload the whole page before each lookup
+         $this->fuckhtml->load($html);
+
+         $categories =
+             $this->fuckhtml
+             ->getElementsByClassName(
+                 $categoryname,
+                 "ul"
+             );
+
+         foreach($categories as $category){
+
+             $this->fuckhtml->load($category);
+
+             $nodes =
+                 $this->fuckhtml
+                 ->getElementsByTagName("li");
+
+             foreach($nodes as $node){
+
+                 $data = [
+                     "title" => null,
+                     "author" => null,
+                     "description" => null,
+                     "date" => null,
+                     "thumb" =>
+                         [
+                             "url" => null,
+                             "ratio" => null
+                         ],
+                     "url" => null
+                 ];
+
+                 /*
+                     Parse the results
+                 */
+                 $this->fuckhtml->load($node);
+
+                 // get title + url
+                 $a =
+                     $this->fuckhtml
+                     ->getElementsByTagName("a");
+
+                 if(count($a) === 0){
+
+                     // list item without any link: not an article
+                     continue;
+                 }
+
+                 $a = $a[0];
+
+                 $data["title"] =
+                     $this->fuckhtml
+                     ->getTextContent($a["attributes"]["title"]);
+
+                 $data["url"] =
+                     $this->fuckhtml
+                     ->getTextContent($a["attributes"]["href"]);
+
+                 // get image
+                 $image =
+                     $this->fuckhtml
+                     ->getElementsByTagName("img");
+
+                 if(count($image) !== 0){
+
+                     // strip the "/image?img=" prefix and decode the
+                     // original image URL that follows it
+                     $data["thumb"] = [
+                         "url" =>
+                             urldecode(
+                                 str_replace(
+                                     "/image?img=",
+                                     "",
+                                     $this->fuckhtml
+                                     ->getTextContent(
+                                         $image[0]["attributes"]["src"]
+                                     )
+                                 )
+                             ),
+                         "ratio" => "16:9"
+                     ];
+                 }
+
+                 // get description
+                 $description =
+                     $this->fuckhtml
+                     ->getElementsByClassName("s", "p");
+
+                 if(count($description) !== 0){
+
+                     $data["description"] =
+                         $this->titledots(
+                             $this->fuckhtml
+                             ->getTextContent($description[0])
+                         );
+                 }
+
+                 // get date + time
+                 $date =
+                     $this->fuckhtml
+                     ->getElementsByClassName(
+                         "date",
+                         "p"
+                     );
+
+                 $i =
+                     $this->fuckhtml
+                     ->getElementsByClassName("i", "p");
+
+                 if(count($date) !== 0){
+
+                     // we're inside a big node
+                     $timestamp = strtotime($date[0]["innerHTML"]);
+
+                     if(count($i) !== 0){
+
+                         $this->fuckhtml->load($i[0]);
+
+                         $a =
+                             $this->fuckhtml
+                             ->getElementsByTagName("a");
+
+                         if(count($a) !== 0){
+
+                             $data["author"] =
+                                 $this->fuckhtml
+                                 ->getTextContent($a[0]);
+                         }
+                     }
+                 }else{
+
+                     // we're inside a small node: "author - date"
+                     $timestamp = false;
+
+                     if(count($i) !== 0){
+
+                         $i =
+                             explode(
+                                 " - ",
+                                 $this->fuckhtml
+                                 ->getTextContent($i[0])
+                             );
+
+                         // last field is the date, the rest is the author
+                         $timestamp = strtotime(array_pop($i));
+                         $data["author"] = implode(" - ", $i);
+                     }
+                 }
+
+                 // strtotime() returns false for unparsable text; keep
+                 // the null default instead of storing false
+                 if($timestamp !== false){
+
+                     $data["date"] = $timestamp;
+                 }
+
+                 $out["news"][] = $data;
+             }
+         }
+     }
+
+     return $out;
+ }
+
+ /**
+  * Strip dots and whitespace from both ends of a text snippet.
+  *
+  * @param string $title Text to clean up.
+  * @return string The text without leading/trailing dots, spaces, tabs,
+  *                line breaks, NUL bytes and vertical tabs.
+  */
+ private function titledots($title){
+
+     // trim()'s default whitespace set, extended with the dot
+     $strip = ". \t\n\r\0\x0B";
+     $cleaned = trim($title, $strip);
+
+     return $cleaned;
+ }
+}
+