summaryrefslogtreecommitdiff
path: root/scraper
diff options
context:
space:
mode:
Diffstat (limited to 'scraper')
-rw-r--r--scraper/googlealt.php5029
1 files changed, 5029 insertions, 0 deletions
diff --git a/scraper/googlealt.php b/scraper/googlealt.php
new file mode 100644
index 0000000..d7878cf
--- /dev/null
+++ b/scraper/googlealt.php
@@ -0,0 +1,5029 @@
+<?php
+
+// @TODO check for consent.google.com page, if need be
+
+class googlealt{
+
+ /**
+  * Set up the helpers this scraper relies on: an HTML parser stored in
+  * $this->fuckhtml and a backend helper, created for "googlealt", stored in
+  * $this->backend.
+  *
+  * include_once is used so the library files are evaluated at most once per
+  * request. Presumably they declare the fuckhtml and backend classes; if so,
+  * a plain include would fatally redeclare them as soon as a second instance
+  * is created or the file was already loaded elsewhere.
+  */
+ public function __construct(){
+
+     include_once "lib/fuckhtml.php";
+     $this->fuckhtml = new fuckhtml();
+
+     include_once "lib/backend.php";
+     $this->backend = new backend("googlealt");
+ }
+
+ /**
+  * Describe the search filters available for a given result page type.
+  *
+  * Every known page type gets the shared "country" and "nsfw" filters,
+  * followed by the page-specific ones. Each filter is an array with a
+  * "display" label and an "option" entry, which is either a value => label
+  * map or the string "_DATE" (presumably a marker telling the caller to
+  * render a date input; confirm against the frontend).
+  *
+  * @param string $page One of "web", "images", "videos" or "news".
+  * @return array|null Filter definitions keyed by filter name, or null when
+  *                    $page is not one of the known page types.
+  */
+ public function getfilters($page){
+
+     $base = [
+         "country" => [ // gl=<country> (image: cr=countryAF)
+             "display" => "Country",
+             "option" => [
+                 "any" => "Instance's country",
+                 "af" => "Afghanistan",
+                 "al" => "Albania",
+                 "dz" => "Algeria",
+                 "as" => "American Samoa",
+                 "ad" => "Andorra",
+                 "ao" => "Angola",
+                 "ai" => "Anguilla",
+                 "aq" => "Antarctica",
+                 "ag" => "Antigua and Barbuda",
+                 "ar" => "Argentina",
+                 "am" => "Armenia",
+                 "aw" => "Aruba",
+                 "au" => "Australia",
+                 "at" => "Austria",
+                 "az" => "Azerbaijan",
+                 "bs" => "Bahamas",
+                 "bh" => "Bahrain",
+                 "bd" => "Bangladesh",
+                 "bb" => "Barbados",
+                 "by" => "Belarus",
+                 "be" => "Belgium",
+                 "bz" => "Belize",
+                 "bj" => "Benin",
+                 "bm" => "Bermuda",
+                 "bt" => "Bhutan",
+                 "bo" => "Bolivia",
+                 "ba" => "Bosnia and Herzegovina",
+                 "bw" => "Botswana",
+                 "bv" => "Bouvet Island",
+                 "br" => "Brazil",
+                 "io" => "British Indian Ocean Territory",
+                 "bn" => "Brunei Darussalam",
+                 "bg" => "Bulgaria",
+                 "bf" => "Burkina Faso",
+                 "bi" => "Burundi",
+                 "kh" => "Cambodia",
+                 "cm" => "Cameroon",
+                 "ca" => "Canada",
+                 "cv" => "Cape Verde",
+                 "ky" => "Cayman Islands",
+                 "cf" => "Central African Republic",
+                 "td" => "Chad",
+                 "cl" => "Chile",
+                 "cn" => "China",
+                 "cx" => "Christmas Island",
+                 "cc" => "Cocos (Keeling) Islands",
+                 "co" => "Colombia",
+                 "km" => "Comoros",
+                 "cg" => "Congo",
+                 "cd" => "Congo, the Democratic Republic",
+                 "ck" => "Cook Islands",
+                 "cr" => "Costa Rica",
+                 "ci" => "Cote D'ivoire",
+                 "hr" => "Croatia",
+                 "cu" => "Cuba",
+                 "cy" => "Cyprus",
+                 "cz" => "Czech Republic",
+                 "dk" => "Denmark",
+                 "dj" => "Djibouti",
+                 "dm" => "Dominica",
+                 "do" => "Dominican Republic",
+                 "ec" => "Ecuador",
+                 "eg" => "Egypt",
+                 "sv" => "El Salvador",
+                 "gq" => "Equatorial Guinea",
+                 "er" => "Eritrea",
+                 "ee" => "Estonia",
+                 "et" => "Ethiopia",
+                 "fk" => "Falkland Islands (Malvinas)",
+                 "fo" => "Faroe Islands",
+                 "fj" => "Fiji",
+                 "fi" => "Finland",
+                 "fr" => "France",
+                 "gf" => "French Guiana",
+                 "pf" => "French Polynesia",
+                 "tf" => "French Southern Territories",
+                 "ga" => "Gabon",
+                 "gm" => "Gambia",
+                 "ge" => "Georgia",
+                 "de" => "Germany",
+                 "gh" => "Ghana",
+                 "gi" => "Gibraltar",
+                 "gr" => "Greece",
+                 "gl" => "Greenland",
+                 "gd" => "Grenada",
+                 "gp" => "Guadeloupe",
+                 "gu" => "Guam",
+                 "gt" => "Guatemala",
+                 "gn" => "Guinea",
+                 "gw" => "Guinea-Bissau",
+                 "gy" => "Guyana",
+                 "ht" => "Haiti",
+                 "hm" => "Heard Island and Mcdonald Islands",
+                 "va" => "Holy See (Vatican City State)",
+                 "hn" => "Honduras",
+                 "hk" => "Hong Kong",
+                 "hu" => "Hungary",
+                 "is" => "Iceland",
+                 "in" => "India",
+                 "id" => "Indonesia",
+                 "ir" => "Iran, Islamic Republic",
+                 "iq" => "Iraq",
+                 "ie" => "Ireland",
+                 "il" => "Israel",
+                 "it" => "Italy",
+                 "jm" => "Jamaica",
+                 "jp" => "Japan",
+                 "jo" => "Jordan",
+                 "kz" => "Kazakhstan",
+                 "ke" => "Kenya",
+                 "ki" => "Kiribati",
+                 "kp" => "Korea, Democratic People's Republic",
+                 "kr" => "Korea, Republic",
+                 "kw" => "Kuwait",
+                 "kg" => "Kyrgyzstan",
+                 "la" => "Lao People's Democratic Republic",
+                 "lv" => "Latvia",
+                 "lb" => "Lebanon",
+                 "ls" => "Lesotho",
+                 "lr" => "Liberia",
+                 "ly" => "Libyan Arab Jamahiriya",
+                 "li" => "Liechtenstein",
+                 "lt" => "Lithuania",
+                 "lu" => "Luxembourg",
+                 "mo" => "Macao",
+                 "mk" => "Macedonia, the Former Yugoslav Republic",
+                 "mg" => "Madagascar",
+                 "mw" => "Malawi",
+                 "my" => "Malaysia",
+                 "mv" => "Maldives",
+                 "ml" => "Mali",
+                 "mt" => "Malta",
+                 "mh" => "Marshall Islands",
+                 "mq" => "Martinique",
+                 "mr" => "Mauritania",
+                 "mu" => "Mauritius",
+                 "yt" => "Mayotte",
+                 "mx" => "Mexico",
+                 "fm" => "Micronesia, Federated States",
+                 "md" => "Moldova, Republic",
+                 "mc" => "Monaco",
+                 "mn" => "Mongolia",
+                 "ms" => "Montserrat",
+                 "ma" => "Morocco",
+                 "mz" => "Mozambique",
+                 "mm" => "Myanmar",
+                 "na" => "Namibia",
+                 "nr" => "Nauru",
+                 "np" => "Nepal",
+                 "nl" => "Netherlands",
+                 "an" => "Netherlands Antilles",
+                 "nc" => "New Caledonia",
+                 "nz" => "New Zealand",
+                 "ni" => "Nicaragua",
+                 "ne" => "Niger",
+                 "ng" => "Nigeria",
+                 "nu" => "Niue",
+                 "nf" => "Norfolk Island",
+                 "mp" => "Northern Mariana Islands",
+                 "no" => "Norway",
+                 "om" => "Oman",
+                 "pk" => "Pakistan",
+                 "pw" => "Palau",
+                 "ps" => "Palestinian Territory, Occupied",
+                 "pa" => "Panama",
+                 "pg" => "Papua New Guinea",
+                 "py" => "Paraguay",
+                 "pe" => "Peru",
+                 "ph" => "Philippines",
+                 "pn" => "Pitcairn",
+                 "pl" => "Poland",
+                 "pt" => "Portugal",
+                 "pr" => "Puerto Rico",
+                 "qa" => "Qatar",
+                 "re" => "Reunion",
+                 "ro" => "Romania",
+                 "ru" => "Russian Federation",
+                 "rw" => "Rwanda",
+                 "sh" => "Saint Helena",
+                 "kn" => "Saint Kitts and Nevis",
+                 "lc" => "Saint Lucia",
+                 "pm" => "Saint Pierre and Miquelon",
+                 "vc" => "Saint Vincent and the Grenadines",
+                 "ws" => "Samoa",
+                 "sm" => "San Marino",
+                 "st" => "Sao Tome and Principe",
+                 "sa" => "Saudi Arabia",
+                 "sn" => "Senegal",
+                 "cs" => "Serbia and Montenegro",
+                 "sc" => "Seychelles",
+                 "sl" => "Sierra Leone",
+                 "sg" => "Singapore",
+                 "sk" => "Slovakia",
+                 "si" => "Slovenia",
+                 "sb" => "Solomon Islands",
+                 "so" => "Somalia",
+                 "za" => "South Africa",
+                 "gs" => "South Georgia and the South Sandwich Islands",
+                 "es" => "Spain",
+                 "lk" => "Sri Lanka",
+                 "sd" => "Sudan",
+                 "sr" => "Suriname",
+                 "sj" => "Svalbard and Jan Mayen",
+                 "sz" => "Swaziland",
+                 "se" => "Sweden",
+                 "ch" => "Switzerland",
+                 "sy" => "Syrian Arab Republic",
+                 "tw" => "Taiwan, Province of China",
+                 "tj" => "Tajikistan",
+                 "tz" => "Tanzania, United Republic",
+                 "th" => "Thailand",
+                 "tl" => "Timor-Leste",
+                 "tg" => "Togo",
+                 "tk" => "Tokelau",
+                 "to" => "Tonga",
+                 "tt" => "Trinidad and Tobago",
+                 "tn" => "Tunisia",
+                 "tr" => "Turkey",
+                 "tm" => "Turkmenistan",
+                 "tc" => "Turks and Caicos Islands",
+                 "tv" => "Tuvalu",
+                 "ug" => "Uganda",
+                 "ua" => "Ukraine",
+                 "ae" => "United Arab Emirates",
+                 "uk" => "United Kingdom",
+                 "us" => "United States",
+                 "um" => "United States Minor Outlying Islands",
+                 "uy" => "Uruguay",
+                 "uz" => "Uzbekistan",
+                 "vu" => "Vanuatu",
+                 "ve" => "Venezuela",
+                 "vn" => "Viet Nam",
+                 "vg" => "Virgin Islands, British",
+                 "vi" => "Virgin Islands, U.S.",
+                 "wf" => "Wallis and Futuna",
+                 "eh" => "Western Sahara",
+                 "ye" => "Yemen",
+                 "zm" => "Zambia",
+                 "zw" => "Zimbabwe"
+             ]
+         ],
+         "nsfw" => [
+             "display" => "NSFW",
+             "option" => [
+                 "yes" => "Yes", // safe=active
+                 "no" => "No" // safe=off
+             ]
+         ]
+     ];
+
+     // date range filters shared by the web, videos and news pages
+     $daterange = [
+         "newer" => [ // tbs
+             "display" => "Newer than",
+             "option" => "_DATE"
+         ],
+         "older" => [
+             "display" => "Older than",
+             "option" => "_DATE"
+         ]
+     ];
+
+     // array_merge keeps string keys in argument order, so the filters are
+     // returned in the order they are listed below
+     switch($page){
+
+         case "web":
+             return array_merge(
+                 $base,
+                 [
+                     "lang" => [ // lr=<lang> (prefix lang with "lang_")
+                         "display" => "Language",
+                         "option" => [
+                             "any" => "Any language",
+                             "ar" => "Arabic",
+                             "bg" => "Bulgarian",
+                             "ca" => "Catalan",
+                             "cs" => "Czech",
+                             "da" => "Danish",
+                             "de" => "German",
+                             "el" => "Greek",
+                             "en" => "English",
+                             "es" => "Spanish",
+                             "et" => "Estonian",
+                             "fi" => "Finnish",
+                             "fr" => "French",
+                             "hr" => "Croatian",
+                             "hu" => "Hungarian",
+                             "id" => "Indonesian",
+                             "is" => "Icelandic",
+                             "it" => "Italian",
+                             "iw" => "Hebrew",
+                             "ja" => "Japanese",
+                             "ko" => "Korean",
+                             "lt" => "Lithuanian",
+                             "lv" => "Latvian",
+                             "nl" => "Dutch",
+                             "no" => "Norwegian",
+                             "pl" => "Polish",
+                             "pt" => "Portuguese",
+                             "ro" => "Romanian",
+                             "ru" => "Russian",
+                             "sk" => "Slovak",
+                             "sl" => "Slovenian",
+                             "sr" => "Serbian",
+                             "sv" => "Swedish",
+                             "tr" => "Turkish",
+                             "zh-CN" => "Chinese (Simplified)",
+                             "zh-TW" => "Chinese (Traditional)"
+                         ]
+                     ]
+                 ],
+                 $daterange,
+                 [
+                     "spellcheck" => [
+                         "display" => "Spellcheck",
+                         "option" => [
+                             "yes" => "Yes",
+                             "no" => "No"
+                         ]
+                     ]
+                 ]
+             );
+
+         case "images":
+             return array_merge(
+                 $base,
+                 [
+                     "time" => [ // tbs=qdr:<time>
+                         "display" => "Time posted",
+                         "option" => [
+                             "any" => "Any time",
+                             "d" => "Past 24 hours",
+                             "w" => "Past week",
+                             "m" => "Past month",
+                             "y" => "Past year"
+                         ]
+                     ],
+                     "size" => [ // imgsz
+                         "display" => "Size",
+                         "option" => [
+                             "any" => "Any size",
+                             "l" => "Large",
+                             "m" => "Medium",
+                             "i" => "Icon",
+                             "qsvga" => "Larger than 400x300",
+                             "vga" => "Larger than 640x480",
+                             "svga" => "Larger than 800x600",
+                             "xga" => "Larger than 1024x768",
+                             "2mp" => "Larger than 2MP",
+                             "4mp" => "Larger than 4MP",
+                             "6mp" => "Larger than 6MP",
+                             "8mp" => "Larger than 8MP",
+                             "10mp" => "Larger than 10MP",
+                             "12mp" => "Larger than 12MP",
+                             "15mp" => "Larger than 15MP",
+                             "20mp" => "Larger than 20MP",
+                             "40mp" => "Larger than 40MP",
+                             "70mp" => "Larger than 70MP"
+                         ]
+                     ],
+                     "ratio" => [ // imgar
+                         "display" => "Aspect ratio",
+                         "option" => [
+                             "any" => "Any ratio",
+                             "t|xt" => "Tall",
+                             "s" => "Square",
+                             "w" => "Wide",
+                             "xw" => "Panoramic"
+                         ]
+                     ],
+                     "color" => [ // imgc
+                         "display" => "Color",
+                         "option" => [
+                             "any" => "Any color",
+                             "color" => "Full color",
+                             "bnw" => "Black & white",
+                             "trans" => "Transparent",
+                             // from here, imgcolor
+                             "red" => "Red",
+                             "orange" => "Orange",
+                             "yellow" => "Yellow",
+                             "green" => "Green",
+                             "teal" => "Teal",
+                             "blue" => "Blue",
+                             "purple" => "Purple",
+                             "pink" => "Pink",
+                             "white" => "White",
+                             "gray" => "Gray",
+                             "black" => "Black",
+                             "brown" => "Brown"
+                         ]
+                     ],
+                     "type" => [ // tbs=itp:<type>
+                         "display" => "Type",
+                         "option" => [
+                             "any" => "Any type",
+                             "clipart" => "Clip Art",
+                             "lineart" => "Line Drawing",
+                             "animated" => "Animated"
+                         ]
+                     ],
+                     "format" => [ // as_filetype
+                         "display" => "Format",
+                         "option" => [
+                             "any" => "Any format",
+                             "jpg" => "JPG",
+                             "gif" => "GIF",
+                             "png" => "PNG",
+                             "bmp" => "BMP",
+                             "svg" => "SVG",
+                             "webp" => "WEBP",
+                             "ico" => "ICO",
+                             "craw" => "RAW"
+                         ]
+                     ],
+                     "rights" => [ // tbs=sur:<rights>
+                         "display" => "Usage rights",
+                         "option" => [
+                             "any" => "Any license",
+                             "cl" => "Creative Commons licenses",
+                             "ol" => "Commercial & other licenses"
+                         ]
+                     ]
+                 ]
+             );
+
+         case "videos":
+             return array_merge(
+                 $base,
+                 $daterange,
+                 [
+                     "duration" => [
+                         "display" => "Duration",
+                         "option" => [
+                             "any" => "Any duration",
+                             "s" => "Short (0-4min)", // tbs=dur:s
+                             "m" => "Medium (4-20min)", // tbs=dur:m
+                             "l" => "Long (20+ min)" // tbs=dur:l
+                         ]
+                     ],
+                     "quality" => [
+                         "display" => "Quality",
+                         "option" => [
+                             "any" => "Any quality",
+                             "h" => "High quality" // tbs=hq:h
+                         ]
+                     ],
+                     "captions" => [
+                         "display" => "Captions",
+                         "option" => [
+                             "any" => "No preference",
+                             "yes" => "Closed captioned" // tbs=cc:1
+                         ]
+                     ]
+                 ]
+             );
+
+         case "news":
+             return array_merge(
+                 $base,
+                 $daterange,
+                 [
+                     "sort" => [
+                         "display" => "Sort",
+                         "option" => [
+                             "relevance" => "Relevance",
+                             "date" => "Date" // sbd:1
+                         ]
+                     ]
+                 ]
+             );
+     }
+
+     // unknown page type: no filters are defined for it
+     return null;
+ }
+
+ /**
+  * Perform an HTTP GET request with browser-like headers and return the
+  * response body.
+  *
+  * The request is forced onto IPv6 and HTTP/2, follows redirects, verifies
+  * the TLS peer and host, and uses 30 second connect and transfer timeouts.
+  *
+  * @param mixed  $proxy Passed unchanged to $this->backend->assign_proxy().
+  * @param string $url   Target URL.
+  * @param array  $get   Query parameters; encoded with http_build_query()
+  *                      and appended to $url when not empty.
+  * @return string|bool  Result of curl_exec(); the response body, since
+  *                      CURLOPT_RETURNTRANSFER is enabled.
+  * @throws Exception    With the cURL error message when cURL reports an
+  *                      error for the transfer.
+  */
+ private function get($proxy, $url, $get = []){
+
+     $headers = [
+         "User-Agent: " . config::USER_AGENT,
+         "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+         "Accept-Language: en-US,en;q=0.5",
+         "Accept-Encoding: gzip",
+         "DNT: 1",
+         //"Cookie: SOCS=CAESNQgCEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwMzE3LjA4X3AwGgJlbiAEGgYIgM7orwY",
+         "Connection: keep-alive",
+         "Upgrade-Insecure-Requests: 1",
+         "Sec-Fetch-Dest: document",
+         "Sec-Fetch-Mode: navigate",
+         "Sec-Fetch-Site: none",
+         "Sec-Fetch-User: ?1",
+         "Priority: u=1",
+         "TE: trailers"
+     ];
+
+     if($get !== []){
+
+         // extend an existing query string instead of adding a second "?"
+         $separator = strpos($url, "?") === false ? "?" : "&";
+         $url .= $separator . http_build_query($get);
+     }
+
+     $curlproc = curl_init();
+
+     try{
+
+         // options are set one by one on purpose: curl_setopt_array() stops
+         // at the first option that fails and would skip the remaining ones
+         curl_setopt($curlproc, CURLOPT_URL, $url);
+
+         curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
+         curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
+         curl_setopt($curlproc, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V6);
+
+         // use http2
+         curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+
+         curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
+         curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
+         curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
+         curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
+         curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+
+         // follow redirects
+         curl_setopt($curlproc, CURLOPT_FOLLOWLOCATION, true);
+
+         $this->backend->assign_proxy($curlproc, $proxy);
+
+         $data = curl_exec($curlproc);
+
+         if(curl_errno($curlproc)){
+
+             throw new Exception(curl_error($curlproc));
+         }
+
+         return $data;
+
+     }finally{
+
+         // release the handle on both the success and the error path
+         curl_close($curlproc);
+     }
+ }
+
+
+
+
+ private function parsepage($html, $pagetype, $search, $proxy, $params){
+
+ $out = [
+ "status" => "ok",
+ "spelling" => [
+ "type" => "no_correction",
+ "using" => null,
+ "correction" => null
+ ],
+ "npt" => null,
+ "answer" => [],
+ "web" => [],
+ "image" => [],
+ "video" => [],
+ "news" => [],
+ "related" => []
+ ];
+
+ $this->fuckhtml->load($html);
+
+ $this->detect_sorry();
+
+ // parse all <style> tags
+ $this->parsestyles();
+
+ // get javascript images
+ $this->scrape_dimg($html);
+
+ // get html blobs
+ preg_match_all(
+ '/function\(\){window\.jsl\.dh\(\'([^\']+?)\',\'(.+?[^\'])\'\);/',
+ $html,
+ $blobs
+ );
+
+ $this->blobs = [];
+ if(isset($blobs[1])){
+
+ for($i=0; $i<count($blobs[1]); $i++){
+
+ $this->blobs[$blobs[1][$i]] =
+ $this->fuckhtml
+ ->parseJsString(
+ $blobs[2][$i]
+ );
+ }
+ }
+
+ $this->scrape_imagearr($html);
+
+ //
+ // load result column
+ //
+ $result_div =
+ $this->fuckhtml
+ ->getElementById(
+ "center_col",
+ "div"
+ );
+
+ if($result_div === false){
+
+ throw new Exception("Failed to grep result div");
+ }
+
+ $this->fuckhtml->load($result_div);
+
+ //
+ // Get word corrections
+ //
+ $correction =
+ $this->fuckhtml
+ ->getElementById(
+ "fprs",
+ "p"
+ );
+
+ if($correction){
+
+ $this->fuckhtml->load($correction);
+
+ $a =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ $using =
+ $this->fuckhtml
+ ->getElementById(
+ "fprsl",
+ $a
+ );
+
+ if($using){
+
+ $using =
+ $this->fuckhtml
+ ->getTextContent(
+ $using
+ );
+
+ $spans =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "span"
+ );
+
+ $type_span =
+ $this->fuckhtml
+ ->getTextContent(
+ $spans[0]
+ );
+
+ $type = "not_many";
+
+ if(
+ stripos(
+ $type_span,
+ "Showing results for"
+ ) !== false
+ ){
+
+ $type = "including";
+ }
+
+ $correction =
+ $this->fuckhtml
+ ->getTextContent(
+ $a[count($a) - 1]
+ );
+
+ $out["spelling"] = [
+ "type" => $type,
+ "using" => $using,
+ "correction" => $correction
+ ];
+ }
+
+ // reset
+ $this->fuckhtml->load($result_div);
+ }else{
+
+ // get the "Did you mean?" prompt
+ $taw =
+ $this->fuckhtml
+ ->getElementById(
+ "taw"
+ );
+
+ if($taw){
+
+ $this->fuckhtml->load($taw);
+
+ $as =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ if(count($as) !== 0){
+
+ $text =
+ $this->fuckhtml
+ ->getTextContent(
+ $as[0]
+ );
+
+ // @TODO implement did_you_mean
+ $out["spelling"] = [
+ "type" => "including",
+ "using" => $search,
+ "correction" => $text
+ ];
+ }
+ }
+
+ $this->fuckhtml->load($result_div);
+ }
+
+ //
+ // get notices
+ //
+ $botstuff =
+ $this->fuckhtml
+ ->getElementById(
+ "botstuff"
+ );
+
+ // important for later
+ $last_page = false;
+
+ if($botstuff){
+
+ $this->fuckhtml->load($botstuff);
+
+ $cards =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "line-height" => "normal"
+ ]
+ ),
+ "div"
+ );
+
+ foreach($cards as $card){
+
+ $this->fuckhtml->load($card);
+
+ $h2 =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "h2"
+ );
+
+ if(count($h2) !== 0){
+
+ $title =
+ $this->fuckhtml
+ ->getTextContent(
+ $h2[0]
+ );
+
+ $card["innerHTML"] =
+ str_replace(
+ $h2[0]["outerHTML"],
+ "",
+ $card["innerHTML"]
+ );
+ }else{
+
+ $title = "Notice";
+ }
+
+ $description = [];
+
+ $as =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ if(count($as) !== 0){
+
+ $first = true;
+
+ foreach($as as $a){
+
+ $text_link =
+ $this->fuckhtml
+ ->getTextContent(
+ $a
+ );
+
+ if(stripos($text_link, "repeat the search") !== false){
+
+ $last_page = true;
+ break 2;
+ }
+
+ $parts =
+ explode(
+ $a["outerHTML"],
+ $card["innerHTML"],
+ 2
+ );
+
+ $card["innerHTML"] = $parts[1];
+
+ $value =
+ preg_replace(
+ '/ +/',
+ " ",
+ $this->fuckhtml
+ ->getTextContent(
+ $parts[0],
+ false,
+ false
+ )
+ );
+
+ if(strlen(trim($value)) !== 0){
+
+ $description[] = [
+ "type" => "text",
+ "value" => $value
+ ];
+
+ if($first){
+
+ $description[0]["value"] =
+ ltrim($description[0]["value"]);
+ }
+ }
+
+ $first = false;
+
+ $description[] = [
+ "type" => "link",
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $a["attributes"]
+ ["href"]
+ ),
+ "value" => $text_link
+ ];
+ }
+
+ $text =
+ $this->fuckhtml
+ ->getTextContent(
+ $card["innerHTML"],
+ false,
+ false
+ );
+
+ if(strlen(trim($text)) !== 0){
+
+ $description[] = [
+ "type" => "text",
+ "value" =>
+ rtrim(
+ $text
+ )
+ ];
+ }
+
+ }else{
+
+ // @TODO: Check if this ever gets populated without giving me garbage
+ /*
+ $text =
+ $this->fuckhtml
+ ->getTextContent(
+ $card
+ );
+
+ if($text != ""){
+ $description[] = [
+ "type" => "text",
+ "value" => $text
+ ];
+ }*/
+ }
+
+ if(count($description) !== 0){
+
+ $out["answer"][] = [
+ "title" => $title,
+ "description" => $description,
+ "url" => null,
+ "thumb" => null,
+ "table" => [],
+ "sublink" => []
+ ];
+ }
+ }
+
+ // reset
+ $this->fuckhtml->load($html);
+ }
+
+ //
+ // get "Related Searches" and "People also search for"
+ //
+ $relateds =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "wyccme",
+ "div"
+ );
+
+ foreach($relateds as $related){
+
+ $text =
+ $this->fuckhtml
+ ->getTextContent(
+ $related
+ );
+
+ if($text == "More results"){ continue; }
+
+ $out["related"][] = $text;
+ }
+
+ //
+ // Get text results
+ //
+ $results =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "g",
+ "div"
+ );
+
+ $this->skip_next = false;
+
+ foreach($results as $result){
+
+ if($this->skip_next){
+
+ $this->skip_next = false;
+ continue;
+ }
+
+ $this->fuckhtml->load($result);
+
+ $web = [
+ "title" => null,
+ "description" => null,
+ "url" => null,
+ "date" => null,
+ "type" => "web",
+ "thumb" => [
+ "url" => null,
+ "ratio" => null
+ ],
+ "sublink" => [],
+ "table" => []
+ ];
+
+ // Detect presence of sublinks
+ $g =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "g",
+ "div"
+ );
+
+ $sublinks = [];
+ if(count($g) > 0){
+
+ $table =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "table"
+ );
+
+ if(count($table) !== 0){
+
+ // found some sublinks!
+
+ $this->fuckhtml->load($table[0]);
+
+ $tds =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "td"
+ );
+
+ foreach($tds as $td){
+
+ $this->fuckhtml->load($td);
+
+ $a =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ if(
+ count($a) === 0 ||
+ (
+ isset($a[0]["attributes"]["class"]) &&
+ $a[0]["attributes"]["class"] == "fl"
+ )
+ ){
+
+ continue;
+ }
+
+ $td["innerHTML"] =
+ str_replace(
+ $a[0]["outerHTML"],
+ "",
+ $td["innerHTML"]
+ );
+
+ $web["sublink"][] = [
+ "title" =>
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $a[0]
+ )
+ ),
+ "description" =>
+ html_entity_decode(
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $td
+ )
+ )
+ ),
+ "url" =>
+ $this->unshiturl(
+ $a[0]
+ ["attributes"]
+ ["href"]
+ ),
+ "date" => null
+ ];
+ }
+
+ // reset
+ $this->fuckhtml->load($result);
+ }
+
+ // skip on next iteration
+ $this->skip_next = true;
+ }
+
+ // get title
+ $h3 =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "h3"
+ );
+
+ if(count($h3) === 0){
+
+ continue;
+ }
+
+ $web["title"] =
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $h3[0]
+ )
+ );
+
+ // get url
+ $as =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ $web["url"] =
+ $this->unshiturl(
+ $as[0]
+ ["attributes"]
+ ["href"]
+ );
+
+ if(
+ !preg_match(
+ '/^http/',
+ $web["url"]
+ )
+ ){
+
+ // skip if invalid url is found
+ continue;
+ }
+
+ //
+ // probe for twitter carousel
+ //
+ $carousel =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "g-scrolling-carousel"
+ );
+
+ if(count($carousel) !== 0){
+
+ $this->fuckhtml->load($carousel[0]);
+
+ $items =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "g-inner-card"
+ );
+
+ $has_thumbnail = false;
+
+ foreach($items as $item){
+
+ $this->fuckhtml->load($item);
+
+ if($has_thumbnail === false){
+
+ // get thumbnail
+ $thumb =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "img"
+ );
+
+ if(
+ count($thumb) !== 0 &&
+ isset($thumb[0]["attributes"]["id"])
+ ){
+
+ $web["thumb"] = [
+ "url" =>
+ $this->getdimg(
+ $thumb[0]["attributes"]["id"]
+ ),
+ "ratio" => "16:9"
+ ];
+
+ $has_thumbnail = true;
+ }
+
+ // or else, try getting a thumbnail from next container
+ }
+
+ // cache div
+ $div =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "div"
+ );
+
+ // get link
+ $links =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ // get description of carousel sublink
+ $description =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "role",
+ "heading",
+ $div
+ );
+
+ if(count($description) !== 0){
+
+ $description =
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $description[0]
+ )
+ );
+ }else{
+
+ $description = null;
+ }
+
+ $bottom =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "style",
+ "z-index:2",
+ $div
+ );
+
+ $title = null;
+ $date = null;
+ if(count($bottom) !== 0){
+
+ $this->fuckhtml->load($bottom[0]);
+
+ $spans =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "span"
+ );
+
+ $title =
+ $this->fuckhtml
+ ->getTextContent(
+ $spans[0]
+ );
+
+ $date =
+ strtotime(
+ $this->fuckhtml
+ ->getTextContent(
+ $spans[count($spans) - 1]
+ )
+ );
+ }
+
+ $web["sublink"][] = [
+ "title" => $title,
+ "description" => $description,
+ "url" =>
+ $this->unshiturl(
+ $links[0]
+ ["attributes"]
+ ["href"]
+ ),
+ "date" => $date
+ ];
+ }
+
+ $out["web"][] = $web;
+ continue;
+ }
+
+ //
+ // get viewcount, time posted and follower count from <cite> tag
+ //
+ $cite =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "cite"
+ );
+
+ if(count($cite) !== 0){
+
+ $this->fuckhtml->load($cite[0]);
+
+ $spans =
+ $this->fuckhtml
+ ->getElementsByTagName("span");
+
+ if(count($spans) === 0){
+
+ $cites =
+ explode(
+ "·",
+ $this->fuckhtml
+ ->getTextContent(
+ $cite[0]
+ )
+ );
+
+ foreach($cites as $cite){
+
+ $cite = trim($cite);
+
+ if(
+ preg_match(
+ '/(.+) (views|followers|likes)$/',
+ $cite,
+ $match
+ )
+ ){
+
+ $web["table"][ucfirst($match[2])] =
+ $match[1];
+ }elseif(
+ preg_match(
+ '/ago$/',
+ $cite
+ )
+ ){
+
+ $web["date"] =
+ strtotime($cite);
+ }
+ }
+ }
+
+ // reset
+ $this->fuckhtml->load($result);
+ }
+
+ //
+ // attempt to fetch description cleanly
+ //
+ $description =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "style",
+ "-webkit-line-clamp:2"
+ );
+
+ if(count($description) !== 0){
+
+ $web["description"] =
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $description[0]
+ )
+ );
+ }else{
+
+ // use ANOTHER method where the description is a header of the result
+ $description =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "data-attrid",
+ "wa:/description"
+ );
+
+ if(count($description) !== 0){
+
+ // get date off that shit
+ $date =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "font-size" => "12px",
+ "line-height" => "1.34",
+ "display" => "inline-block",
+ "font-family" => "google sans,arial,sans-serif",
+ "padding-right" => "0",
+ "white-space" => "nowrap"
+ ]
+ ),
+ "span"
+ );
+
+ if(count($date) !== 0){
+
+ $description[0]["innerHTML"] =
+ str_replace(
+ $date[0]["outerHTML"],
+ "",
+ $description[0]["innerHTML"]
+ );
+
+ $web["date"] =
+ strtotime(
+ $this->fuckhtml
+ ->getTextContent(
+ $date[0]
+ )
+ );
+ }
+
+ $web["description"] =
+ $this->fuckhtml
+ ->getTextContent(
+ $description[0]
+ );
+ }else{
+
+ // Yes.. You guessed it, use ANOTHER method to get descriptions
+ // off youtube containers
+ $description =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "-webkit-box-orient" => "vertical",
+ "display" => "-webkit-box",
+ "font-size" => "14px",
+ "-webkit-line-clamp" => "2",
+ "line-height" => "22px",
+ "overflow" => "hidden",
+ "word-break" => "break-word",
+ "color" => "#4d5156"
+ ]
+ ),
+ "div"
+ );
+
+ if(count($description) !== 0){
+
+ // check for video duration
+ $duration =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "background-color" => "rgba(0,0,0,0.6)",
+ "color" => "#fff",
+ "fill" => "#fff"
+ ]
+ ),
+ "div"
+ );
+
+ if(count($duration) !== 0){
+
+ $web["table"]["Duration"] =
+ $this->fuckhtml
+ ->getTextContent(
+ $duration[0]
+ );
+ }
+
+ $web["description"] =
+ $this->titledots(
+ html_entity_decode(
+ $this->fuckhtml
+ ->getTextContent(
+ $description[0]
+ )
+ )
+ );
+
+ // get author + time posted
+ $info =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "color" => "var(" . $this->getcolorvar("#70757a") . ")",
+ "font-size" => "14px",
+ "line-height" => "20px",
+ "margin-top" => "12px"
+ ]
+ ),
+ "div"
+ );
+
+ if(count($info) !== 0){
+
+ $info =
+ explode(
+ "·",
+ $this->fuckhtml
+ ->getTextContent(
+ $info[0]
+ )
+ );
+
+ switch(count($info)){
+
+ case 3:
+ $web["table"]["Author"] = trim($info[1]);
+ $web["date"] = strtotime(trim($info[2]));
+ break;
+
+ case 2:
+ $web["date"] = strtotime(trim($info[1]));
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ //
+ // get categories of content within the search result
+ //
+ $cats =
+ $this->fuckhtml
+ ->getElementsByAttributeName(
+ "data-sncf",
+ "div"
+ );
+
+ foreach($cats as $cat){
+
+ $this->fuckhtml->load($cat);
+
+ // detect image category
+ $images =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "img"
+ );
+
+ if(count($images) !== 0){
+
+ foreach($images as $image){
+
+ if(isset($image["attributes"]["id"])){
+ // we found an image
+
+ if(isset($image["attributes"]["width"])){
+
+ $width = (int)$image["attributes"]["width"];
+
+ if($width == 110){
+
+ $ratio = "1:1";
+ }elseif($width > 110){
+
+ $ratio = "16:9";
+ }else{
+
+ $ratio = "9:16";
+ }
+ }else{
+
+ $ratio = "1:1";
+ }
+
+ $web["thumb"] = [
+ "url" => $this->getdimg($image["attributes"]["id"]),
+ "ratio" => $ratio
+ ];
+
+ continue 2;
+ }
+ }
+ }
+
+ // Detect rating
+ $spans_unfiltered =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "span"
+ );
+
+ $spans =
+ $this->fuckhtml
+ ->getElementsByAttributeName(
+ "aria-label",
+ $spans_unfiltered
+ );
+
+ foreach($spans as $span){
+
+ if(
+ preg_match(
+ '/^Rated/',
+ $span["attributes"]["aria-label"]
+ )
+ ){
+
+ // found rating
+ // scrape rating
+ preg_match(
+ '/([0-9.]+).*([0-9.]+)/',
+ $span["attributes"]["aria-label"],
+ $rating
+ );
+
+ if(isset($rating[1])){
+
+ $web["table"]["Rating"] =
+ $rating[1] . "/" . $rating[2];
+ }
+
+ $has_seen_reviews = 0;
+ foreach($spans_unfiltered as $span_unfiltered){
+
+ if(
+ preg_match(
+ '/([0-9,.]+) +([A-z]+)$/',
+ $this->fuckhtml
+ ->getTextContent(
+ $span_unfiltered
+ ),
+ $votes
+ )
+ ){
+
+ $has_seen_reviews++;
+ $web["table"][ucfirst($votes[2])] = $votes[1];
+ continue;
+ }
+
+ $text =
+ $this->fuckhtml
+ ->getTextContent(
+ $span_unfiltered
+ );
+
+ if(
+ $text == "&nbsp;&nbsp;&nbsp;" ||
+ $text == ""
+ ){
+
+ break;
+ }
+
+ switch($has_seen_reviews){
+
+ case 1:
+ // scrape price
+ $web["table"]["Price"] = $text;
+ $has_seen_reviews++;
+ break;
+
+ case 2:
+ // scrape platform
+ $web["table"]["Platform"] = $text;
+ $has_seen_reviews++;
+ break;
+
+ case 3:
+ // Scrape type
+ $web["table"]["Medium"] = $text;
+ break;
+ }
+ }
+
+ continue 2;
+ }
+ }
+
+ // check if its a table of small sublinks
+ $table =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "display" => "table",
+ "white-space" => "nowrap",
+ "margin" => "5px 0",
+ "line-height" => "1.58",
+ "color" => "var(" . $this->getcolorvar("#70757a") . ")"
+ ]
+ ),
+ "div"
+ );
+
+ if(count($table) !== 0){
+
+ $this->fuckhtml->load($table[0]);
+
+ $rows =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "display" => "flex",
+ "white-space" => "normal"
+ ]
+ ),
+ "div"
+ );
+
+ foreach($rows as $row){
+
+ $this->fuckhtml->load($row);
+
+ $sublink = [
+ "title" => null,
+ "description" => null,
+ "url" => null,
+ "date" => null
+ ];
+
+ $link =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ )[0];
+
+ $sublink["title"] =
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $link
+ )
+ );
+
+ $sublink["url"] =
+ $this->unshiturl(
+ $link
+ ["attributes"]
+ ["href"]
+ );
+
+ $row["innerHTML"] =
+ str_replace(
+ $link["outerHTML"],
+ "",
+ $row["innerHTML"]
+ );
+
+ $this->fuckhtml->load($row);
+
+ $spans =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "span"
+ );
+
+ foreach($spans as $span){
+
+ $text =
+ $this->fuckhtml
+ ->getTextContent(
+ $span
+ );
+
+ if(
+ preg_match(
+ '/answers?$/',
+ $text
+ )
+ ){
+
+ $sublink["description"] =
+ $text;
+
+ continue;
+ }
+
+ $time = strtotime($text);
+
+ if($time !== false){
+
+ $sublink["date"] = $time;
+ }
+ }
+
+ $web["sublink"][] = $sublink;
+ }
+
+ // reset
+ $this->fuckhtml->load($cat);
+ continue;
+ }
+
+ // check if its an answer header
+ $answer_header =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "overflow" => "hidden",
+ "text-overflow" => "ellipsis"
+ ]
+ ),
+ "span"
+ );
+
+ if(count($answer_header) !== 0){
+
+ $link =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ $cat["innerHTML"] =
+ str_replace(
+ $link[0]["outerHTML"],
+ "",
+ $cat["innerHTML"]
+ );
+
+ $web["sublink"][] = [
+ "title" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $link[0]
+ ),
+ "description" =>
+ $this->titledots(
+ trim(
+ str_replace(
+ "\xc2\xa0",
+ " ",
+ html_entity_decode(
+ $this->fuckhtml
+ ->getTextContent(
+ $cat
+ )
+ )
+ ),
+ " ·"
+ )
+ ),
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $link[0]
+ ["attributes"]
+ ["href"]
+ ),
+ "date" => null
+ ];
+
+ continue;
+ }
+
+ // check if its list of small sublinks
+ $urls =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ if(count($urls) !== 0){
+
+ // found small links
+ foreach($urls as $url){
+
+ $target =
+ $this->fuckhtml
+ ->getTextContent(
+ $url
+ ["attributes"]
+ ["href"]
+ );
+
+ if(
+ !preg_match(
+ '/^http/',
+ $target
+ )
+ ){
+
+ continue;
+ }
+
+ $web["sublink"][] = [
+ "title" =>
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $url
+ )
+ ),
+ "description" => null,
+ "url" => $target,
+ "date" => null
+ ];
+ }
+
+ continue;
+ }
+
+ // we probed everything, assume this is the description
+ // if we didn't find one cleanly previously
+ if($web["description"] === null){
+ $web["description"] =
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $cat
+ )
+ );
+ }
+ }
+
+ // check if description contains date
+ $description = explode("—", $web["description"], 2);
+
+ if(
+ count($description) === 2 &&
+ strlen($description[0]) <= 20
+ ){
+
+ $date = strtotime($description[0]);
+
+ if($date !== false){
+
+ $web["date"] = $date;
+ $web["description"] = ltrim($description[1]);
+ }
+ }
+
+ // fetch youtube thumbnail
+ $thumbnail =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "border-radius" => "8px",
+ "height" => "fit-content",
+ "justify-content" => "center",
+ "margin-right" => "20px",
+ "margin-top" => "4px",
+ "position" => "relative",
+ "width" => "fit-content"
+ ]
+ ),
+ "div"
+ );
+
+ if(count($thumbnail) !== 0){
+
+ // load thumbnail container
+ $this->fuckhtml->load($thumbnail[0]);
+
+ $image =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "img"
+ );
+
+ if(
+ count($image) !== 0 &&
+ isset($image[0]["attributes"]["id"])
+ ){
+
+ $web["thumb"] = [
+ "url" =>
+ $this->unshit_thumb(
+ $this->getdimg(
+ $image[0]["attributes"]["id"]
+ )
+ ),
+ "ratio" => "16:9"
+ ];
+ }
+
+ // reset
+ $this->fuckhtml->load($result);
+ }
+
+ $out["web"][] = $web;
+ }
+
+ // reset
+ $this->fuckhtml->load($result_div);
+
+ //
+ // Get instant answers
+ //
+ $answer_containers =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "padding-left" => "0px",
+ "padding-right" => "0px"
+ ]
+ ),
+ "div"
+ );
+
+ $date_class =
+ $this->getstyle(
+ [
+ "font-size" => "12px",
+ "line-height" => "1.34",
+ "display" => "inline-block",
+ "font-family" => "google sans,arial,sans-serif",
+ "padding-right" => "0",
+ "white-space" => "nowrap"
+ ]
+ );
+
+ foreach($answer_containers as $container){
+
+ $this->fuckhtml->load($container);
+
+ $web = [
+ "title" => null,
+ "description" => null,
+ "url" => null,
+ "date" => null,
+ "type" => "web",
+ "thumb" => [
+ "url" => null,
+ "ratio" => null
+ ],
+ "sublink" => [],
+ "table" => []
+ ];
+
+ $answers =
+ $this->fuckhtml
+ ->getElementsByAttributeName(
+ "aria-controls",
+ "div"
+ );
+
+ $item_insert_pos = 1;
+ foreach($answers as $answer){
+
+ $out["related"][] =
+ $this->fuckhtml
+ ->getTextContent(
+ $answer
+ );
+
+ if(
+ isset(
+ $this->blobs[
+ $answer
+ ["attributes"]
+ ["aria-controls"]
+ ]
+ )
+ ){
+
+ $this->fuckhtml->load(
+ $this->blobs[
+ $answer
+ ["attributes"]
+ ["aria-controls"]
+ ]
+ );
+
+ $divs =
+ $this->fuckhtml
+ ->getElementsByAttributeName(
+ "id",
+ "div"
+ );
+
+ foreach($divs as $div){
+
+ if(
+ !isset(
+ $this->blobs[
+ $div
+ ["attributes"]
+ ["id"]
+ ]
+ )
+ ){
+
+ continue;
+ }
+
+ $this->fuckhtml->load(
+ $this->blobs[
+ $div
+ ["attributes"]
+ ["id"]
+ ]
+ );
+
+ // get url
+ $as =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ if(count($as) !== 0){
+
+ $web["url"] =
+ $this->unshiturl(
+ $as[0]["attributes"]["href"]
+ );
+
+ // skip entries that redirect to a search
+ if(
+ !preg_match(
+ '/^http/',
+ $web["url"]
+ )
+ ){
+
+ continue 3;
+ }
+ }
+
+ // get title
+ $h3 =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "h3"
+ );
+
+ if(count($h3) !== 0){
+
+ $web["title"] =
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $h3[0]
+ )
+ );
+ }
+
+ $description =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "data-attrid",
+ "wa:/description",
+ "div"
+ );
+
+ if(count($description) !== 0){
+
+ // check for date
+ $this->fuckhtml->load($description[0]);
+
+ $date =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $date_class,
+ "span"
+ );
+
+ if(count($date) !== 0){
+
+ $description[0]["innerHTML"] =
+ str_replace(
+ $date[0]["outerHTML"],
+ "",
+ $description[0]["innerHTML"]
+ );
+
+ $web["date"] =
+ strtotime(
+ $this->fuckhtml
+ ->getTextContent(
+ $date[0]
+ )
+ );
+ }
+
+ $web["description"] =
+ ltrim(
+ $this->fuckhtml
+ ->getTextContent(
+ $description[0]
+ ),
+ ": "
+ );
+ }
+ }
+
+ foreach($out["web"] as $item){
+
+ if($item["url"] == $web["url"]){
+
+ continue 2;
+ }
+ }
+
+ array_splice($out["web"], $item_insert_pos, 0, [$web]);
+ $item_insert_pos++;
+ }
+ }
+ }
+
+ // reset
+ $this->fuckhtml->load($result_div);
+
+ //
+ // Scrape word definition
+ //
+ $definition_container =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "lr_container",
+ "div"
+ );
+
+ if(count($definition_container) !== 0){
+
+ $this->fuckhtml->load($definition_container[0]);
+
+ // get header
+ $header =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "data-attrid",
+ "EntryHeader",
+ "div"
+ );
+
+ if(count($header) !== 0){
+
+ $description = [];
+
+ $this->fuckhtml->load($header[0]);
+
+ $title_div =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "font-family" => "google sans,arial,sans-serif",
+ "font-size" => "28px",
+ "line-height" => "36px"
+ ]
+ )
+ );
+
+ if(count($title_div) !== 0){
+
+ $title =
+ $this->fuckhtml
+ ->getTextContent(
+ $title_div[0]
+ );
+ }else{
+
+ $title = "Word definition";
+ }
+
+ $subtext_div =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "font-family" => "arial,sans-serif",
+ "font-size" => "14px",
+ "line-height" => "22px"
+ ]
+ ),
+ "span"
+ );
+
+ if(count($subtext_div) !== 0){
+
+ $description[] = [
+ "type" => "quote",
+ "value" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $subtext_div[0]
+ )
+ ];
+ }
+
+ // get audio
+ $audio =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "audio"
+ );
+
+ if(count($audio) !== 0){
+
+ $this->fuckhtml->load($audio[0]);
+
+ $source =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "source"
+ );
+
+ if(count($source) !== 0){
+
+ $description[] = [
+ "type" => "audio",
+ "url" =>
+ preg_replace(
+ '/^\/\//',
+ "https://",
+ $this->fuckhtml
+ ->getTextContent(
+ $source[0]
+ ["attributes"]
+ ["src"]
+ )
+ )
+ ];
+ }
+
+ }
+
+ // remove header to avoid confusion
+ $definition_container[0]["innerHTML"] =
+ str_replace(
+ $header[0]["outerHTML"],
+ "",
+ $definition_container[0]["innerHTML"]
+ );
+
+ // reset
+ $this->fuckhtml->load($definition_container[0]);
+
+ $vmods =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "vmod",
+ "div"
+ );
+
+ foreach($vmods as $category){
+
+ if(
+ !isset(
+ $category
+ ["attributes"]
+ ["data-topic"]
+ ) ||
+ $category
+ ["attributes"]
+ ["class"] != "vmod"
+ ){
+
+ continue;
+ }
+
+ $this->fuckhtml->load($category);
+
+ // get category type
+ $type =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "i"
+ );
+
+ if(count($type) !== 0){
+
+ $description[] = [
+ "type" => "title",
+ "value" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $type[0]
+ )
+ ];
+ }
+
+ // get heading text
+ $headings =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "xpdxpnd",
+ "div"
+ );
+
+ foreach($headings as $heading){
+
+ $description[] = [
+ "type" => "quote",
+ "value" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $heading
+ )
+ ];
+ }
+
+ $definitions =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "data-attrid",
+ "SenseDefinition",
+ "div"
+ );
+
+ $i = 1;
+ $text = [];
+
+ foreach($definitions as $definition){
+
+ $text[] =
+ $i . ". " .
+ $this->fuckhtml
+ ->getTextContent(
+ $definition
+ );
+
+ $i++;
+ }
+
+ if(count($text) !== 0){
+
+ $description[] = [
+ "type" => "text",
+ "value" =>
+ implode("\n", $text)
+ ];
+ }
+ }
+
+ $out["answer"][] = [
+ "title" => $title,
+ "description" => $description,
+ "url" => null,
+ "thumb" => null,
+ "table" => [],
+ "sublink" => []
+ ];
+ }
+
+ // reset
+ $this->fuckhtml->load($result_div);
+ }
+
+ //
+ // scrape elements with a g-section-with-header
+ // includes: images, news carousels
+ //
+
+ $g_sections =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "g-section-with-header"
+ );
+
+ if(count($g_sections) !== 0){
+ foreach($g_sections as $g_section){
+
+ // parse elements with a g-section-with-header
+ $this->fuckhtml->load($g_section);
+
+ $div_title =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "a-no-hover-decoration",
+ "a"
+ );
+
+ if(count($div_title) !== 0){
+
+ // title detected, skip
+ continue;
+ }
+
+ // no title detected: detect news container
+ $news =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "outline-offset" => "-1px",
+ "display" => "flex",
+ "flex-direction" => "column",
+ "flex-grow" => "1"
+ ]
+ )
+ );
+
+ foreach($news as $new){
+
+ $this->fuckhtml->load($new);
+
+ $image =
+ $this->fuckhtml
+ ->getElementsByAttributeName(
+ "id",
+ "img"
+ );
+
+ if(
+ count($image) !== 0 &&
+ !(
+ isset($image[0]["attributes"]["style"]) &&
+ strpos(
+ $image[0]["attributes"]["style"],
+ "height:18px"
+ ) !== false
+ )
+ ){
+
+ $thumb = [
+ "url" =>
+ $this->getdimg(
+ $image[0]
+ ["attributes"]
+ ["id"]
+ ),
+ "ratio" => "1:1"
+ ];
+ }
+
+ $title =
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "role",
+ "heading",
+ "div"
+ )[0]
+ )
+ );
+
+ $date_div =
+ $this->fuckhtml
+ ->getElementsByAttributeName(
+ "style",
+ "div"
+ );
+
+ if(count($date_div) !== 0){
+
+ foreach($date_div as $div){
+
+ if(
+ strpos(
+ $div["attributes"]["style"],
+ "bottom:"
+ ) !== false
+ ){
+ $date =
+ strtotime(
+ $this->fuckhtml
+ ->getTextContent(
+ $div
+ )
+ );
+
+ break;
+ }
+ }
+ }else{
+
+ $date = null;
+ }
+
+ $out["news"][] = [
+ "title" => $title,
+ "description" => null,
+ "date" => $date,
+ "thumb" => $thumb,
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $new
+ ["attributes"]
+ ["href"]
+ )
+ ];
+ }
+ }
+
+ // reset
+ $this->fuckhtml->load($result_div);
+ }
+
+ //
+ // Parse images (carousel, left hand-side)
+ //
+ $image_carousels =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "id",
+ "media_result_group",
+ "div"
+ );
+
+ if(count($image_carousels) !== 0){
+
+ foreach($image_carousels as $image_carousel){
+
+ $this->fuckhtml->load($image_carousel);
+
+ // get related searches in image carousel
+ $relateds =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "display" => "inline-block",
+ "margin-right" => "6px",
+ "outline" => "none",
+ "padding" => "6px 0"
+ ],
+ "a"
+ )
+ );
+
+ foreach($relateds as $related){
+
+ if(!isset($related["innerHTML"])){
+
+ // found an image
+ continue;
+ }
+
+ $text =
+ $this->fuckhtml
+ ->getTextContent(
+ $related
+ );
+
+ if($text != ""){
+
+ $out["related"][] = $text;
+ }
+ }
+
+ $div =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "div"
+ );
+
+ // get loaded images
+ $images =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "ivg-i",
+ $div
+ );
+
+ foreach($images as $image){
+
+ $this->fuckhtml->load($image);
+
+ $img_tags =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "img"
+ );
+
+ if(
+ !isset($image["attributes"]["data-docid"]) ||
+ !isset($this->image_arr[$image["attributes"]["data-docid"]])
+ ){
+
+ continue;
+ }
+
+ // search for the right image tag
+ $image_tag = false;
+ foreach($img_tags as $img){
+
+ if(
+ isset(
+ $img
+ ["attributes"]
+ ["alt"]
+ ) &&
+ trim(
+ $img
+ ["attributes"]
+ ["alt"]
+ ) != ""
+ ){
+
+ $image_tag = $img;
+ break;
+ }
+ }
+
+ if($image_tag === false){
+
+ continue;
+ }
+
+ $out["image"][] = [
+ "title" =>
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $image_tag
+ ["attributes"]
+ ["alt"]
+ )
+ ),
+ "source" =>
+ $this->image_arr[
+ $image
+ ["attributes"]
+ ["data-docid"]
+ ],
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $image
+ ["attributes"]
+ ["data-lpage"]
+ )
+ ];
+ }
+
+ // get unloaded javascript images
+ $images_js_sel =
+ $this->fuckhtml
+ ->getElementsByAttributeName(
+ "id",
+ $div
+ );
+
+ $loaded = [];
+
+ foreach($images_js_sel as $sel){
+
+ if(
+ !isset($this->blobs[$sel["attributes"]["id"]]) ||
+ in_array((string)$sel["attributes"]["id"], $loaded, true)
+ ){
+
+ // not an unloaded javascript image
+ continue;
+ }
+
+ $loaded[] = $sel["attributes"]["id"];
+
+ // get yet another javascript component
+ $this->fuckhtml->load($this->blobs[$sel["attributes"]["id"]]);
+
+ // get js node: contains title & url
+ $js_node =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "div"
+ )[0];
+
+ if(!isset($this->blobs[$js_node["attributes"]["id"]])){
+
+ // did not find refer id
+ continue;
+ }
+
+ // load second javascript component
+ $this->fuckhtml->load($this->blobs[$js_node["attributes"]["id"]]);
+
+ // get title from image alt text.
+ // data-src from this image is cropped, ignore it..
+ $img =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "img"
+ )[0];
+
+ $out["image"][] = [
+ "title" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $img["attributes"]["alt"]
+ ),
+ "source" =>
+ $this->image_arr[
+ $js_node["attributes"]["data-docid"]
+ ],
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $js_node["attributes"]["data-lpage"]
+ )
+ ];
+ }
+ }
+
+ // reset
+ $this->fuckhtml->load($result_div);
+ }
+
+ //
+ // Parse videos
+ //
+ $this->fuckhtml->load($result_div);
+
+ $videos =
+ $this->fuckhtml
+ ->getElementsByAttributeName(
+ "data-vid",
+ "div"
+ );
+
+ foreach($videos as $video){
+
+ $this->fuckhtml->load($video);
+
+ // get url
+ $url =
+ $this->fuckhtml
+ ->getTextContent(
+ $video
+ ["attributes"]
+ ["data-surl"]
+ );
+
+ foreach($out["web"] as $link){
+
+ if($link["url"] == $url){
+
+ // ignore if we already have the video in $out["web"]
+ continue 2;
+ }
+ }
+
+ // get heading element
+ $heading =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "role",
+ "heading",
+ "div"
+ );
+
+ if(count($heading) === 0){
+
+ // no heading, fuck this.
+ continue;
+ }
+
+ // get thumbnail before loading heading object
+ $image =
+ $this->fuckhtml
+ ->getElementsByAttributeName(
+ "id",
+ "img"
+ );
+
+ if(count($image) !== 0){
+
+ $thumb = [
+ "url" => $this->getdimg($image[0]["attributes"]["id"]),
+ "ratio" => "16:9"
+ ];
+ }else{
+
+ $thumb = [
+ "url" => null,
+ "ratio" => null
+ ];
+ }
+
+ // get duration
+ $duration_div =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "border-radius" => "10px",
+ "font-family" => "arial,sans-serif-medium,sans-serif",
+ "font-size" => "12px",
+ "line-height" => "16px",
+ "padding-block" => "2px",
+ "padding-inline" => "8px"
+ ]
+ ),
+ "div"
+ );
+
+ if(count($duration_div) !== 0){
+
+ $duration =
+ $this->hms2int(
+ $this->fuckhtml
+ ->getTextContent(
+ $duration_div[0]
+ )
+ );
+ }else{
+
+ // check if its a livestream
+ $duration =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "background-color" => "#d93025",
+ "border-radius" => "10px",
+ "color" => "#fff",
+ "font-family" => "arial,sans-serif-medium,sans-serif",
+ "font-size" => "12px",
+ "line-height" => "16px",
+ "padding-block" => "2px",
+ "padding-inline" => "8px"
+ ]
+ ),
+ "span"
+ );
+
+ if(count($duration) !== 0){
+
+ $duration = "_LIVE";
+ }else{
+
+ $duration = null;
+ }
+ }
+
+ // load heading
+ $this->fuckhtml->load($heading[0]);
+
+ // get title
+ $title =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "font-family" => "arial,sans-serif",
+ "font-size" => "16px",
+ "font-weight" => "400",
+ "line-height" => "24px"
+ ]
+ ),
+ "div"
+ );
+
+ if(count($title) === 0){
+
+ // ?? no title
+ continue;
+ }
+
+ $title =
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $title[0]
+ )
+ );
+
+ // get date
+ $date_div =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "color" => "var(" . $this->getcolorvar("#70757a") . ")",
+ "font-size" => "14px"
+ ]
+ ),
+ "div"
+ );
+
+ if(count($date_div) !== 0){
+
+ $date = strtotime(
+ $this->fuckhtml
+ ->getTextContent(
+ $date_div[0]
+ )
+ );
+
+ if($date === false){
+
+ // failed to parse date
+ $date = null;
+ }
+ }else{
+
+ $date = null;
+ }
+
+ $out["video"][] = [
+ "title" => $title,
+ "description" => null,
+ "date" => $date,
+ "duration" => $duration,
+ "views" => null,
+ "thumb" => $thumb,
+ "url" => $url
+ ];
+ }
+
+ //
+ // Parse featured results (which contain images, fuck the rest desu)
+ //
+ $this->fuckhtml->load($html);
+ $top =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "aria-label",
+ "Featured results",
+ "div"
+ );
+
+ if(count($top) !== 0){
+
+ $this->fuckhtml->load($top[0]);
+
+ // get images
+ $grid =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "border-radius" => "20px",
+ "display" => "grid",
+ "grid-gap" => "2px",
+ "grid-template-rows" => "repeat(2,minmax(0,1fr))",
+ "overflow" => "hidden",
+ "bottom" => "0",
+ "left" => "0",
+ "right" => "0",
+ "top" => "0",
+ "position" => "absolute",
+ ]
+ ),
+ "div"
+ );
+
+ if(count($grid) !== 0){
+
+ // we found image grid
+ $this->fuckhtml->load($grid[0]);
+
+ $images_div =
+ $this->fuckhtml
+ ->getElementsByAttributeName(
+ "data-attrid",
+ "div"
+ );
+
+ foreach($images_div as $image_div){
+
+ $this->fuckhtml->load($image_div);
+
+ $image =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "img"
+ );
+
+ if(
+ count($image) === 0 ||
+ !isset($image_div["attributes"]["data-docid"]) ||
+ !isset($this->image_arr[$image_div["attributes"]["data-docid"]])
+ ){
+
+ // ?? no image, continue
+ continue;
+ }
+
+ $out["image"][] = [
+ "title" =>
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $image[0]["attributes"]["alt"]
+ )
+ ),
+ "source" =>
+ $this->image_arr[
+ $image_div["attributes"]["data-docid"]
+ ],
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $image_div["attributes"]["data-lpage"]
+ )
+ ];
+ }
+ }
+ }
+
+
+ //
+ // craft $npt token
+ //
+ if(
+ $last_page === false &&
+ count($out["web"]) !== 0
+ ){
+ if(!isset($params["start"])){
+
+ $params["start"] = 20;
+ }else{
+
+ $params["start"] += 20;
+ }
+
+ $out["npt"] =
+ $this->backend
+ ->store(
+ json_encode($params),
+ $pagetype,
+ $proxy
+ );
+ }
+
+
+ //
+ // Parse right handside
+ //
+ $this->fuckhtml->load($html);
+
+ $rhs =
+ $this->fuckhtml
+ ->getElementById(
+ "rhs"
+ );
+
+ if($rhs === null){
+
+ return $out;
+ }
+
+ $this->fuckhtml->load($rhs);
+
+ // get images gallery
+ $image_gallery =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "data-rc",
+ "ivg-i",
+ "div"
+ );
+
+ if(count($image_gallery) !== 0){
+
+ $this->fuckhtml->load($image_gallery[0]);
+
+ // get images
+ $images_div =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "ivg-i",
+ "div"
+ );
+
+ foreach($images_div as $image_div){
+
+ $this->fuckhtml->load($image_div);
+
+ $image =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "img"
+ );
+
+ if(
+ count($image) === 0 ||
+ !isset(
+ $this->image_arr[
+ $image_div
+ ["attributes"]
+ ["data-docid"]
+ ]
+ )
+ ){
+
+ continue;
+ }
+
+ foreach($out["image"] as $existing_image){
+
+ // might already exist
+ if(
+ $existing_image["source"][1]["url"] ==
+ $this->image_arr[
+ $image_div
+ ["attributes"]
+ ["data-docid"]
+ ][1]["url"]
+ ){
+
+ continue 2;
+ }
+ }
+
+ $out["image"][] = [
+ "title" =>
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $image[0]
+ ["attributes"]
+ ["alt"]
+ )
+ ),
+ "source" =>
+ $this->image_arr[
+ $image_div
+ ["attributes"]
+ ["data-docid"]
+ ],
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $image_div
+ ["attributes"]
+ ["data-lpage"]
+ )
+ ];
+ }
+
+ // reset
+ $this->fuckhtml->load($rhs);
+ }
+
+ // get header container
+ $header =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "padding" => "0 0 16px 20px",
+ "display" => "flex"
+ ]
+ ),
+ "div"
+ );
+
+ // stop parsing wikipedia heads if there isn't a header
+ $description = [];
+ $title = "About";
+
+ if(count($header) !== 0){
+
+ $this->fuckhtml->load($header[0]);
+
+ // g-snackbar-action present: we found a button instead
+ if(
+ count(
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "g-snackbar-action"
+ )
+ ) !== 0
+ ){
+
+ $title_tag =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "data-attrid",
+ "title",
+ "div"
+ );
+
+ if(count($title_tag) !== 0){
+ $title =
+ $this->fuckhtml
+ ->getTextContent(
+ $title_tag[0]
+ );
+
+ $header[0]["innerHTML"] =
+ str_replace(
+ $title_tag[0]["outerHTML"],
+ "",
+ $header[0]["innerHTML"]
+ );
+
+ // if header still contains text, add it as a subtitle in description
+ $subtitle =
+ $this->fuckhtml
+ ->getTextContent(
+ $header[0]
+ );
+
+ if(strlen($subtitle) !== 0){
+
+ $description[] = [
+ "type" => "quote",
+ "value" => $subtitle
+ ];
+ }
+ }
+ }
+
+ // reset
+ $this->fuckhtml->load($rhs);
+ }
+
+ // get description elements
+ $url = null;
+
+ $text =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "data-attrid",
+ "description",
+ "div"
+ );
+
+ if(count($text) !== 0){
+
+ $this->fuckhtml->load($text[0]);
+
+ $a =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ if(count($a) !== 0){
+ // get link and remove it from description
+
+ $a = $a[count($a) - 1];
+
+ $text[0]["innerHTML"] =
+ str_replace(
+ $a["outerHTML"],
+ "",
+ $text[0]["innerHTML"]
+ );
+
+ $url =
+ $this->fuckhtml
+ ->getTextContent(
+ $a
+ ["attributes"]
+ ["href"]
+ );
+ }
+
+ $description[] = [
+ "type" => "text",
+ "value" =>
+ html_entity_decode(
+ preg_replace(
+ '/^Description/',
+ "",
+ $this->fuckhtml
+ ->getTextContent(
+ $text[0]
+ )
+ )
+ )
+ ];
+
+ // reset
+ $this->fuckhtml->load($rhs);
+ }
+
+ // get reviews (google play, steam, etc)
+ $review_container =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "align-items" => "start",
+ "display" => "flex"
+ ]
+ ),
+ "div"
+ );
+
+ if(count($review_container) !== 0){
+
+ $this->fuckhtml->load($review_container[0]);
+
+ $as =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ if(count($as) !== 0){
+
+ $description[] = [
+ "type" => "title",
+ "value" => "Ratings"
+ ];
+
+ foreach($as as $a){
+
+ $this->fuckhtml->load($a);
+
+ $spans =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "span"
+ );
+
+ if(count($spans) >= 2){
+
+ $value =
+ trim(
+ $this->fuckhtml
+ ->getTextContent(
+ $spans[1]
+ ),
+ "· "
+ );
+
+ if(
+ $value == "" &&
+ isset($spans[2])
+ ){
+
+ $value =
+ $this->fuckhtml
+ ->getTextContent(
+ $spans[2]
+ );
+ }
+
+ $description[] = [
+ "type" => "link",
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $a["attributes"]
+ ["href"]
+ ),
+ "value" => $value
+ ];
+
+ $description[] = [
+ "type" => "text",
+ "value" =>
+ ": " .
+ $this->fuckhtml
+ ->getTextContent(
+ $spans[0]
+ ) . "\n"
+ ];
+ }
+ }
+ }
+
+ // reset
+ $this->fuckhtml->load($rhs);
+ }
+
+ // initialize sublinks
+ $sublinks = [];
+
+ // get description from business
+ if(count($description) === 0){
+
+ $data_attrid =
+ $this->fuckhtml
+ ->getElementsByAttributeName(
+ "data-attrid"
+ );
+
+ $summary =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "data-attrid",
+ "kc:/local:one line summary",
+ $data_attrid
+ );
+
+ if(count($summary) !== 0){
+
+ $description[] = [
+ "type" => "quote",
+ "value" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $summary[0]
+ )
+ ];
+
+ // remove summary so it doesnt get parsed as a table
+ $rhs["innerHTML"] =
+ str_replace(
+ $summary[0]["outerHTML"],
+ "",
+ $rhs["innerHTML"]
+ );
+
+ $this->fuckhtml->load($rhs);
+ }
+
+ $address =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "data-attrid",
+ "kc:/location/location:address",
+ $data_attrid
+ );
+
+ if(count($address) !== 0){
+
+ $description[] = [
+ "type" => "text",
+ "value" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $address[0]
+ )
+ ];
+ }
+
+ // get title
+ $title_div =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "data-attrid",
+ "title",
+ $data_attrid
+ );
+
+ if(count($title_div) !== 0){
+
+ $title =
+ $this->fuckhtml
+ ->getTextContent(
+ $title_div[0]
+ );
+ }
+
+ // get phone number
+ $phone =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "data-attrid",
+ "kc:/local:alt phone",
+ $data_attrid
+ );
+
+ if(count($phone) !== 0){
+
+ $this->fuckhtml->load($phone[0]);
+
+ $sublinks["Call"] =
+ "tel:" .
+ $this->fuckhtml
+ ->getTextContent(
+ $this->fuckhtml
+ ->getElementsByAttributeName(
+ "aria-label",
+ "span"
+ )[0]
+ );
+
+ $this->fuckhtml->load($rhs);
+ }
+ }
+
+ if(count($description) === 0){
+
+ // still no description? abort
+ return $out;
+ }
+
+ // get table elements
+ $table = [];
+ $table_elems =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "margin-top" => "7px"
+ ]
+ ),
+ "div"
+ );
+
+ foreach($table_elems as $elem){
+
+ $this->fuckhtml->load($elem);
+
+ $spans =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "span"
+ );
+
+ if(count($spans) === 0){
+
+ // ?? invalid
+ continue;
+ }
+
+ $elem["innerHTML"] =
+ str_replace(
+ $spans[0]["outerHTML"],
+ "",
+ $elem["innerHTML"]
+ );
+
+ $key =
+ rtrim(
+ $this->fuckhtml
+ ->getTextContent(
+ $spans[0]
+ ),
+ ": "
+ );
+
+ if(
+ $key == "" ||
+ $key == "Phone"
+ ){
+
+ continue;
+ }
+
+ if($key == "Hours"){
+
+ $hours = [];
+
+ $this->fuckhtml->load($elem);
+
+ $trs =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "tr"
+ );
+
+ foreach($trs as $tr){
+
+ $this->fuckhtml->load($tr);
+
+ $tds =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "td"
+ );
+
+ if(count($tds) === 2){
+
+ $hours[] =
+ $this->fuckhtml
+ ->getTextContent(
+ $tds[0]
+ ) . ": " .
+ $this->fuckhtml
+ ->getTextContent(
+ $tds[1]
+ );
+ }
+ }
+
+ if(count($hours) !== 0){
+
+ $hours = implode("\n", $hours);
+ $table["Hours"] = $hours;
+ }
+
+ continue;
+ }
+
+ $table[$key] =
+ preg_replace(
+ '/ +/',
+ " ",
+ $this->fuckhtml
+ ->getTextContent(
+ $elem
+ )
+ );
+ }
+
+ // reset
+ $this->fuckhtml->load($rhs);
+
+ // get the website div
+ $as =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "data-attrid",
+ "visit_official_site",
+ "a"
+ );
+
+ if(count($as) !== 0){
+
+ $sublinks["Website"] =
+ str_replace(
+ "http://",
+ "https://",
+ $this->fuckhtml
+ ->getTextContent(
+ $as[0]
+ ["attributes"]
+ ["href"]
+ )
+ );
+ }else{
+
+ // get website through button
+ $button =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "ab_button",
+ "a"
+ );
+
+ if(count($button) !== 0){
+
+ $sublinks["Website"] =
+ $this->unshiturl(
+ $this->fuckhtml
+ ->getTextContent(
+ $button[0]
+ ["attributes"]
+ ["href"]
+ )
+ );
+ }
+ }
+
+ // get social media links
+ $as =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "g-link"
+ );
+
+ foreach($as as $a){
+
+ $this->fuckhtml->load($a);
+
+ $link =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ if(count($link) === 0){
+
+ continue;
+ }
+
+ $sublink_title =
+ $this->fuckhtml
+ ->getTextContent(
+ $a
+ );
+
+ if($sublink_title == "X (Twitter)"){
+
+ $sublink_title = "Twitter";
+ }
+
+ $sublinks[$sublink_title] =
+ $this->fuckhtml
+ ->getTextContent(
+ $link[0]
+ ["attributes"]
+ ["href"]
+ );
+ }
+
+ // reset
+ $this->fuckhtml->load($rhs);
+
+ // get those round containers
+ $containers =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "tpa-ci"
+ );
+
+ foreach($containers as $container){
+
+ $this->fuckhtml->load($container);
+
+ $as =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ if(count($as) === 0){
+
+ continue;
+ }
+
+ $sublinks[
+ $this->fuckhtml
+ ->getTextContent(
+ $as[0]
+ )
+ ] =
+ $this->fuckhtml
+ ->getTextContent(
+ $as[0]
+ ["attributes"]
+ ["href"]
+ );
+ }
+
+ $out["answer"][] = [
+ "title" => $title,
+ "description" => $description,
+ "url" => $url,
+ "thumb" => null,
+ "table" => $table,
+ "sublink" => $sublinks
+ ];
+
+ return $out;
+ }
+
+
+ // Collects the images Google ships through inline javascript and stores
+ // them in $this->dimg, keyed by the id getdimg() is later queried with.
+ //
+ // $html: raw HTML of the page to scan.
+ // Returns nothing; $this->dimg is rebuilt from scratch on every call.
+ private function scrape_dimg($html){
+
+     $this->dimg = [];
+
+     // google.ldi={...}; blobs: decoded as JSON objects of id => value
+     preg_match_all(
+         '/function\(\){google\.ldi=({.*?});/',
+         $html,
+         $ldi
+     );
+
+     foreach($ldi[1] ?? [] as $json){
+
+         $map = json_decode($json, true);
+
+         if(!is_array($map)){
+
+             // the lazy regex may cut the object short, or the blob is
+             // not strict JSON: skip it instead of iterating over null
+             continue;
+         }
+
+         foreach($map as $id => $value){
+
+             if(!is_string($value)){
+
+                 // unshit_thumb() expects a URL string
+                 continue;
+             }
+
+             $this->dimg[$id] = $this->unshit_thumb($value);
+         }
+     }
+
+     // get additional javascript base64 images, shaped like
+     // var s='data:image/...';var ii=['id1','id2'];
+     preg_match_all(
+         '/var s=\'(data:image\/[^\']+)\';var ii=\[((?:\'[^\']+\',?)+)\];/',
+         $html,
+         $inline
+     );
+
+     foreach($inline[1] ?? [] as $i => $data_uri){
+
+         $image =
+             $this->fuckhtml
+             ->parseJsString(
+                 $data_uri
+             );
+
+         // the same data URI is shared by every id listed in ii=[...]
+         foreach(explode(",", $inline[2][$i]) as $id){
+
+             $this->dimg[trim($id, "'")] = $image;
+         }
+     }
+ }
+
+
+ // Builds $this->image_arr from the javascript arrays embedded in $html.
+ // Each entry is keyed by the first quoted string of the array (looked up
+ // elsewhere through data-docid) and holds two url/width/height records:
+ //   [0] built from the second bracketed triple
+ //   [1] built from the first bracketed triple, url passed through unshit_thumb()
+ // Within each triple the first number is stored as height, the second as width.
+ private function scrape_imagearr($html){
+
+     // get image links arrays
+     preg_match_all(
+         '/\[0,"([^"]+)",\["([^"]+)\",([0-9]+),([0-9]+)\],\["([^"]+)",([0-9]+),([0-9]+)\]/',
+         $html,
+         $found
+     );
+
+     $this->image_arr = [];
+
+     if(!isset($found[1])){
+
+         return;
+     }
+
+     foreach($found[1] as $n => $docid){
+
+         $second_triple = [
+             "url" =>
+                 $this->fuckhtml
+                 ->parseJsString(
+                     $found[5][$n]
+                 ),
+             "width" => (int)$found[7][$n],
+             "height" => (int)$found[6][$n]
+         ];
+
+         $first_triple = [
+             "url" =>
+                 $this->unshit_thumb(
+                     $this->fuckhtml
+                     ->parseJsString(
+                         $found[2][$n]
+                     )
+                 ),
+             "width" => (int)$found[4][$n],
+             "height" => (int)$found[3][$n]
+         ];
+
+         $this->image_arr[$docid] = [$second_triple, $first_triple];
+     }
+ }
+
+
+ // Looks up an entry collected by scrape_dimg().
+ //
+ // $dimg: key into $this->dimg (callers pass an <img> id attribute).
+ // Returns the stored value, or null when the key is unknown.
+ private function getdimg($dimg){
+
+     return $this->dimg[$dimg] ?? null;
+ }
+
+
+ // Normalizes thumbnail URLs served from a *tbn*.gstatic.com host so that
+ // only the "q" query parameter is kept. Anything else (other hosts,
+ // URLs without a usable query, non-string input) is returned untouched.
+ //
+ // $url: URL string, or null when the caller had no image.
+ // Returns the rewritten URL, or $url itself when no rewrite applies.
+ private function unshit_thumb($url){
+     // https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQINE2vbnNLHXqoZr3RVsaEJFyOsj1_BiBnJch-e1nyz3oia7Aj5xVj
+     // https://i.ytimg.com/vi/PZVIyA5ER3Y/mqdefault.jpg?sqp=-oaymwEFCJQBEFM&rs=AMzJL3nXeaCpdIar-ltNwl82Y82cIJfphA
+
+     if(!is_string($url)){
+
+         // getdimg() hands back null for unknown ids: nothing to parse
+         return $url;
+     }
+
+     $parts = parse_url($url);
+
+     if(
+         !is_array($parts) ||
+         !isset($parts["host"], $parts["query"]) ||
+         !preg_match(
+             '/tbn.*\.gstatic\.com/',
+             $parts["host"]
+         )
+     ){
+
+         // malformed URL, no query string, or not a gstatic thumbnail
+         return $url;
+     }
+
+     parse_str($parts["query"], $params);
+
+     // q[]=... would decode to an array, which cannot be concatenated
+     if(
+         isset($params["q"]) &&
+         is_string($params["q"])
+     ){
+
+         return "https://" . $parts["host"] . "/images?q=" . $params["q"];
+     }
+
+     return $url;
+ }
+
+
+ // Indexes the CSS rules found in the <style> tags that $this->fuckhtml
+ // currently exposes. Populates:
+ //   $this->styles[selector]  => [property => lowercased value, ..., "_c" => property count]
+ //   $this->css_colors[value] => lowercased name of the :root property holding that value
+ // getstyle() and getcolorvar() read these tables.
+ private function parsestyles(){
+
+     $styles = [];
+
+     // concatenate every stylesheet
+     $raw_styles = "";
+
+     foreach(
+         $this->fuckhtml
+         ->getElementsByTagName(
+             "style"
+         ) as $style
+     ){
+
+         $raw_styles .= $style["innerHTML"];
+     }
+
+     // filter out media/keyframe queries
+     $raw_styles =
+         preg_replace(
+             '/@\s*(?!font-face)[^{]+\s*{[\S\s]+?}\s*}/',
+             "",
+             $raw_styles
+         );
+
+     // get styles: [1] holds the selector list, [2] the declaration body
+     preg_match_all(
+         '/(.+?){([\S\s]*?)}/',
+         $raw_styles,
+         $matches
+     );
+
+     foreach($matches[1] ?? [] as $i => $selector_list){
+
+         // get style values
+         preg_match_all(
+             '/([^:;]+):([^;]*?(?:\([^)]+\)[^;]*?)?)(?:;|$)/',
+             $matches[2][$i],
+             $values_regex
+         );
+
+         $values = [];
+         foreach($values_regex[1] as $k => $property){
+
+             $values[trim($property)] =
+                 strtolower(trim($values_regex[2][$k]));
+         }
+
+         // h1,h2,h3 will each get their own array index
+         foreach(explode(",", $selector_list) as $name){
+
+             $name = trim($name, "}\t\n\r\0\x0B");
+
+             // later rules override earlier ones property by property
+             foreach($values as $key => $value){
+
+                 $styles[$name][$key] = $value;
+             }
+         }
+     }
+
+     // remember how many properties each selector has, getstyle()
+     // uses it to demand an exact match
+     foreach($styles as $key => $values){
+
+         $styles[$key]["_c"] = count($values);
+     }
+
+     $this->styles = $styles;
+
+     // get CSS colors
+     $this->css_colors = [];
+
+     if(isset($this->styles[":root"])){
+
+         foreach($this->styles[":root"] as $key => $value){
+
+             if($key === "_c"){
+
+                 // internal counter added above, not a CSS property
+                 continue;
+             }
+
+             $this->css_colors[$value] = strtolower($key);
+         }
+     }
+ }
+
+
+
+ // Finds the first selector indexed by parsestyles() whose declarations
+ // are exactly the given property => value pairs (no more, no fewer).
+ //
+ // $styles: property => value pairs to look for.
+ // Returns the last space-separated token of the matching selector with
+ // "." and "#" turned into spaces and left-trimmed, or false if no
+ // selector matches.
+ private function getstyle($styles){
+
+     $wanted = count($styles);
+
+     // include the property count in the comparison so that selectors
+     // carrying extra declarations are rejected
+     $styles["_c"] = $wanted;
+
+     foreach($this->styles as $selector => $declarations){
+
+         $shared = array_intersect_assoc($declarations, $styles);
+
+         if(count($shared) !== $wanted + 1){
+
+             continue;
+         }
+
+         $tokens = explode(" ", $selector);
+         $last = $tokens[count($tokens) - 1];
+
+         return ltrim(str_replace([".", "#"], " ", $last));
+     }
+
+     return false;
+ }
+
+
+
+ // Reverse lookup into the table parsestyles() builds from :root.
+ //
+ // $color: value to search for (parsestyles() stores values lowercased).
+ // Returns the name of the :root property holding that value, or null
+ // when there is none.
+ private function getcolorvar($color){
+
+     return $this->css_colors[$color] ?? null;
+ }
+
+
+
+ // Fetches and parses one page of web results.
+ //
+ // $get: request parameters. A truthy "npt" resumes a search from the
+ //       parameters stored behind that token; otherwise "s", "country",
+ //       "nsfw", "lang", "older", "newer" and "spellcheck" describe a
+ //       new search.
+ // Returns whatever parsepage() produces for the "web" page type.
+ // Throws Exception("Failed to get HTML") when the HTTP request fails;
+ // the original error is attached as the previous exception.
+ public function web($get){
+
+     if($get["npt"]){
+
+         // resume: the token yields the stored parameters and the proxy
+         [$params, $proxy] = $this->backend->get($get["npt"], "web");
+         $params = json_decode($params, true);
+
+         $search = $params["q"];
+
+     }else{
+         $search = $get["s"];
+         $country = $get["country"];
+         $nsfw = $get["nsfw"];
+         $lang = $get["lang"];
+         $older = $get["older"];
+         $newer = $get["newer"];
+         $spellcheck = $get["spellcheck"];
+         $proxy = $this->backend->get_ip();
+
+         $params = [
+             "q" => $search,
+             "hl" => "en",
+             "num" => 20 // get 20 results
+         ];
+
+         // country
+         if($country != "any"){
+
+             $params["gl"] = $country;
+         }
+
+         // nsfw
+         $params["safe"] = $nsfw == "yes" ? "off" : "active";
+
+         // language
+         if($lang != "any"){
+
+             $params["lr"] = "lang_" . $lang;
+         }
+
+         // generate tbs
+         $tbs = [];
+
+         // get date: false means "no bound"
+         $older = $older === false ? null : date("m/d/Y", $older);
+         $newer = $newer === false ? null : date("m/d/Y", $newer);
+
+         if(
+             $older !== null ||
+             $newer !== null
+         ){
+
+             $tbs["cdr"] = "1";
+             $tbs["cd_min"] = $newer;
+             $tbs["cd_max"] = $older;
+         }
+
+         // spellcheck filter
+         if($spellcheck == "no"){
+
+             $params["nfpr"] = "1";
+         }
+
+         // serialize tbs as key:value,key:value
+         if(count($tbs) !== 0){
+
+             $params["tbs"] = "";
+
+             foreach($tbs as $key => $value){
+
+                 $params["tbs"] .= $key . ":" . $value . ",";
+             }
+
+             $params["tbs"] = rtrim($params["tbs"], ",");
+         }
+     }
+
+     try{
+         $html =
+             $this->get(
+                 $proxy,
+                 "https://www.google.com/search",
+                 $params
+             );
+     }catch(Exception $error){
+
+         // keep the public message, but chain the cause for debugging
+         throw new Exception("Failed to get HTML", 0, $error);
+     }
+
+     //$html = file_get_contents("scraper/google.html");
+
+     return $this->parsepage($html, "web", $search, $proxy, $params);
+ }
+
+
+
+ // Fetches one page of video results (tbm=vid) and reshapes the parsed
+ // web results into the video response format.
+ //
+ // $get: request parameters. A truthy "npt" resumes a search from the
+ //       parameters stored behind that token; otherwise "s", "country",
+ //       "nsfw", "older", "newer", "duration", "quality" and "captions"
+ //       describe a new search.
+ // Returns an array with "status", "npt" and the "video", "author",
+ // "livestream", "playlist" and "reel" lists (only "video" is filled).
+ // Throws Exception("Failed to get HTML") when the HTTP request fails;
+ // the original error is attached as the previous exception.
+ public function video($get){
+
+     if($get["npt"]){
+
+         // use the same page type this method hands to parsepage() below,
+         // which presumably is what the token was stored under
+         // NOTE(review): was "web"; confirm against backend::get()
+         [$params, $proxy] = $this->backend->get($get["npt"], "videos");
+         $params = json_decode($params, true);
+
+         $search = $params["q"];
+
+     }else{
+         $search = $get["s"];
+         $country = $get["country"];
+         $nsfw = $get["nsfw"];
+         $older = $get["older"];
+         $newer = $get["newer"];
+         $duration = $get["duration"];
+         $quality = $get["quality"];
+         $captions = $get["captions"];
+         $proxy = $this->backend->get_ip();
+
+         $params = [
+             "q" => $search,
+             "tbm" => "vid",
+             "hl" => "en",
+             "num" => "20"
+         ];
+
+         // country
+         if($country != "any"){
+
+             $params["gl"] = $country;
+         }
+
+         // nsfw
+         $params["safe"] = $nsfw == "yes" ? "off" : "active";
+
+         // every tbs entry is a ready-made "key:value" string, since
+         // implode() below discards array keys
+         $tbs = [];
+
+         // get date: false means "no bound"
+         $older = $older === false ? null : date("m/d/Y", $older);
+         $newer = $newer === false ? null : date("m/d/Y", $newer);
+
+         if(
+             $older !== null ||
+             $newer !== null
+         ){
+
+             $tbs[] = "cdr:1";
+             $tbs[] = "cd_min:" . $newer;
+             $tbs[] = "cd_max:" . $older;
+         }
+
+         // duration
+         if($duration != "any"){
+
+             $tbs[] = "dur:" . $duration;
+         }
+
+         // quality
+         if($quality != "any"){
+
+             $tbs[] = "hq:" . $quality;
+         }
+
+         // captions
+         if($captions != "any"){
+
+             $tbs[] = "cc:" . $captions;
+         }
+
+         // append tbs
+         if(count($tbs) !== 0){
+
+             $params["tbs"] =
+                 implode(",", $tbs);
+         }
+     }
+
+     try{
+         $html =
+             $this->get(
+                 $proxy,
+                 "https://www.google.com/search",
+                 $params
+             );
+     }catch(Exception $error){
+
+         // keep the public message, but chain the cause for debugging
+         throw new Exception("Failed to get HTML", 0, $error);
+     }
+
+     //$html = file_get_contents("scraper/google.html");
+
+     $response = $this->parsepage($html, "videos", $search, $proxy, $params);
+     $out = [
+         "status" => "ok",
+         "npt" => $response["npt"],
+         "video" => [],
+         "author" => [],
+         "livestream" => [],
+         "playlist" => [],
+         "reel" => []
+     ];
+
+     // video results come back as web results: author and duration are
+     // read from the result's table when present
+     foreach($response["web"] as $result){
+
+         $out["video"][] = [
+             "title" => $result["title"],
+             "description" => $result["description"],
+             "author" => [
+                 "name" => isset($result["table"]["Author"]) ? $result["table"]["Author"] : null,
+                 "url" => null,
+                 "avatar" => null
+             ],
+             "date" => $result["date"],
+             "duration" => isset($result["table"]["Duration"]) ? $this->hms2int($result["table"]["Duration"]) : null,
+             "views" => null,
+             "thumb" => $result["thumb"],
+             "url" => $result["url"]
+         ];
+     }
+
+     return $out;
+ }
+
+
+
+ /**
+  * Scrape one page of Google news results (tbm=nws).
+  *
+  * With a next-page token the stored payload is the relative URL of the
+  * next page, which is requested as-is; otherwise the request is built
+  * from the filters in $get.
+  *
+  * @param array $get Reads "npt"; when that is falsy also reads "s",
+  *                   "country", "nsfw", "older", "newer" and "sort".
+  * @return array Keys "status", "npt" and "news". Each news entry has
+  *               "title", "author", "description", "date", "thumb"
+  *               (with "url" and "ratio") and "url".
+  * @throws Exception "Failed to get HTML" when the request fails,
+  *                   "Could not grep result div" when the result
+  *                   container is missing, or whatever detect_sorry()
+  *                   throws.
+  */
+ public function news($get){
+
+     if($get["npt"]){
+
+         [$req, $proxy] = $this->backend->get($get["npt"], "news");
+
+         $url = "https://www.google.com" . $req;
+         $params = [];
+
+     }else{
+         $search = $get["s"];
+         $country = $get["country"];
+         $nsfw = $get["nsfw"];
+         $older = $get["older"];
+         $newer = $get["newer"];
+         $sort = $get["sort"];
+         $proxy = $this->backend->get_ip();
+
+         $url = "https://www.google.com/search";
+
+         $params = [
+             "q" => $search,
+             "tbm" => "nws",
+             "hl" => "en",
+             "num" => "20"
+         ];
+
+         // country
+         if($country != "any"){
+
+             $params["gl"] = $country;
+         }
+
+         // nsfw
+         $params["safe"] = $nsfw == "yes" ? "off" : "active";
+
+         $tbs = [];
+
+         // get date
+         $older = $older === false ? null : date("m/d/Y", $older);
+         $newer = $newer === false ? null : date("m/d/Y", $newer);
+
+         if(
+             $older !== null ||
+             $newer !== null
+         ){
+
+             $tbs["cdr"] = "1";
+             $tbs["cd_min"] = $newer;
+             $tbs["cd_max"] = $older;
+         }
+
+         // relevance
+         if($sort == "date"){
+
+             $tbs["sbd"] = "1";
+         }
+
+         // append tbs
+         if(count($tbs) !== 0){
+
+             $params["tbs"] = "";
+
+             foreach($tbs as $key => $value){
+
+                 $params["tbs"] .= $key . ":" . $value . ",";
+             }
+
+             $params["tbs"] = rtrim($params["tbs"], ",");
+         }
+     }
+
+     //$html = file_get_contents("scraper/google-news.html");
+
+     // both the first page and continuation pages go through the same
+     // wrapped request, so a failure always surfaces the same way
+     try{
+
+         $html =
+             $this->get(
+                 $proxy,
+                 $url,
+                 $params
+             );
+     }catch(Exception $error){
+
+         throw new Exception("Failed to get HTML", 0, $error);
+     }
+
+     $out = [
+         "status" => "ok",
+         "npt" => null,
+         "news" => []
+     ];
+
+     $this->fuckhtml->load($html);
+
+     $this->detect_sorry();
+
+     // get images
+     $this->scrape_dimg($html);
+
+     // parse styles
+     $this->parsestyles();
+
+     $center_col =
+         $this->fuckhtml
+         ->getElementById(
+             "center_col",
+             "div"
+         );
+
+     // the other getElementById() callers in this class test a miss against
+     // false, this one tested against null: accept both
+     if(
+         $center_col === null ||
+         $center_col === false
+     ){
+
+         throw new Exception("Could not grep result div");
+     }
+
+     $this->fuckhtml->load($center_col);
+
+     // get next page
+     $npt =
+         $this->fuckhtml
+         ->getElementById(
+             "pnnext",
+             "a"
+         );
+
+     if(
+         $npt !== false &&
+         $npt !== null &&
+         isset($npt["attributes"]["href"])
+     ){
+
+         $out["npt"] =
+             $this->backend->store(
+                 $this->fuckhtml
+                 ->getTextContent(
+                     $npt["attributes"]
+                     ["href"]
+                 ),
+                 "news",
+                 $proxy
+             );
+     }
+
+     $as =
+         $this->fuckhtml
+         ->getElementsByAttributeName(
+             "jsname",
+             "a"
+         );
+
+     foreach($as as $a){
+
+         // a result without a link target cannot be reported
+         if(!isset($a["attributes"]["href"])){
+
+             continue;
+         }
+
+         $this->fuckhtml->load($a);
+
+         // get title
+         $title =
+             $this->fuckhtml
+             ->getElementsByAttributeValue(
+                 "role",
+                 "heading",
+                 "div"
+             );
+
+         if(count($title) === 0){
+
+             continue;
+         }
+
+         $title =
+             $this->titledots(
+                 $this->fuckhtml
+                 ->getTextContent(
+                     $title[0]
+                 )
+             );
+
+         // get thumbnail
+         $image =
+             $this->fuckhtml
+             ->getElementsByAttributeName(
+                 "id",
+                 "img"
+             );
+
+         // check for padded title node, if found, we're inside a carousel
+         $probe =
+             count(
+                 $this->fuckhtml
+                 ->getElementsByClassName(
+                     $this->getstyle(
+                         [
+                             "padding" => "16px 16px 40px 16px"
+                         ]
+                     ),
+                     "div"
+                 )
+             ) !== 0;
+
+         if(
+             count($image) !== 0 &&
+             !isset($image[0]["attributes"]["width"])
+         ){
+
+             $thumb = [
+                 "url" =>
+                     $this->getdimg(
+                         $image[0]["attributes"]["id"]
+                     ),
+                 "ratio" => $probe === true ? "16:9" : "1:1"
+             ];
+         }else{
+
+             $thumb = [
+                 "url" => null,
+                 "ratio" => null
+             ];
+         }
+
+         // carousel entries get no description; otherwise it is the first
+         // div whose inline style sets margin-top
+         $description = null;
+
+         if($probe === false){
+
+             $desc_divs =
+                 $this->fuckhtml
+                 ->getElementsByAttributeName(
+                     "style",
+                     "div"
+                 );
+
+             foreach($desc_divs as $desc){
+
+                 if(
+                     strpos(
+                         $desc["attributes"]["style"],
+                         "margin-top:"
+                     ) !== false
+                 ){
+
+                     $description =
+                         $this->titledots(
+                             $this->fuckhtml
+                             ->getTextContent(
+                                 $desc
+                             )
+                         );
+                     break;
+                 }
+             }
+         }
+
+         // get author
+         $author =
+             $this->fuckhtml
+             ->getElementsByClassName(
+                 $this->getstyle(
+                     [
+                         "overflow" => "hidden",
+                         "text-align" => "left",
+                         "text-overflow" => "ellipsis",
+                         "white-space" => "nowrap",
+                         "margin-bottom" => "8px"
+                     ]
+                 ),
+                 "div"
+             );
+
+         if(count($author) !== 0){
+
+             $author =
+                 $this->fuckhtml
+                 ->getTextContent(
+                     $author[0]
+                 );
+         }else{
+
+             $author = null;
+         }
+
+         // get date: last span of the first div whose inline style
+         // contains "bottom:"
+         $date = null;
+
+         $date_div =
+             $this->fuckhtml
+             ->getElementsByAttributeName(
+                 "style",
+                 "div"
+             );
+
+         foreach($date_div as $d){
+
+             if(
+                 strpos(
+                     $d["attributes"]["style"],
+                     "bottom:"
+                 ) === false
+             ){
+
+                 continue;
+             }
+
+             $this->fuckhtml->load($d);
+
+             $span =
+                 $this->fuckhtml
+                 ->getElementsByTagName(
+                     "span"
+                 );
+
+             if(count($span) !== 0){
+
+                 $timestamp =
+                     strtotime(
+                         $this->fuckhtml
+                         ->getTextContent(
+                             $span[count($span) - 1]
+                         )
+                     );
+
+                 // strtotime() yields false for text it cannot parse;
+                 // report that as "no date"
+                 if($timestamp !== false){
+
+                     $date = $timestamp;
+                 }
+             }
+
+             break;
+         }
+
+         $out["news"][] = [
+             "title" => $title,
+             "author" => $author,
+             "description" => $description,
+             "date" => $date,
+             "thumb" => $thumb,
+             "url" =>
+                 $this->unshiturl(
+                     $a["attributes"]
+                     ["href"]
+                 )
+         ];
+     }
+
+     return $out;
+ }
+
+
+
+
+ /**
+  * Scrape one page of Google image results (udm=2).
+  *
+  * @param array $get Reads "npt"; when that is falsy also reads "s",
+  *                   "country", "nsfw", "time", "size", "ratio", "color",
+  *                   "type", "format" and "rights".
+  * @return array Keys "status", "npt" and "image". Each image entry has
+  *               "title", "source" and "url".
+  * @throws Exception "Search term is empty!" for an empty query,
+  *                   "Failed to get search page" when the request fails,
+  *                   or whatever detect_sorry() throws.
+  */
+ public function image($get){
+
+     // generate parameters
+     if($get["npt"]){
+
+         // same page type as the store() call at the end of this method
+         [$params, $proxy] =
+             $this->backend->get(
+                 $get["npt"],
+                 "images"
+             );
+
+         $params = json_decode($params, true);
+     }else{
+
+         $search = $get["s"];
+         if(strlen($search) === 0){
+
+             throw new Exception("Search term is empty!");
+         }
+
+         $proxy = $this->backend->get_ip();
+         $country = $get["country"];
+         $nsfw = $get["nsfw"];
+         $time = $get["time"];
+         $size = $get["size"];
+         $ratio = $get["ratio"];
+         $color = $get["color"];
+         $type = $get["type"];
+         $format = $get["format"];
+         $rights = $get["rights"];
+
+         $params = [
+             "q" => $search,
+             "udm" => "2" // get images
+         ];
+
+         // country (image search uses cr instead of gl)
+         if($country != "any"){
+
+             $params["cr"] = "country" . strtoupper($country);
+         }
+
+         // nsfw
+         $params["safe"] = $nsfw == "yes" ? "off" : "active";
+
+         // generate tbs
+         $tbs = [];
+
+         // time
+         if($time != "any"){
+
+             $tbs["qdr"] = $time;
+         }
+
+         // size
+         if($size != "any"){
+
+             $params["imgsz"] = $size;
+         }
+
+         // ratio
+         if($ratio != "any"){
+
+             $params["imgar"] = $ratio;
+         }
+
+         // color: "color"/"trans" pass through, "bnw" maps to "gray",
+         // anything else is sent as a specific color through tbs
+         if($color != "any"){
+
+             if(
+                 $color == "color" ||
+                 $color == "trans"
+             ){
+
+                 $params["imgc"] = $color;
+             }elseif($color == "bnw"){
+
+                 $params["imgc"] = "gray";
+             }else{
+
+                 $tbs["ic"] = "specific";
+                 $tbs["isc"] = $color;
+             }
+         }
+
+         // type
+         if($type != "any"){
+
+             $tbs["itp"] = $type;
+         }
+
+         // format
+         if($format != "any"){
+
+             $params["as_filetype"] = $format;
+         }
+
+         // rights (tbs)
+         if($rights != "any"){
+
+             $tbs["sur"] = $rights;
+         }
+
+         // append tbs
+         if(count($tbs) !== 0){
+
+             $params["tbs"] = "";
+
+             foreach($tbs as $key => $value){
+
+                 $params["tbs"] .= $key . ":" . $value . ",";
+             }
+
+             $params["tbs"] = rtrim($params["tbs"], ",");
+         }
+     }
+     /*
+     $handle = fopen("scraper/google-img.html", "r");
+     $html = fread($handle, filesize("scraper/google-img.html"));
+     fclose($handle);*/
+
+     try{
+         $html =
+             $this->get(
+                 $proxy,
+                 "https://www.google.com/search",
+                 $params
+             );
+     }catch(Exception $error){
+
+         // keep the cause attached for debugging
+         throw new Exception("Failed to get search page", 0, $error);
+     }
+
+     $this->fuckhtml->load($html);
+
+     $this->detect_sorry();
+
+     // get javascript images
+     $this->scrape_imagearr($html);
+
+     $out = [
+         "status" => "ok",
+         "npt" => null,
+         "image" => []
+     ];
+
+     $images =
+         $this->fuckhtml
+         ->getElementsByClassName(
+             "ivg-i",
+             "div"
+         );
+
+     foreach($images as $div){
+
+         // skip tiles that lack the attributes needed to build a result
+         if(
+             !isset($div["attributes"]["data-docid"]) ||
+             !isset($div["attributes"]["data-lpage"]) ||
+             !isset($this->image_arr[$div["attributes"]["data-docid"]])
+         ){
+
+             continue;
+         }
+
+         $this->fuckhtml->load($div);
+
+         $image =
+             $this->fuckhtml
+             ->getElementsByTagName("img");
+
+         if(count($image) === 0){
+
+             continue;
+         }
+
+         $image = $image[0];
+
+         $out["image"][] = [
+             "title" =>
+                 $this->titledots(
+                     $this->fuckhtml
+                     ->getTextContent(
+                         isset($image["attributes"]["alt"]) ?
+                             $image["attributes"]["alt"] : ""
+                     )
+                 ),
+             "source" =>
+                 $this->image_arr[
+                     $div["attributes"]["data-docid"]
+                 ],
+             "url" =>
+                 $this->fuckhtml
+                 ->getTextContent(
+                     $div["attributes"]["data-lpage"]
+                 )
+         ];
+     }
+
+     // as usual, no way to check if there is a next page reliably
+     // NOTE(review): a next page is assumed only above 50 results while
+     // "start" advances by 10; confirm the step is intended
+     if(count($out["image"]) > 50){
+
+         if(!isset($params["start"])){
+
+             $params["start"] = 10;
+         }else{
+
+             $params["start"] += 10;
+         }
+
+         $out["npt"] =
+             $this->backend
+             ->store(
+                 json_encode($params),
+                 "images",
+                 $proxy
+             );
+     }
+
+     return $out;
+ }
+
+ /**
+  * Turn a result link into a clean destination URL.
+  *
+  * A link without a host is treated as a tracking URL and replaced by its
+  * "imgurl" or "q" parameter. The result is then cleaned per domain:
+  * wikipedia, imdb and youtube mobile hosts are rewritten, referrer
+  * parameters are dropped for play.google.* and twitter.com, and
+  * maps.google.* "maps?" links are reduced to their "daddr" parameter.
+  *
+  * @param string $url         Link as found in the page; decoded with
+  *                            fuckhtml's getTextContent() first.
+  * @param bool   $return_size When true, return an array instead of a
+  *                            string.
+  * @return string|array The cleaned URL, or with $return_size an array
+  *                      with "url", "ref", "thumb_width", "thumb_height",
+  *                      "image_width" and "image_height", read from the
+  *                      tracking URL's parameters (null when absent).
+  */
+ private function unshiturl($url, $return_size = false){
+
+     // decode
+     $url =
+         $this->fuckhtml
+         ->getTextContent($url);
+
+     // parameters of the tracking URL; kept in their own variable so the
+     // per-domain clean-up below cannot overwrite them
+     $tracking = [];
+
+     $url_parts = parse_url($url);
+
+     if(
+         is_array($url_parts) &&
+         !isset($url_parts["host"]) &&
+         isset($url_parts["query"])
+     ){
+
+         // no host, we have a tracking url
+         parse_str($url_parts["query"], $tracking);
+
+         if(isset($tracking["imgurl"])){
+
+             $url = $tracking["imgurl"];
+         }
+         elseif(isset($tracking["q"])){
+
+             $url = $tracking["q"];
+         }
+     }
+
+     // rewrite URLs to remove extra tracking parameters
+     $domain = parse_url($url, PHP_URL_HOST);
+
+     if(!is_string($domain)){
+
+         // no usable host: none of the patterns below should match
+         $domain = "";
+     }
+
+     if(
+         preg_match(
+             '/wikipedia\.org$/',
+             $domain
+         )
+     ){
+
+         // rewrite wikipedia mobile URLs to desktop
+         $url =
+             $this->replacedomain(
+                 $url,
+                 preg_replace(
+                     '/([a-z0-9]+)(\.m\.)/',
+                     '$1.',
+                     $domain
+                 )
+             );
+     }
+
+     elseif(
+         preg_match(
+             '/imdb\.com$|youtube\.[^.]+$/',
+             $domain
+         )
+     ){
+
+         // rewrite imdb and youtube mobile URLs too
+         $url =
+             $this->replacedomain(
+                 $url,
+                 preg_replace(
+                     '/^m\./',
+                     "",
+                     $domain
+                 )
+             );
+
+     }
+
+     elseif(
+         preg_match(
+             '/play\.google\.[^.]+$/',
+             $domain
+         )
+     ){
+
+         // remove referrers from play.google.com
+         $oldquery = parse_url($url, PHP_URL_QUERY);
+         if(is_string($oldquery)){
+
+             parse_str($oldquery, $play_query);
+             unset(
+                 $play_query["referrer"],
+                 $play_query["hl"],
+                 $play_query["gl"]
+             );
+
+             $url =
+                 str_replace(
+                     $oldquery,
+                     http_build_query($play_query),
+                     $url
+                 );
+         }
+     }
+
+     elseif(
+         preg_match(
+             '/twitter\.com$/',
+             $domain
+         )
+     ){
+         // remove more referrers from twitter.com
+         $oldquery = parse_url($url, PHP_URL_QUERY);
+         if(is_string($oldquery)){
+
+             parse_str($oldquery, $twitter_query);
+             unset($twitter_query["ref_src"]);
+
+             $url =
+                 str_replace(
+                     $oldquery,
+                     http_build_query($twitter_query),
+                     $url
+                 );
+         }
+     }
+
+     elseif(
+         preg_match(
+             '/maps\.google\.[^.]+/',
+             $domain
+         )
+     ){
+
+         if(stripos($url, "maps?") !== false){
+
+             //https://maps.google.com/maps?daddr=Johnny,+603+Rue+St+Georges,+Saint-J%C3%A9r%C3%B4me,+Quebec+J7Z+5B7
+             $oldquery = parse_url($url, PHP_URL_QUERY);
+
+             if(is_string($oldquery)){
+
+                 parse_str($oldquery, $maps_query);
+
+                 if(isset($maps_query["daddr"])){
+
+                     $url =
+                         "https://maps.google.com/maps?daddr=" .
+                         urlencode($maps_query["daddr"]);
+                 }
+             }
+         }
+     }
+
+     if($return_size){
+
+         return [
+             "url" => $url,
+             "ref" => isset($tracking["imgrefurl"]) ? $tracking["imgrefurl"] : null,
+             "thumb_width" => isset($tracking["tbnw"]) ? (int)$tracking["tbnw"] : null,
+             "thumb_height" => isset($tracking["tbnh"]) ? (int)$tracking["tbnh"] : null,
+             "image_width" => isset($tracking["w"]) ? (int)$tracking["w"] : null,
+             "image_height" => isset($tracking["h"]) ? (int)$tracking["h"] : null
+         ];
+     }
+
+     return $url;
+ }
+
+ /**
+  * Replace the host of an absolute http(s) URL.
+  *
+  * Only the authority directly after the leading scheme is replaced, so
+  * URLs embedded in the path or query are left alone. A callback is used
+  * so that the domain is inserted literally: a domain starting with a
+  * digit would otherwise be read as part of the "$1" back-reference.
+  *
+  * @param string $url    URL to rewrite.
+  * @param string $domain Host to put in place of the current one.
+  * @return string The rewritten URL; unchanged when it does not start
+  *                with http:// or https://.
+  */
+ private function replacedomain($url, $domain){
+
+     return
+         preg_replace_callback(
+             '/^(https?:\/\/)[^\/]+/',
+             function($match) use ($domain){
+
+                 return $match[1] . $domain;
+             },
+             $url,
+             1
+         );
+ }
+
+ /**
+  * Strip whitespace, dots, NUL bytes and ellipsis characters from both
+  * ends of a title.
+  *
+  * trim() works on bytes, so listing the multibyte ellipsis in its
+  * character list strips the bytes E2, 80 and A6 individually and can cut
+  * neighbouring UTF-8 characters in half. The stripping is therefore done
+  * with a UTF-8 aware pattern.
+  *
+  * @param string $title Text to clean.
+  * @return string The trimmed text.
+  */
+ private function titledots($title){
+
+     $title = (string)$title;
+
+     $trimmed =
+         preg_replace(
+             '/\A[ .\t\n\r\0\x0B\x{2026}]+|[ .\t\n\r\0\x0B\x{2026}]+\z/u',
+             "",
+             $title
+         );
+
+     if($trimmed === null){
+
+         // the /u pattern fails on invalid UTF-8: fall back to stripping
+         // the single-byte characters only
+         return trim($title, " .\t\n\r\0\x0B");
+     }
+
+     return $trimmed;
+ }
+
+ /**
+  * Convert a "h:mm:ss", "m:ss" or "s" duration string to seconds.
+  *
+  * The string is split on ":" into at most three fields; the rightmost
+  * field counts seconds, the one before it minutes and the first hours.
+  *
+  * @param string $time Duration text.
+  * @return int Total number of seconds.
+  */
+ private function hms2int($time){
+
+     $fields = explode(":", $time, 3);
+
+     $seconds = 0;
+     $weight = 1;
+
+     // walk from the seconds field leftwards, each step is worth 60x more
+     foreach(array_reverse($fields) as $field){
+
+         $seconds += (int)$field * $weight;
+         $weight *= 60;
+     }
+
+     return $seconds;
+ }
+
+ /**
+  * Abort when the document currently loaded into fuckhtml contains a div
+  * with the id "recaptcha".
+  *
+  * @throws Exception "Google returned a captcha"
+  */
+ private function detect_sorry(){
+
+     $captcha_div = $this->fuckhtml->getElementById("recaptcha", "div");
+
+     if($captcha_div === false){
+
+         // lookup reported a miss, nothing to do
+         return;
+     }
+
+     throw new Exception("Google returned a captcha");
+ }
+}