diff options
Diffstat (limited to 'scraper')
-rw-r--r-- | scraper/googlealt.php | 5029 |
1 files changed, 5029 insertions, 0 deletions
diff --git a/scraper/googlealt.php b/scraper/googlealt.php new file mode 100644 index 0000000..d7878cf --- /dev/null +++ b/scraper/googlealt.php @@ -0,0 +1,5029 @@ +<?php + +// @TODO check for consent.google.com page, if need be + +class googlealt{ + + public function __construct(){ + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); + + include "lib/backend.php"; + $this->backend = new backend("googlealt"); + } + + public function getfilters($page){ + + $base = [ + "country" => [ // gl=<country> (image: cr=countryAF) + "display" => "Country", + "option" => [ + "any" => "Instance's country", + "af" => "Afghanistan", + "al" => "Albania", + "dz" => "Algeria", + "as" => "American Samoa", + "ad" => "Andorra", + "ao" => "Angola", + "ai" => "Anguilla", + "aq" => "Antarctica", + "ag" => "Antigua and Barbuda", + "ar" => "Argentina", + "am" => "Armenia", + "aw" => "Aruba", + "au" => "Australia", + "at" => "Austria", + "az" => "Azerbaijan", + "bs" => "Bahamas", + "bh" => "Bahrain", + "bd" => "Bangladesh", + "bb" => "Barbados", + "by" => "Belarus", + "be" => "Belgium", + "bz" => "Belize", + "bj" => "Benin", + "bm" => "Bermuda", + "bt" => "Bhutan", + "bo" => "Bolivia", + "ba" => "Bosnia and Herzegovina", + "bw" => "Botswana", + "bv" => "Bouvet Island", + "br" => "Brazil", + "io" => "British Indian Ocean Territory", + "bn" => "Brunei Darussalam", + "bg" => "Bulgaria", + "bf" => "Burkina Faso", + "bi" => "Burundi", + "kh" => "Cambodia", + "cm" => "Cameroon", + "ca" => "Canada", + "cv" => "Cape Verde", + "ky" => "Cayman Islands", + "cf" => "Central African Republic", + "td" => "Chad", + "cl" => "Chile", + "cn" => "China", + "cx" => "Christmas Island", + "cc" => "Cocos (Keeling) Islands", + "co" => "Colombia", + "km" => "Comoros", + "cg" => "Congo", + "cd" => "Congo, the Democratic Republic", + "ck" => "Cook Islands", + "cr" => "Costa Rica", + "ci" => "Cote D'ivoire", + "hr" => "Croatia", + "cu" => "Cuba", + "cy" => "Cyprus", + "cz" => "Czech Republic", + "dk" => "Denmark", + "dj" => "Djibouti", + "dm" => "Dominica", + "do" => "Dominican Republic", + "ec" => "Ecuador", + "eg" => "Egypt", + "sv" => "El Salvador", + "gq" => "Equatorial Guinea", + "er" => "Eritrea", + "ee" => "Estonia", + "et" => "Ethiopia", + "fk" => "Falkland Islands (Malvinas)", + "fo" => "Faroe Islands", + "fj" => "Fiji", + "fi" => "Finland", + "fr" => "France", + "gf" => "French Guiana", + "pf" => "French Polynesia", + "tf" => "French Southern Territories", + "ga" => "Gabon", + "gm" => "Gambia", + "ge" => "Georgia", + "de" => "Germany", + "gh" => "Ghana", + "gi" => "Gibraltar", + "gr" => "Greece", + "gl" => "Greenland", + "gd" => "Grenada", + "gp" => "Guadeloupe", + "gu" => "Guam", + "gt" => "Guatemala", + "gn" => "Guinea", + "gw" => "Guinea-Bissau", + "gy" => "Guyana", + "ht" => "Haiti", + "hm" => "Heard Island and Mcdonald Islands", + "va" => "Holy See (Vatican City State)", + "hn" => "Honduras", + "hk" => "Hong Kong", + "hu" => "Hungary", + "is" => "Iceland", + "in" => "India", + "id" => "Indonesia", + "ir" => "Iran, Islamic Republic", + "iq" => "Iraq", + "ie" => "Ireland", + "il" => "Israel", + "it" => "Italy", + "jm" => "Jamaica", + "jp" => "Japan", + "jo" => "Jordan", + "kz" => "Kazakhstan", + "ke" => "Kenya", + "ki" => "Kiribati", + "kp" => "Korea, Democratic People's Republic", + "kr" => "Korea, Republic", + "kw" => "Kuwait", + "kg" => "Kyrgyzstan", + "la" => "Lao People's Democratic Republic", + "lv" => "Latvia", + "lb" => "Lebanon", + "ls" => "Lesotho", + "lr" => "Liberia", + "ly" => "Libyan Arab Jamahiriya", + "li" => "Liechtenstein", + "lt" => "Lithuania", + "lu" => "Luxembourg", + "mo" => "Macao", + "mk" => "Macedonia, the Former Yugosalv Republic", + "mg" => "Madagascar", + "mw" => "Malawi", + "my" => "Malaysia", + "mv" => "Maldives", + "ml" => "Mali", + "mt" => "Malta", + "mh" => "Marshall Islands", + "mq" => "Martinique", + "mr" => "Mauritania", + "mu" => "Mauritius", + "yt" => "Mayotte", + "mx" => "Mexico", + "fm" => "Micronesia, Federated States", + "md" => "Moldova, Republic", + "mc" => "Monaco", + "mn" => "Mongolia", + "ms" => "Montserrat", + "ma" => "Morocco", + "mz" => "Mozambique", + "mm" => "Myanmar", + "na" => "Namibia", + "nr" => "Nauru", + "np" => "Nepal", + "nl" => "Netherlands", + "an" => "Netherlands Antilles", + "nc" => "New Caledonia", + "nz" => "New Zealand", + "ni" => "Nicaragua", + "ne" => "Niger", + "ng" => "Nigeria", + "nu" => "Niue", + "nf" => "Norfolk Island", + "mp" => "Northern Mariana Islands", + "no" => "Norway", + "om" => "Oman", + "pk" => "Pakistan", + "pw" => "Palau", + "ps" => "Palestinian Territory, Occupied", + "pa" => "Panama", + "pg" => "Papua New Guinea", + "py" => "Paraguay", + "pe" => "Peru", + "ph" => "Philippines", + "pn" => "Pitcairn", + "pl" => "Poland", + "pt" => "Portugal", + "pr" => "Puerto Rico", + "qa" => "Qatar", + "re" => "Reunion", + "ro" => "Romania", + "ru" => "Russian Federation", + "rw" => "Rwanda", + "sh" => "Saint Helena", + "kn" => "Saint Kitts and Nevis", + "lc" => "Saint Lucia", + "pm" => "Saint Pierre and Miquelon", + "vc" => "Saint Vincent and the Grenadines", + "ws" => "Samoa", + "sm" => "San Marino", + "st" => "Sao Tome and Principe", + "sa" => "Saudi Arabia", + "sn" => "Senegal", + "cs" => "Serbia and Montenegro", + "sc" => "Seychelles", + "sl" => "Sierra Leone", + "sg" => "Singapore", + "sk" => "Slovakia", + "si" => "Slovenia", + "sb" => "Solomon Islands", + "so" => "Somalia", + "za" => "South Africa", + "gs" => "South Georgia and the South Sandwich Islands", + "es" => "Spain", + "lk" => "Sri Lanka", + "sd" => "Sudan", + "sr" => "Suriname", + "sj" => "Svalbard and Jan Mayen", + "sz" => "Swaziland", + "se" => "Sweden", + "ch" => "Switzerland", + "sy" => "Syrian Arab Republic", + "tw" => "Taiwan, Province of China", + "tj" => "Tajikistan", + "tz" => "Tanzania, United Republic", + "th" => "Thailand", + "tl" => "Timor-Leste", + "tg" => "Togo", + "tk" => "Tokelau", + "to" => "Tonga", + "tt" => "Trinidad and Tobago", + "tn" => "Tunisia", + "tr" => "Turkey", + "tm" => "Turkmenistan", + "tc" => "Turks and Caicos Islands", + "tv" => "Tuvalu", + "ug" => "Uganda", + "ua" => "Ukraine", + "ae" => "United Arab Emirates", + "uk" => "United Kingdom", + "us" => "United States", + "um" => "United States Minor Outlying Islands", + "uy" => "Uruguay", + "uz" => "Uzbekistan", + "vu" => "Vanuatu", + "ve" => "Venezuela", + "vn" => "Viet Nam", + "vg" => "Virgin Islands, British", + "vi" => "Virgin Islands, U.S.", + "wf" => "Wallis and Futuna", + "eh" => "Western Sahara", + "ye" => "Yemen", + "zm" => "Zambia", + "zw" => "Zimbabwe" + ] + ], + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", // safe=active + "no" => "No" // safe=off + ] + ] + ]; + + switch($page){ + + case "web": + return array_merge( + $base, + [ + "lang" => [ // lr=<lang> (prefix lang with "lang_") + "display" => "Language", + "option" => [ + "any" => "Any language", + "ar" => "Arabic", + "bg" => "Bulgarian", + "ca" => "Catalan", + "cs" => "Czech", + "da" => "Danish", + "de" => "German", + "el" => "Greek", + "en" => "English", + "es" => "Spanish", + "et" => "Estonian", + "fi" => "Finnish", + "fr" => "French", + "hr" => "Croatian", + "hu" => "Hungarian", + "id" => "Indonesian", + "is" => "Icelandic", + "it" => "Italian", + "iw" => "Hebrew", + "ja" => "Japanese", + "ko" => "Korean", + "lt" => "Lithuanian", + "lv" => "Latvian", + "nl" => "Dutch", + "no" => "Norwegian", + "pl" => "Polish", + "pt" => "Portuguese", + "ro" => "Romanian", + "ru" => "Russian", + "sk" => "Slovak", + "sl" => "Slovenian", + "sr" => "Serbian", + "sv" => "Swedish", + "tr" => "Turkish", + "zh-CN" => "Chinese (Simplified)", + "zh-TW" => "Chinese (Traditional)" + ] + ], + "newer" => [ // tbs + "display" => "Newer than", + "option" => "_DATE" + ], + "older" => [ + "display" => "Older than", + "option" => "_DATE" + ], + "spellcheck" => [ + "display" => "Spellcheck", + "option" => [ + "yes" => "Yes", + "no" => "No" + ] + ] + ] + ); + break; + + case "images": + return array_merge( + $base, + [ + "time" => [ // tbs=qdr:<time> + "display" => "Time posted", + "option" => [ + "any" => "Any time", + "d" => "Past 24 hours", + "w" => "Past week", + "m" => "Past month", + "y" => "Past year" + ] + ], + "size" => [ // imgsz + "display" => "Size", + "option" => [ + "any" => "Any size", + "l" => "Large", + "m" => "Medium", + "i" => "Icon", + "qsvga" => "Larger than 400x300", + "vga" => "Larger than 640x480", + "svga" => "Larger than 800x600", + "xga" => "Larger than 1024x768", + "2mp" => "Larger than 2MP", + "4mp" => "Larger than 4MP", + "6mp" => "Larger than 6MP", + "8mp" => "Larger than 8MP", + "10mp" => "Larger than 10MP", + "12mp" => "Larger than 12MP", + "15mp" => "Larger than 15MP", + "20mp" => "Larger than 20MP", + "40mp" => "Larger than 40MP", + "70mp" => "Larger than 70MP" + ] + ], + "ratio" => [ // imgar + "display" => "Aspect ratio", + "option" => [ + "any" => "Any ratio", + "t|xt" => "Tall", + "s" => "Square", + "w" => "Wide", + "xw" => "Panoramic" + ] + ], + "color" => [ // imgc + "display" => "Color", + "option" => [ + "any" => "Any color", + "color" => "Full color", + "bnw" => "Black & white", + "trans" => "Transparent", + // from here, imgcolor + "red" => "Red", + "orange" => "Orange", + "yellow" => "Yellow", + "green" => "Green", + "teal" => "Teal", + "blue" => "Blue", + "purple" => "Purple", + "pink" => "Pink", + "white" => "White", + "gray" => "Gray", + "black" => "Black", + "brown" => "Brown" + ] + ], + "type" => [ // tbs=itp:<type> + "display" => "Type", + "option" => [ + "any" => "Any type", + "clipart" => "Clip Art", + "lineart" => "Line Drawing", + "animated" => "Animated" + ] + ], + "format" => [ // as_filetype + "display" => "Format", + "option" => [ + "any" => "Any format", + "jpg" => "JPG", + "gif" => "GIF", + "png" => "PNG", + "bmp" => "BMP", + "svg" => "SVG", + "webp" => "WEBP", + "ico" => "ICO", + "craw" => "RAW" + ] + ], + "rights" => [ // tbs=sur:<rights> + "display" => "Usage rights", + "option" => [ + "any" => "Any license", + "cl" => "Creative Commons licenses", + "ol" => "Commercial & other licenses" + ] + ] + ] + ); + break; + + case "videos": + return array_merge( + $base, + [ + "newer" => [ // tbs + "display" => "Newer than", + "option" => "_DATE" + ], + "older" => [ + "display" => "Older than", + "option" => "_DATE" + ], + "duration" => [ + "display" => "Duration", + "option" => [ + "any" => "Any duration", + "s" => "Short (0-4min)", // tbs=dur:s + "m" => "Medium (4-20min)", // tbs=dur:m + "l" => "Long (20+ min)" // tbs=dur:l + ] + ], + "quality" => [ + "display" => "Quality", + "option" => [ + "any" => "Any quality", + "h" => "High quality" // tbs=hq:h + ] + ], + "captions" => [ + "display" => "Captions", + "option" => [ + "any" => "No preference", + "yes" => "Closed captioned" // tbs=cc:1 + ] + ] + ] + ); + break; + + case "news": + return array_merge( + $base, + [ + "newer" => [ // tbs + "display" => "Newer than", + "option" => "_DATE" + ], + "older" => [ + "display" => "Older than", + "option" => "_DATE" + ], + "sort" => [ + "display" => "Sort", + "option" => [ + "relevance" => "Relevance", + "date" => "Date" // sbd:1 + ] + ] + ] + ); + break; + } + } + + private function get($proxy, $url, $get = []){ + + $headers = [ + "User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + //"Cookie: SOCS=CAESNQgCEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwMzE3LjA4X3AwGgJlbiAEGgYIgM7orwY", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1", + "Priority: u=1", + "TE: trailers" + ]; + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); + curl_setopt($curlproc, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V6); + + + // use http2 + curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + // follow redirects + curl_setopt($curlproc, CURLOPT_FOLLOWLOCATION, true); + + $this->backend->assign_proxy($curlproc, $proxy); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + return $data; + } + + + + + private function parsepage($html, $pagetype, $search, $proxy, $params){ + + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + $this->fuckhtml->load($html); + + $this->detect_sorry(); + + // parse all <style> tags + $this->parsestyles(); + + // get javascript images + $this->scrape_dimg($html); + + // get html blobs + preg_match_all( + '/function\(\){window\.jsl\.dh\(\'([^\']+?)\',\'(.+?[^\'])\'\);/', + $html, + $blobs + ); + + $this->blobs = []; + if(isset($blobs[1])){ + + for($i=0; $i<count($blobs[1]); $i++){ + + $this->blobs[$blobs[1][$i]] = + $this->fuckhtml + ->parseJsString( + $blobs[2][$i] + ); + } + } + + $this->scrape_imagearr($html); + + // + // load result column + // + $result_div = + $this->fuckhtml + ->getElementById( + "center_col", + "div" + ); + + if($result_div === false){ + + throw new Exception("Failed to grep result div"); + } + + $this->fuckhtml->load($result_div); + + // + // Get word corrections + // + $correction = + $this->fuckhtml + ->getElementById( + "fprs", + "p" + ); + + if($correction){ + + $this->fuckhtml->load($correction); + + $a = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + $using = + $this->fuckhtml + ->getElementById( + "fprsl", + $a + ); + + if($using){ + + $using = + $this->fuckhtml + ->getTextContent( + $using + ); + + $spans = + $this->fuckhtml + ->getElementsByTagName( + "span" + ); + + $type_span = + $this->fuckhtml + ->getTextContent( + $spans[0] + ); + + $type = "not_many"; + + if( + stripos( + $type_span, + "Showing results for" + ) !== false + ){ + + $type = "including"; + } + + $correction = + $this->fuckhtml + ->getTextContent( + $a[count($a) - 1] + ); + + $out["spelling"] = [ + "type" => $type, + "using" => $using, + "correction" => $correction + ]; + } + + // reset + $this->fuckhtml->load($result_div); + }else{ + + // get the "Did you mean?" prompt + $taw = + $this->fuckhtml + ->getElementById( + "taw" + ); + + if($taw){ + + $this->fuckhtml->load($taw); + + $as = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($as) !== 0){ + + $text = + $this->fuckhtml + ->getTextContent( + $as[0] + ); + + // @TODO implement did_you_mean + $out["spelling"] = [ + "type" => "including", + "using" => $search, + "correction" => $text + ]; + } + } + + $this->fuckhtml->load($result_div); + } + + // + // get notices + // + $botstuff = + $this->fuckhtml + ->getElementById( + "botstuff" + ); + + // important for later + $last_page = false; + + if($botstuff){ + + $this->fuckhtml->load($botstuff); + + $cards = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "line-height" => "normal" + ] + ), + "div" + ); + + foreach($cards as $card){ + + $this->fuckhtml->load($card); + + $h2 = + $this->fuckhtml + ->getElementsByTagName( + "h2" + ); + + if(count($h2) !== 0){ + + $title = + $this->fuckhtml + ->getTextContent( + $h2[0] + ); + + $card["innerHTML"] = + str_replace( + $h2[0]["outerHTML"], + "", + $card["innerHTML"] + ); + }else{ + + $title = "Notice"; + } + + $description = []; + + $as = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($as) !== 0){ + + $first = true; + + foreach($as as $a){ + + $text_link = + $this->fuckhtml + ->getTextContent( + $a + ); + + if(stripos($text_link, "repeat the search") !== false){ + + $last_page = true; + break 2; + } + + $parts = + explode( + $a["outerHTML"], + $card["innerHTML"], + 2 + ); + + $card["innerHTML"] = $parts[1]; + + $value = + preg_replace( + '/ +/', + " ", + $this->fuckhtml + ->getTextContent( + $parts[0], + false, + false + ) + ); + + if(strlen(trim($value)) !== 0){ + + $description[] = [ + "type" => "text", + "value" => $value + ]; + + if($first){ + + $description[0]["value"] = + ltrim($description[0]["value"]); + } + } + + $first = false; + + $description[] = [ + "type" => "link", + "url" => + $this->fuckhtml + ->getTextContent( + $a["attributes"] + ["href"] + ), + "value" => $text_link + ]; + } + + $text = + $this->fuckhtml + ->getTextContent( + $card["innerHTML"], + false, + false + ); + + if(strlen(trim($text)) !== 0){ + + $description[] = [ + "type" => "text", + "value" => + rtrim( + $text + ) + ]; + } + + }else{ + + // @TODO: Check if this ever gets populated without giving me garbage + /* + $text = + $this->fuckhtml + ->getTextContent( + $card + ); + + if($text != ""){ + $description[] = [ + "type" => "text", + "value" => $text + ]; + }*/ + } + + if(count($description) !== 0){ + + $out["answer"][] = [ + "title" => $title, + "description" => $description, + "url" => null, + "thumb" => null, + "table" => [], + "sublink" => [] + ]; + } + } + + // reset + $this->fuckhtml->load($html); + } + + // + // get "Related Searches" and "People also search for" + // + $relateds = + $this->fuckhtml + ->getElementsByClassName( + "wyccme", + "div" + ); + + foreach($relateds as $related){ + + $text = + $this->fuckhtml + ->getTextContent( + $related + ); + + if($text == "More results"){ continue; } + + $out["related"][] = $text; + } + + // + // Get text results + // + $results = + $this->fuckhtml + ->getElementsByClassName( + "g", + "div" + ); + + $this->skip_next = false; + + foreach($results as $result){ + + if($this->skip_next){ + + $this->skip_next = false; + continue; + } + + $this->fuckhtml->load($result); + + $web = [ + "title" => null, + "description" => null, + "url" => null, + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + + // Detect presence of sublinks + $g = + $this->fuckhtml + ->getElementsByClassName( + "g", + "div" + ); + + $sublinks = []; + if(count($g) > 0){ + + $table = + $this->fuckhtml + ->getElementsByTagName( + "table" + ); + + if(count($table) !== 0){ + + // found some sublinks! + + $this->fuckhtml->load($table[0]); + + $tds = + $this->fuckhtml + ->getElementsByTagName( + "td" + ); + + foreach($tds as $td){ + + $this->fuckhtml->load($td); + + $a = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if( + count($a) === 0 || + ( + isset($a[0]["attributes"]["class"]) && + $a[0]["attributes"]["class"] == "fl" + ) + ){ + + continue; + } + + $td["innerHTML"] = + str_replace( + $a[0]["outerHTML"], + "", + $td["innerHTML"] + ); + + $web["sublink"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $a[0] + ) + ), + "description" => + html_entity_decode( + $this->titledots( + $this->fuckhtml + ->getTextContent( + $td + ) + ) + ), + "url" => + $this->unshiturl( + $a[0] + ["attributes"] + ["href"] + ), + "date" => null + ]; + } + + // reset + $this->fuckhtml->load($result); + } + + // skip on next iteration + $this->skip_next = true; + } + + // get title + $h3 = + $this->fuckhtml + ->getElementsByTagName( + "h3" + ); + + if(count($h3) === 0){ + + continue; + } + + $web["title"] = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $h3[0] + ) + ); + + // get url + $as = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + $web["url"] = + $this->unshiturl( + $as[0] + ["attributes"] + ["href"] + ); + + if( + !preg_match( + '/^http/', + $web["url"] + ) + ){ + + // skip if invalid url is found + continue; + } + + // + // probe for twitter carousel + // + $carousel = + $this->fuckhtml + ->getElementsByTagName( + "g-scrolling-carousel" + ); + + if(count($carousel) !== 0){ + + $this->fuckhtml->load($carousel[0]); + + $items = + $this->fuckhtml + ->getElementsByTagName( + "g-inner-card" + ); + + $has_thumbnail = false; + + foreach($items as $item){ + + $this->fuckhtml->load($item); + + if($has_thumbnail === false){ + + // get thumbnail + $thumb = + $this->fuckhtml + ->getElementsByTagName( + "img" + ); + + if( + count($thumb) !== 0 && + isset($thumb[0]["attributes"]["id"]) + ){ + + $web["thumb"] = [ + "url" => + $this->getdimg( + $thumb[0]["attributes"]["id"] + ), + "ratio" => "16:9" + ]; + + $has_thumbnail = true; + } + + // or else, try getting a thumbnail from next container + } + + // cache div + $div = + $this->fuckhtml + ->getElementsByTagName( + "div" + ); + + // get link + $links = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + // get description of carousel sublink + $description = + $this->fuckhtml + ->getElementsByAttributeValue( + "role", + "heading", + $div + ); + + if(count($description) !== 0){ + + $description = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $description[0] + ) + ); + }else{ + + $description = null; + } + + $bottom = + $this->fuckhtml + ->getElementsByAttributeValue( + "style", + "z-index:2", + $div + ); + + $title = null; + $date = null; + if(count($bottom) !== 0){ + + $this->fuckhtml->load($bottom[0]); + + $spans = + $this->fuckhtml + ->getElementsByTagName( + "span" + ); + + $title = + $this->fuckhtml + ->getTextContent( + $spans[0] + ); + + $date = + strtotime( + $this->fuckhtml + ->getTextContent( + $spans[count($spans) - 1] + ) + ); + } + + $web["sublink"][] = [ + "title" => $title, + "description" => $description, + "url" => + $this->unshiturl( + $links[0] + ["attributes"] + ["href"] + ), + "date" => $date + ]; + } + + $out["web"][] = $web; + continue; + } + + // + // get viewcount, time posted and follower count from <cite> tag + // + $cite = + $this->fuckhtml + ->getElementsByTagName( + "cite" + ); + + if(count($cite) !== 0){ + + $this->fuckhtml->load($cite[0]); + + $spans = + $this->fuckhtml + ->getElementsByTagName("span"); + + if(count($spans) === 0){ + + $cites = + explode( + "·", + $this->fuckhtml + ->getTextContent( + $cite[0] + ) + ); + + foreach($cites as $cite){ + + $cite = trim($cite); + + if( + preg_match( + '/(.+) (views|followers|likes)$/', + $cite, + $match + ) + ){ + + $web["table"][ucfirst($match[2])] = + $match[1]; + }elseif( + preg_match( + '/ago$/', + $cite + ) + ){ + + $web["date"] = + strtotime($cite); + } + } + } + + // reset + $this->fuckhtml->load($result); + } + + // + // attempt to fetch description cleanly + // + $description = + $this->fuckhtml + ->getElementsByAttributeValue( + "style", + "-webkit-line-clamp:2" + ); + + if(count($description) !== 0){ + + $web["description"] = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $description[0] + ) + ); + }else{ + + // use ANOTHER method where the description is a header of the result + $description = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-attrid", + "wa:/description" + ); + + if(count($description) !== 0){ + + // get date off that shit + $date = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "font-size" => "12px", + "line-height" => "1.34", + "display" => "inline-block", + "font-family" => "google sans,arial,sans-serif", + "padding-right" => "0", + "white-space" => "nowrap" + ] + ), + "span" + ); + + if(count($date) !== 0){ + + $description[0]["innerHTML"] = + str_replace( + $date[0]["outerHTML"], + "", + $description[0]["innerHTML"] + ); + + $web["date"] = + strtotime( + $this->fuckhtml + ->getTextContent( + $date[0] + ) + ); + } + + $web["description"] = + $this->fuckhtml + ->getTextContent( + $description[0] + ); + }else{ + + // Yes.. You guessed it, use ANOTHER method to get descriptions + // off youtube containers + $description = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "-webkit-box-orient" => "vertical", + "display" => "-webkit-box", + "font-size" => "14px", + "-webkit-line-clamp" => "2", + "line-height" => "22px", + "overflow" => "hidden", + "word-break" => "break-word", + "color" => "#4d5156" + ] + ), + "div" + ); + + if(count($description) !== 0){ + + // check for video duration + $duration = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "background-color" => "rgba(0,0,0,0.6)", + "color" => "#fff", + "fill" => "#fff" + ] + ), + "div" + ); + + if(count($duration) !== 0){ + + $web["table"]["Duration"] = + $this->fuckhtml + ->getTextContent( + $duration[0] + ); + } + + $web["description"] = + $this->titledots( + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $description[0] + ) + ) + ); + + // get author + time posted + $info = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "color" => "var(" . $this->getcolorvar("#70757a") . ")", + "font-size" => "14px", + "line-height" => "20px", + "margin-top" => "12px" + ] + ), + "div" + ); + + if(count($info) !== 0){ + + $info = + explode( + "·", + $this->fuckhtml + ->getTextContent( + $info[0] + ) + ); + + switch(count($info)){ + + case 3: + $web["table"]["Author"] = trim($info[1]); + $web["date"] = strtotime(trim($info[2])); + break; + + case 2: + $web["date"] = strtotime(trim($info[1])); + break; + } + } + } + } + } + + // + // get categories of content within the search result + // + $cats = + $this->fuckhtml + ->getElementsByAttributeName( + "data-sncf", + "div" + ); + + foreach($cats as $cat){ + + $this->fuckhtml->load($cat); + + // detect image category + $images = + $this->fuckhtml + ->getElementsByTagName( + "img" + ); + + if(count($images) !== 0){ + + foreach($images as $image){ + + if(isset($image["attributes"]["id"])){ + // we found an image + + if(isset($image["attributes"]["width"])){ + + $width = (int)$image["attributes"]["width"]; + + if($width == 110){ + + $ratio = "1:1"; + }elseif($width > 110){ + + $ratio = "16:9"; + }else{ + + $ratio = "9:16"; + } + }else{ + + $ratio = "1:1"; + } + + $web["thumb"] = [ + "url" => $this->getdimg($image["attributes"]["id"]), + "ratio" => $ratio + ]; + + continue 2; + } + } + } + + // Detect rating + $spans_unfiltered = + $this->fuckhtml + ->getElementsByTagName( + "span" + ); + + $spans = + $this->fuckhtml + ->getElementsByAttributeName( + "aria-label", + $spans_unfiltered + ); + + foreach($spans as $span){ + + if( + preg_match( + '/^Rated/', + $span["attributes"]["aria-label"] + ) + ){ + + // found rating + // scrape rating + preg_match( + '/([0-9.]+).*([0-9.]+)/', + $span["attributes"]["aria-label"], + $rating + ); + + if(isset($rating[1])){ + + $web["table"]["Rating"] = + $rating[1] . "/" . $rating[2]; + } + + $has_seen_reviews = 0; + foreach($spans_unfiltered as $span_unfiltered){ + + if( + preg_match( + '/([0-9,.]+) +([A-z]+)$/', + $this->fuckhtml + ->getTextContent( + $span_unfiltered + ), + $votes + ) + ){ + + $has_seen_reviews++; + $web["table"][ucfirst($votes[2])] = $votes[1]; + continue; + } + + $text = + $this->fuckhtml + ->getTextContent( + $span_unfiltered + ); + + if( + $text == " " || + $text == "" + ){ + + break; + } + + switch($has_seen_reviews){ + + case 1: + // scrape price + $web["table"]["Price"] = $text; + $has_seen_reviews++; + break; + + case 2: + // scrape platform + $web["table"]["Platform"] = $text; + $has_seen_reviews++; + break; + + case 3: + // Scrape type + $web["table"]["Medium"] = $text; + break; + } + } + + continue 2; + } + } + + // check if its a table of small sublinks + $table = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "display" => "table", + "white-space" => "nowrap", + "margin" => "5px 0", + "line-height" => "1.58", + "color" => "var(" . $this->getcolorvar("#70757a") . ")" + ] + ), + "div" + ); + + if(count($table) !== 0){ + + $this->fuckhtml->load($table[0]); + + $rows = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "display" => "flex", + "white-space" => "normal" + ] + ), + "div" + ); + + foreach($rows as $row){ + + $this->fuckhtml->load($row); + + $sublink = [ + "title" => null, + "description" => null, + "url" => null, + "date" => null + ]; + + $link = + $this->fuckhtml + ->getElementsByTagName( + "a" + )[0]; + + $sublink["title"] = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $link + ) + ); + + $sublink["url"] = + $this->unshiturl( + $link + ["attributes"] + ["href"] + ); + + $row["innerHTML"] = + str_replace( + $link["outerHTML"], + "", + $row["innerHTML"] + ); + + $this->fuckhtml->load($row); + + $spans = + $this->fuckhtml + ->getElementsByTagName( + "span" + ); + + foreach($spans as $span){ + + $text = + $this->fuckhtml + ->getTextContent( + $span + ); + + if( + preg_match( + '/answers?$/', + $text + ) + ){ + + $sublink["description"] = + $text; + + continue; + } + + $time = strtotime($text); + + if($time !== false){ + + $sublink["date"] = $time; + } + } + + $web["sublink"][] = $sublink; + } + + // reset + $this->fuckhtml->load($cat); + continue; + } + + // check if its an answer header + $answer_header = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "overflow" => "hidden", + "text-overflow" => "ellipsis" + ] + ), + "span" + ); + + if(count($answer_header) !== 0){ + + $link = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + $cat["innerHTML"] = + str_replace( + $link[0]["outerHTML"], + "", + $cat["innerHTML"] + ); + + $web["sublink"][] = [ + "title" => + $this->fuckhtml + ->getTextContent( + $link[0] + ), + "description" => + $this->titledots( + trim( + str_replace( + "\xc2\xa0", + " ", + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $cat + ) + ) + ), + " ·" + ) + ), + "url" => + $this->fuckhtml + ->getTextContent( + $link[0] + ["attributes"] + ["href"] + ), + "date" => null + ]; + + continue; + } + + // check if its list of small sublinks + $urls = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($urls) !== 0){ + + // found small links + foreach($urls as $url){ + + $target = + $this->fuckhtml + ->getTextContent( + $url + ["attributes"] + ["href"] + ); + + if( + !preg_match( + '/^http/', + $target + ) + ){ + + continue; + } + + $web["sublink"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $url + ) + ), + "description" => null, + "url" => $target, + "date" => null + ]; + } + + continue; + } + + // we probed everything, assume this is the description + // if we didn't find one cleanly previously + if($web["description"] === null){ + $web["description"] = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $cat + ) + ); + } + } + + // check if description contains date + $description = explode("—", $web["description"], 2); + + if( + count($description) === 2 && + strlen($description[0]) <= 20 + ){ + + $date = strtotime($description[0]); + + if($date !== false){ + + $web["date"] = $date; + $web["description"] = ltrim($description[1]); + } + } + + // fetch youtube thumbnail + $thumbnail = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "border-radius" => "8px", + "height" => "fit-content", + "justify-content" => "center", + "margin-right" => "20px", + "margin-top" => "4px", + "position" => "relative", + "width" => "fit-content" + ] + ), + "div" + ); + + if(count($thumbnail) !== 0){ + + // load thumbnail container + $this->fuckhtml->load($thumbnail[0]); + + $image = + $this->fuckhtml + ->getElementsByTagName( + "img" + ); + + if( + count($image) !== 0 && + isset($image[0]["attributes"]["id"]) + ){ + + $web["thumb"] = [ + "url" => + $this->unshit_thumb( + $this->getdimg( + $image[0]["attributes"]["id"] + ) + ), + "ratio" => "16:9" + ]; + } + + // reset + $this->fuckhtml->load($result); + } + + $out["web"][] = $web; + } + + // reset + $this->fuckhtml->load($result_div); + + // + // Get instant answers + // + $answer_containers = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "padding-left" => "0px", + "padding-right" => "0px" + ] + ), + "div" + ); + + $date_class = + $this->getstyle( + [ + "font-size" => "12px", + "line-height" => "1.34", + "display" => "inline-block", + "font-family" => "google sans,arial,sans-serif", + "padding-right" => "0", + "white-space" => "nowrap" + ] + ); + + foreach($answer_containers as $container){ + + $this->fuckhtml->load($container); + + $web = [ + "title" => null, + "description" => null, + "url" => null, + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + + $answers = + $this->fuckhtml + ->getElementsByAttributeName( + "aria-controls", + "div" + ); + + $item_insert_pos = 1; + foreach($answers as $answer){ + + $out["related"][] = + $this->fuckhtml + ->getTextContent( + $answer + ); + + if( + isset( + $this->blobs[ + $answer + ["attributes"] + ["aria-controls"] + ] + ) + ){ + + $this->fuckhtml->load( + $this->blobs[ + $answer + ["attributes"] + ["aria-controls"] + ] + ); + + $divs = + $this->fuckhtml + ->getElementsByAttributeName( + "id", + "div" + ); + + foreach($divs as $div){ + + if( + !isset( + $this->blobs[ + $div + ["attributes"] + ["id"] + ] + ) + ){ + + continue; + } + + $this->fuckhtml->load( + $this->blobs[ + $div + ["attributes"] + ["id"] + ] + ); + + // get url + $as = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($as) !== 0){ + + $web["url"] = + $this->unshiturl( + $as[0]["attributes"]["href"] + ); + + // skip entries that redirect to a search + if( + !preg_match( + '/^http/', + $web["url"] + ) + ){ + + continue 3; + } + } + + // get title + $h3 = + $this->fuckhtml + ->getElementsByTagName( + "h3" + ); + + if(count($h3) !== 0){ + + $web["title"] = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $h3[0] + ) + ); + } + + $description = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-attrid", + "wa:/description", + "div" + ); + + if(count($description) !== 0){ + + // check for date + $this->fuckhtml->load($description[0]); + + $date = + $this->fuckhtml + ->getElementsByClassName( + $date_class, + "span" + ); + + if(count($date) !== 0){ + + $description[0]["innerHTML"] = + str_replace( + $date[0]["outerHTML"], + "", + $description[0]["innerHTML"] + ); + + $web["date"] = + strtotime( + $this->fuckhtml + ->getTextContent( + $date[0] + ) + ); + } + + $web["description"] = + ltrim( + $this->fuckhtml + ->getTextContent( + $description[0] + ), + ": " + ); + } + } + + foreach($out["web"] as $item){ + + if($item["url"] == $web["url"]){ + + continue 2; + } + } + + array_splice($out["web"], $item_insert_pos, 0, [$web]); + $item_insert_pos++; + } + } + } + + // reset + $this->fuckhtml->load($result_div); + + // + // Scrape word definition + // + $definition_container = + $this->fuckhtml + ->getElementsByClassName( + "lr_container", + "div" + ); + + if(count($definition_container) !== 0){ + + $this->fuckhtml->load($definition_container[0]); + + // get header + $header = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-attrid", + "EntryHeader", + "div" + ); + + if(count($header) !== 0){ + + $description = []; + + $this->fuckhtml->load($header[0]); + + $title_div = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "font-family" => "google sans,arial,sans-serif", + "font-size" => "28px", + "line-height" => "36px" + ] + ) + ); + + if(count($title_div) !== 0){ + + $title = + $this->fuckhtml + ->getTextContent( + $title_div[0] + ); + }else{ + + $title = "Word definition"; + } + + $subtext_div = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "font-family" => "arial,sans-serif", + "font-size" => "14px", + "line-height" => "22px" + ] + ), + "span" + ); + + if(count($subtext_div) !== 0){ + + $description[] = [ + "type" => "quote", + "value" => + $this->fuckhtml + ->getTextContent( + $subtext_div[0] + ) + ]; + } + + // get audio + $audio = + $this->fuckhtml + ->getElementsByTagName( + "audio" + ); + + if(count($audio) !== 0){ + + $this->fuckhtml->load($audio[0]); + + $source = + $this->fuckhtml + ->getElementsByTagName( + "source" + ); + + if(count($source) !== 0){ + + $description[] = [ + "type" => "audio", + "url" => + preg_replace( + '/^\/\//', + "https://", + $this->fuckhtml + ->getTextContent( + $source[0] + ["attributes"] + ["src"] + ) + ) + ]; + } + + } + + // remove header to avoid confusion + $definition_container[0]["innerHTML"] = + str_replace( + $header[0]["outerHTML"], + "", + $definition_container[0]["innerHTML"] + ); + + // reset + $this->fuckhtml->load($definition_container[0]); + + $vmods = + $this->fuckhtml + ->getElementsByClassName( + "vmod", + "div" + ); + + foreach($vmods as $category){ + + if( + !isset( + $category + ["attributes"] + ["data-topic"] + ) || + $category + ["attributes"] + ["class"] != "vmod" + ){ + + continue; + } + + $this->fuckhtml->load($category); + + // get category type + $type = + $this->fuckhtml + ->getElementsByTagName( + "i" + ); + + if(count($type) !== 0){ + + $description[] = [ + "type" => "title", + "value" => + $this->fuckhtml + ->getTextContent( + $type[0] + ) + ]; + } + + // get heading text + $headings = + $this->fuckhtml + ->getElementsByClassName( + "xpdxpnd", + "div" + ); + + foreach($headings as $heading){ + + $description[] = [ + "type" => "quote", + "value" => + $this->fuckhtml + ->getTextContent( + $heading + ) + ]; + } + + $definitions = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-attrid", + "SenseDefinition", + "div" + ); + + $i = 1; + $text = []; + + foreach($definitions as $definition){ + + $text[] = + $i . ". " . + $this->fuckhtml + ->getTextContent( + $definition + ); + + $i++; + } + + if(count($text) !== 0){ + + $description[] = [ + "type" => "text", + "value" => + implode("\n", $text) + ]; + } + } + + $out["answer"][] = [ + "title" => $title, + "description" => $description, + "url" => null, + "thumb" => null, + "table" => [], + "sublink" => [] + ]; + } + + // reset + $this->fuckhtml->load($result_div); + } + + // + // scrape elements with a g-section-with-header + // includes: images, news carousels + // + + $g_sections = + $this->fuckhtml + ->getElementsByTagName( + "g-section-with-header" + ); + + if(count($g_sections) !== 0){ + foreach($g_sections as $g_section){ + + // parse elements with a g-section-with-header + $this->fuckhtml->load($g_section); + + $div_title = + $this->fuckhtml + ->getElementsByClassName( + "a-no-hover-decoration", + "a" + ); + + if(count($div_title) !== 0){ + + // title detected, skip + continue; + } + + // no title detected: detect news container + $news = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "outline-offset" => "-1px", + "display" => "flex", + "flex-direction" => "column", + "flex-grow" => "1" + ] + ) + ); + + foreach($news as $new){ + + $this->fuckhtml->load($new); + + $image = + $this->fuckhtml + ->getElementsByAttributeName( + "id", + "img" + ); + + if( + count($image) !== 0 && + !( + isset($image[0]["attributes"]["style"]) && + strpos( + $image[0]["attributes"]["style"], + "height:18px" + ) !== false + ) + ){ + + $thumb = [ + "url" => + $this->getdimg( + $image[0] + ["attributes"] + ["id"] + ), + "ratio" => "1:1" + ]; + } + + $title = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByAttributeValue( + "role", + "heading", + "div" + )[0] + ) + ); + + $date_div = + $this->fuckhtml + ->getElementsByAttributeName( + "style", + "div" + ); + + if(count($date_div) !== 0){ + + foreach($date_div as $div){ + + if( + strpos( + $div["attributes"]["style"], + "bottom:" + ) !== false + ){ + $date = + strtotime( + $this->fuckhtml + ->getTextContent( + $div + ) + ); + + break; + } + } + }else{ + + $date = null; + } + + $out["news"][] = [ + "title" => $title, + "description" => null, + "date" => $date, + "thumb" => $thumb, + "url" => + $this->fuckhtml + ->getTextContent( + $new + ["attributes"] + ["href"] + ) + ]; + } + } + + // reset + $this->fuckhtml->load($result_div); + } + + // + // Parse images (carousel, left hand-side) + // + $image_carousels = + $this->fuckhtml + ->getElementsByAttributeValue( + "id", + "media_result_group", + "div" + ); + + if(count($image_carousels) !== 0){ + + foreach($image_carousels as $image_carousel){ + + $this->fuckhtml->load($image_carousel); + + // get related searches in image carousel + $relateds = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "display" => "inline-block", + "margin-right" => "6px", + "outline" => "none", + "padding" => "6px 0" + ], + "a" + ) + ); + + foreach($relateds as $related){ + + if(!isset($related["innerHTML"])){ + + // found an image + continue; + } + + $text = + $this->fuckhtml + ->getTextContent( + $related + ); + + if($text != ""){ + + $out["related"][] = $text; + } + } + + $div = + $this->fuckhtml + ->getElementsByTagName( + "div" + ); + + // get loaded images + $images = + $this->fuckhtml + ->getElementsByClassName( + "ivg-i", + $div + ); + + foreach($images as $image){ + + $this->fuckhtml->load($image); + + $img_tags = + $this->fuckhtml + ->getElementsByTagName( + "img" + ); + + if( + !isset($image["attributes"]["data-docid"]) || + !isset($this->image_arr[$image["attributes"]["data-docid"]]) + ){ + + continue; + } + + // search for the right image tag + $image_tag = false; + foreach($img_tags as $img){ + + if( + isset( + $img + ["attributes"] + ["alt"] + ) && + trim( + $img + ["attributes"] + ["alt"] + ) != "" + ){ + + $image_tag = $img; + break; + } + } + + if($image_tag === false){ + + continue; + } + + $out["image"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $image_tag + ["attributes"] + ["alt"] + ) + ), + "source" => + $this->image_arr[ + $image + ["attributes"] + ["data-docid"] + ], + "url" => + $this->fuckhtml + ->getTextContent( + $image + ["attributes"] + ["data-lpage"] + ) + ]; + } + + // get unloaded javascript images + $images_js_sel = + $this->fuckhtml + ->getElementsByAttributeName( + "id", + $div + ); + + $loaded = []; + + foreach($images_js_sel as $sel){ + + if( + !isset($this->blobs[$sel["attributes"]["id"]]) || + in_array((string)$sel["attributes"]["id"], $loaded, true) + ){ + + // not an unloaded javascript image + continue; + } + + $loaded[] = $sel["attributes"]["id"]; + + // get yet another javascript component + $this->fuckhtml->load($this->blobs[$sel["attributes"]["id"]]); + + // get js node: contains title & url + $js_node = + $this->fuckhtml + ->getElementsByTagName( + "div" + )[0]; + + if(!isset($this->blobs[$js_node["attributes"]["id"]])){ + + // did not find refer id + continue; + } + + // load second javascript component + $this->fuckhtml->load($this->blobs[$js_node["attributes"]["id"]]); + + // get title from image alt text. + // data-src from this image is cropped, ignore it.. + $img = + $this->fuckhtml + ->getElementsByTagName( + "img" + )[0]; + + $out["image"][] = [ + "title" => + $this->fuckhtml + ->getTextContent( + $img["attributes"]["alt"] + ), + "source" => + $this->image_arr[ + $js_node["attributes"]["data-docid"] + ], + "url" => + $this->fuckhtml + ->getTextContent( + $js_node["attributes"]["data-lpage"] + ) + ]; + } + } + + // reset + $this->fuckhtml->load($result_div); + } + + // + // Parse videos + // + $this->fuckhtml->load($result_div); + + $videos = + $this->fuckhtml + ->getElementsByAttributeName( + "data-vid", + "div" + ); + + foreach($videos as $video){ + + $this->fuckhtml->load($video); + + // get url + $url = + $this->fuckhtml + ->getTextContent( + $video + ["attributes"] + ["data-surl"] + ); + + foreach($out["web"] as $link){ + + if($link["url"] == $url){ + + // ignore if we already have the video in $out["web"] + continue 2; + } + } + + // get heading element + $heading = + $this->fuckhtml + ->getElementsByAttributeValue( + "role", + "heading", + "div" + ); + + if(count($heading) === 0){ + + // no heading, fuck this. + continue; + } + + // get thumbnail before loading heading object + $image = + $this->fuckhtml + ->getElementsByAttributeName( + "id", + "img" + ); + + if(count($image) !== 0){ + + $thumb = [ + "url" => $this->getdimg($image[0]["attributes"]["id"]), + "ratio" => "16:9" + ]; + }else{ + + $thumb = [ + "url" => null, + "ratio" => null + ]; + } + + // get duration + $duration_div = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "border-radius" => "10px", + "font-family" => "arial,sans-serif-medium,sans-serif", + "font-size" => "12px", + "line-height" => "16px", + "padding-block" => "2px", + "padding-inline" => "8px" + ] + ), + "div" + ); + + if(count($duration_div) !== 0){ + + $duration = + $this->hms2int( + $this->fuckhtml + ->getTextContent( + $duration_div[0] + ) + ); + }else{ + + // check if its a livestream + $duration = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "background-color" => "#d93025", + "border-radius" => "10px", + "color" => "#fff", + "font-family" => "arial,sans-serif-medium,sans-serif", + "font-size" => "12px", + "line-height" => "16px", + "padding-block" => "2px", + "padding-inline" => "8px" + ] + ), + "span" + ); + + if(count($duration) !== 0){ + + $duration = "_LIVE"; + }else{ + + $duration = null; + } + } + + // load heading + $this->fuckhtml->load($heading[0]); + + // get title + $title = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "font-family" => "arial,sans-serif", + "font-size" => "16px", + "font-weight" => "400", + "line-height" => "24px" + ] + ), + "div" + ); + + if(count($title) === 0){ + + // ?? no title + continue; + } + + $title = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0] + ) + ); + + // get date + $date_div = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "color" => "var(" . $this->getcolorvar("#70757a") . ")", + "font-size" => "14px" + ] + ), + "div" + ); + + if(count($date_div) !== 0){ + + $date = strtotime( + $this->fuckhtml + ->getTextContent( + $date_div[0] + ) + ); + + if($date === false){ + + // failed to parse date + $date = null; + } + }else{ + + $date = null; + } + + $out["video"][] = [ + "title" => $title, + "description" => null, + "date" => $date, + "duration" => $duration, + "views" => null, + "thumb" => $thumb, + "url" => $url + ]; + } + + // + // Parse featured results (which contain images, fuck the rest desu) + // + $this->fuckhtml->load($html); + $top = + $this->fuckhtml + ->getElementsByAttributeValue( + "aria-label", + "Featured results", + "div" + ); + + if(count($top) !== 0){ + + $this->fuckhtml->load($top[0]); + + // get images + $grid = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "border-radius" => "20px", + "display" => "grid", + "grid-gap" => "2px", + "grid-template-rows" => "repeat(2,minmax(0,1fr))", + "overflow" => "hidden", + "bottom" => "0", + "left" => "0", + "right" => "0", + "top" => "0", + "position" => "absolute", + ] + ), + "div" + ); + + if(count($grid) !== 0){ + + // we found image grid + $this->fuckhtml->load($grid[0]); + + $images_div = + $this->fuckhtml + ->getElementsByAttributeName( + "data-attrid", + "div" + ); + + foreach($images_div as $image_div){ + + $this->fuckhtml->load($image_div); + + $image = + $this->fuckhtml + ->getElementsByTagName( + "img" + ); + + if( + count($image) === 0 || + !isset($image_div["attributes"]["data-docid"]) || + !isset($this->image_arr[$image_div["attributes"]["data-docid"]]) + ){ + + // ?? no image, continue + continue; + } + + $out["image"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $image[0]["attributes"]["alt"] + ) + ), + "source" => + $this->image_arr[ + $image_div["attributes"]["data-docid"] + ], + "url" => + $this->fuckhtml + ->getTextContent( + $image_div["attributes"]["data-lpage"] + ) + ]; + } + } + } + + + // + // craft $npt token + // + if( + $last_page === false && + count($out["web"]) !== 0 + ){ + if(!isset($params["start"])){ + + $params["start"] = 20; + }else{ + + $params["start"] += 20; + } + + $out["npt"] = + $this->backend + ->store( + json_encode($params), + $pagetype, + $proxy + ); + } + + + // + // Parse right handside + // + $this->fuckhtml->load($html); + + $rhs = + $this->fuckhtml + ->getElementById( + "rhs" + ); + + if($rhs === null){ + + return $out; + } + + $this->fuckhtml->load($rhs); + + // get images gallery + $image_gallery = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-rc", + "ivg-i", + "div" + ); + + if(count($image_gallery) !== 0){ + + $this->fuckhtml->load($image_gallery[0]); + + // get images + $images_div = + $this->fuckhtml + ->getElementsByClassName( + "ivg-i", + "div" + ); + + foreach($images_div as $image_div){ + + $this->fuckhtml->load($image_div); + + $image = + $this->fuckhtml + ->getElementsByTagName( + "img" + ); + + if( + count($image) === 0 || + !isset( + $this->image_arr[ + $image_div + ["attributes"] + ["data-docid"] + ] + ) + ){ + + continue; + } + + foreach($out["image"] as $existing_image){ + + // might already exist + if( + $existing_image["source"][1]["url"] == + $this->image_arr[ + $image_div + ["attributes"] + ["data-docid"] + ][1]["url"] + ){ + + continue 2; + } + } + + $out["image"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $image[0] + ["attributes"] + ["alt"] + ) + ), + "source" => + $this->image_arr[ + $image_div + ["attributes"] + ["data-docid"] + ], + "url" => + $this->fuckhtml + ->getTextContent( + $image_div + ["attributes"] + ["data-lpage"] + ) + ]; + } + + // reset + $this->fuckhtml->load($rhs); + } + + // get header container + $header = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "padding" => "0 0 16px 20px", + "display" => "flex" + ] + ), + "div" + ); + + // stop parsing wikipedia heads if there isn't a header + $description = []; + $title = "About"; + + if(count($header) !== 0){ + + $this->fuckhtml->load($header[0]); + + // g-snackbar-action present: we found a button instead + if( + count( + $this->fuckhtml + ->getElementsByTagName( + "g-snackbar-action" + ) + ) !== 0 + ){ + + $title_tag = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-attrid", + "title", + "div" + ); + + if(count($title_tag) !== 0){ + $title = + $this->fuckhtml + ->getTextContent( + $title_tag[0] + ); + + $header[0]["innerHTML"] = + str_replace( + $title_tag[0]["outerHTML"], + "", + $header[0]["innerHTML"] + ); + + // if header still contains text, add it as a subtitle in description + $subtitle = + $this->fuckhtml + ->getTextContent( + $header[0] + ); + + if(strlen($subtitle) !== 0){ + + $description[] = [ + "type" => "quote", + "value" => $subtitle + ]; + } + } + } + + // reset + $this->fuckhtml->load($rhs); + } + + // get description elements + $url = null; + + $text = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-attrid", + "description", + "div" + ); + + if(count($text) !== 0){ + + $this->fuckhtml->load($text[0]); + + $a = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($a) !== 0){ + // get link and remove it from description + + $a = $a[count($a) - 1]; + + $text[0]["innerHTML"] = + str_replace( + $a["outerHTML"], + "", + $text[0]["innerHTML"] + ); + + $url = + $this->fuckhtml + ->getTextContent( + $a + ["attributes"] + ["href"] + ); + } + + $description[] = [ + "type" => "text", + "value" => + html_entity_decode( + preg_replace( + '/^Description/', + "", + $this->fuckhtml + ->getTextContent( + $text[0] + ) + ) + ) + ]; + + // reset + $this->fuckhtml->load($rhs); + } + + // get reviews (google play, steam, etc) + $review_container = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "align-items" => "start", + "display" => "flex" + ] + ), + "div" + ); + + if(count($review_container) !== 0){ + + $this->fuckhtml->load($review_container[0]); + + $as = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($as) !== 0){ + + $description[] = [ + "type" => "title", + "value" => "Ratings" + ]; + + foreach($as as $a){ + + $this->fuckhtml->load($a); + + $spans = + $this->fuckhtml + ->getElementsByTagName( + "span" + ); + + if(count($spans) >= 2){ + + $value = + trim( + $this->fuckhtml + ->getTextContent( + $spans[1] + ), + "· " + ); + + if( + $value == "" && + isset($spans[2]) + ){ + + $value = + $this->fuckhtml + ->getTextContent( + $spans[2] + ); + } + + $description[] = [ + "type" => "link", + "url" => + $this->fuckhtml + ->getTextContent( + $a["attributes"] + ["href"] + ), + "value" => $value + ]; + + $description[] = [ + "type" => "text", + "value" => + ": " . + $this->fuckhtml + ->getTextContent( + $spans[0] + ) . "\n" + ]; + } + } + } + + // reset + $this->fuckhtml->load($rhs); + } + + // initialize sublinks + $sublinks = []; + + // get description from business + if(count($description) === 0){ + + $data_attrid = + $this->fuckhtml + ->getElementsByAttributeName( + "data-attrid" + ); + + $summary = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-attrid", + "kc:/local:one line summary", + $data_attrid + ); + + if(count($summary) !== 0){ + + $description[] = [ + "type" => "quote", + "value" => + $this->fuckhtml + ->getTextContent( + $summary[0] + ) + ]; + + // remove summary so it doesnt get parsed as a table + $rhs["innerHTML"] = + str_replace( + $summary[0]["outerHTML"], + "", + $rhs["innerHTML"] + ); + + $this->fuckhtml->load($rhs); + } + + $address = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-attrid", + "kc:/location/location:address", + $data_attrid + ); + + if(count($address) !== 0){ + + $description[] = [ + "type" => "text", + "value" => + $this->fuckhtml + ->getTextContent( + $address[0] + ) + ]; + } + + // get title + $title_div = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-attrid", + "title", + $data_attrid + ); + + if(count($title_div) !== 0){ + + $title = + $this->fuckhtml + ->getTextContent( + $title_div[0] + ); + } + + // get phone number + $phone = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-attrid", + "kc:/local:alt phone", + $data_attrid + ); + + if(count($phone) !== 0){ + + $this->fuckhtml->load($phone[0]); + + $sublinks["Call"] = + "tel:" . + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByAttributeName( + "aria-label", + "span" + )[0] + ); + + $this->fuckhtml->load($rhs); + } + } + + if(count($description) === 0){ + + // still no description? abort + return $out; + } + + // get table elements + $table = []; + $table_elems = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "margin-top" => "7px" + ] + ), + "div" + ); + + foreach($table_elems as $elem){ + + $this->fuckhtml->load($elem); + + $spans = + $this->fuckhtml + ->getElementsByTagName( + "span" + ); + + if(count($spans) === 0){ + + // ?? invalid + continue; + } + + $elem["innerHTML"] = + str_replace( + $spans[0]["outerHTML"], + "", + $elem["innerHTML"] + ); + + $key = + rtrim( + $this->fuckhtml + ->getTextContent( + $spans[0] + ), + ": " + ); + + if( + $key == "" || + $key == "Phone" + ){ + + continue; + } + + if($key == "Hours"){ + + $hours = []; + + $this->fuckhtml->load($elem); + + $trs = + $this->fuckhtml + ->getElementsByTagName( + "tr" + ); + + foreach($trs as $tr){ + + $this->fuckhtml->load($tr); + + $tds = + $this->fuckhtml + ->getElementsByTagName( + "td" + ); + + if(count($tds) === 2){ + + $hours[] = + $this->fuckhtml + ->getTextContent( + $tds[0] + ) . ": " . + $this->fuckhtml + ->getTextContent( + $tds[1] + ); + } + } + + if(count($hours) !== 0){ + + $hours = implode("\n", $hours); + $table["Hours"] = $hours; + } + + continue; + } + + $table[$key] = + preg_replace( + '/ +/', + " ", + $this->fuckhtml + ->getTextContent( + $elem + ) + ); + } + + // reset + $this->fuckhtml->load($rhs); + + // get the website div + $as = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-attrid", + "visit_official_site", + "a" + ); + + if(count($as) !== 0){ + + $sublinks["Website"] = + str_replace( + "http://", + "https://", + $this->fuckhtml + ->getTextContent( + $as[0] + ["attributes"] + ["href"] + ) + ); + }else{ + + // get website through button + $button = + $this->fuckhtml + ->getElementsByClassName( + "ab_button", + "a" + ); + + if(count($button) !== 0){ + + $sublinks["Website"] = + $this->unshiturl( + $this->fuckhtml + ->getTextContent( + $button[0] + ["attributes"] + ["href"] + ) + ); + } + } + + // get social media links + $as = + $this->fuckhtml + ->getElementsByTagName( + "g-link" + ); + + foreach($as as $a){ + + $this->fuckhtml->load($a); + + $link = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($link) === 0){ + + continue; + } + + $sublink_title = + $this->fuckhtml + ->getTextContent( + $a + ); + + if($sublink_title == "X (Twitter)"){ + + $sublink_title = "Twitter"; + } + + $sublinks[$sublink_title] = + $this->fuckhtml + ->getTextContent( + $link[0] + ["attributes"] + ["href"] + ); + } + + // reset + $this->fuckhtml->load($rhs); + + // get those round containers + $containers = + $this->fuckhtml + ->getElementsByClassName( + "tpa-ci" + ); + + foreach($containers as $container){ + + $this->fuckhtml->load($container); + + $as = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($as) === 0){ + + continue; + } + + $sublinks[ + $this->fuckhtml + ->getTextContent( + $as[0] + ) + ] = + $this->fuckhtml + ->getTextContent( + $as[0] + ["attributes"] + ["href"] + ); + } + + $out["answer"][] = [ + "title" => $title, + "description" => $description, + "url" => $url, + "thumb" => null, + "table" => $table, + "sublink" => $sublinks + ]; + + return $out; + } + + + private function scrape_dimg($html){ + + // get images loaded through javascript + $this->dimg = []; + + preg_match_all( + '/function\(\){google\.ldi=({.*?});/', + $html, + $dimg + ); + + if(isset($dimg[1])){ + + foreach($dimg[1] as $i){ + + $tmp = json_decode($i, true); + foreach($tmp as $key => $value){ + + $this->dimg[$key] = + $this->unshit_thumb( + $value + ); + } + } + } + + // get additional javascript base64 images + preg_match_all( + '/var s=\'(data:image\/[^\']+)\';var ii=\[((?:\'[^\']+\',?)+)\];/', + $html, + $dimg + ); + + if(isset($dimg[1])){ + + for($i=0; $i<count($dimg[1]); $i++){ + + $delims = explode(",", $dimg[2][$i]); + $string = + $this->fuckhtml + ->parseJsString( + $dimg[1][$i] + ); + + foreach($delims as $delim){ + + $this->dimg[trim($delim, "'")] = $string; + } + } + } + } + + + private function scrape_imagearr($html){ + // get image links arrays + preg_match_all( + '/\[0,"([^"]+)",\["([^"]+)\",([0-9]+),([0-9]+)\],\["([^"]+)",([0-9]+),([0-9]+)\]/', + $html, + $image_arr + ); + + $this->image_arr = []; + if(isset($image_arr[1])){ + + for($i=0; $i<count($image_arr[1]); $i++){ + + $this->image_arr[$image_arr[1][$i]] = + [ + [ + "url" => + $this->fuckhtml + ->parseJsString( + $image_arr[5][$i] + ), + "width" => (int)$image_arr[7][$i], + "height" => (int)$image_arr[6][$i] + ], + [ + "url" => + $this->unshit_thumb( + $this->fuckhtml + ->parseJsString( + $image_arr[2][$i] + ) + ), + "width" => (int)$image_arr[4][$i], + "height" => (int)$image_arr[3][$i] + ] + ]; + } + } + } + + + private function getdimg($dimg){ + + return isset($this->dimg[$dimg]) ? $this->dimg[$dimg] : null; + } + + + private function unshit_thumb($url){ + // https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQINE2vbnNLHXqoZr3RVsaEJFyOsj1_BiBnJch-e1nyz3oia7Aj5xVj + // https://i.ytimg.com/vi/PZVIyA5ER3Y/mqdefault.jpg?sqp=-oaymwEFCJQBEFM&rs=AMzJL3nXeaCpdIar-ltNwl82Y82cIJfphA + + $parts = parse_url($url); + + if( + isset($parts["host"]) && + preg_match( + '/tbn.*\.gstatic\.com/', + $parts["host"] + ) + ){ + + parse_str($parts["query"], $params); + + if(isset($params["q"])){ + + return "https://" . $parts["host"] . "/images?q=" . $params["q"]; + } + } + + return $url; + } + + + private function parsestyles(){ + + $styles = []; + + $style_div = + $this->fuckhtml + ->getElementsByTagName( + "style" + ); + + $raw_styles = ""; + + foreach($style_div as $style){ + + $raw_styles .= $style["innerHTML"]; + } + + // filter out media/keyframe queries + $raw_styles = + preg_replace( + '/@\s*(?!font-face)[^{]+\s*{[\S\s]+?}\s*}/', + "", + $raw_styles + ); + + // get styles + preg_match_all( + '/(.+?){([\S\s]*?)}/', + $raw_styles, + $matches + ); + + for($i=0; $i<count($matches[1]); $i++){ + + // get style values + preg_match_all( + '/([^:;]+):([^;]*?(?:\([^)]+\)[^;]*?)?)(?:;|$)/', + $matches[2][$i], + $values_regex + ); + + $values = []; + for($k=0; $k<count($values_regex[1]); $k++){ + + $values[trim($values_regex[1][$k])] = + strtolower(trim($values_regex[2][$k])); + } + + $names = explode(",", $matches[1][$i]); + + // h1,h2,h3 will each get their own array index + foreach($names as $name){ + + $name = trim($name, "}\t\n\r\0\x0B"); + + foreach($values as $key => $value){ + + $styles[$name][$key] = $value; + } + } + } + + foreach($styles as $key => $values){ + + $styles[$key]["_c"] = count($values); + } + + $this->styles = $styles; + + // get CSS colors + $this->css_colors = []; + + if(isset($this->styles[":root"])){ + + foreach($this->styles[":root"] as $key => $value){ + + $this->css_colors[$value] = strtolower($key); + } + } + } + + + + private function getstyle($styles){ + + $styles["_c"] = count($styles); + + foreach($this->styles as $style_key => $style_values){ + + if(count(array_intersect_assoc($style_values, $styles)) === $styles["_c"] + 1){ + + $style_key = + explode(" ", $style_key); + + $style_key = $style_key[count($style_key) - 1]; + + return + ltrim( + str_replace( + [".", "#"], + " ", + $style_key + ) + ); + } + } + + return false; + } + + + + private function getcolorvar($color){ + + if(isset($this->css_colors[$color])){ + + return $this->css_colors[$color]; + } + + return null; + } + + + + public function web($get){ + + if($get["npt"]){ + + [$params, $proxy] = $this->backend->get($get["npt"], "web"); + $params = json_decode($params, true); + + $search = $params["q"]; + + }else{ + $search = $get["s"]; + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $lang = $get["lang"]; + $older = $get["older"]; + $newer = $get["newer"]; + $spellcheck = $get["spellcheck"]; + $proxy = $this->backend->get_ip(); + + $offset = 0; + + $params = [ + "q" => $search, + "hl" => "en", + "num" => 20 // get 20 results + ]; + + // country + if($country != "any"){ + + $params["gl"] = $country; + } + + // nsfw + $params["safe"] = $nsfw == "yes" ? "off" : "active"; + + // language + if($lang != "any"){ + + $params["lr"] = "lang_" . $lang; + } + + // generate tbs + $tbs = []; + + // get date + $older = $older === false ? null : date("m/d/Y", $older); + $newer = $newer === false ? null : date("m/d/Y", $newer); + + if( + $older !== null || + $newer !== null + ){ + + $tbs["cdr"] = "1"; + $tbs["cd_min"] = $newer; + $tbs["cd_max"] = $older; + } + + // spellcheck filter + if($spellcheck == "no"){ + + $params["nfpr"] = "1"; + } + + if(count($tbs) !== 0){ + + $params["tbs"] = ""; + + foreach($tbs as $key => $value){ + + $params["tbs"] .= $key . ":" . $value . ","; + } + + $params["tbs"] = rtrim($params["tbs"], ","); + } + } + + try{ + $html = + $this->get( + $proxy, + "https://www.google.com/search", + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to get HTML"); + } + + //$html = file_get_contents("scraper/google.html"); + + return $this->parsepage($html, "web", $search, $proxy, $params); + } + + + + public function video($get){ + + if($get["npt"]){ + + [$params, $proxy] = $this->backend->get($get["npt"], "web"); + $params = json_decode($params, true); + + $search = $params["q"]; + + }else{ + $search = $get["s"]; + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $older = $get["older"]; + $newer = $get["newer"]; + $duration = $get["duration"]; + $quality = $get["quality"]; + $captions = $get["captions"]; + $proxy = $this->backend->get_ip(); + + $params = [ + "q" => $search, + "tbm" => "vid", + "hl" => "en", + "num" => "20" + ]; + + // country + if($country != "any"){ + + $params["gl"] = $country; + } + + // nsfw + $params["safe"] = $nsfw == "yes" ? "off" : "active"; + + $tbs = []; + + // get date + $older = $older === false ? null : date("m/d/Y", $older); + $newer = $newer === false ? null : date("m/d/Y", $newer); + + if( + $older !== null || + $newer !== null + ){ + + $tbs["cdr"] = "1"; + $tbs["cd_min"] = $newer; + $tbs["cd_max"] = $older; + } + + // duration + if($duration != "any"){ + + $tbs[] = "dur:" . $duration; + } + + // quality + if($quality != "any"){ + + $tbs[] = "hq:" . $quality; + } + + // captions + if($captions != "any"){ + + $tbs[] = "cc:" . $captions; + } + + // append tbs + if(count($tbs) !== 0){ + + $params["tbs"] = + implode(",", $tbs); + } + } + + try{ + $html = + $this->get( + $proxy, + "https://www.google.com/search", + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to get HTML"); + } + + //$html = file_get_contents("scraper/google.html"); + + $response = $this->parsepage($html, "videos", $search, $proxy, $params); + $out = [ + "status" => "ok", + "npt" => $response["npt"], + "video" => [], + "author" => [], + "livestream" => [], + "playlist" => [], + "reel" => [] + ]; + + foreach($response["web"] as $result){ + + $out["video"][] = [ + "title" => $result["title"], + "description" => $result["description"], + "author" => [ + "name" => isset($result["table"]["Author"]) ? $result["table"]["Author"] : null, + "url" => null, + "avatar" => null + ], + "date" => $result["date"], + "duration" => isset($result["table"]["Duration"]) ? $this->hms2int($result["table"]["Duration"]) : null, + "views" => null, + "thumb" => $result["thumb"], + "url" => $result["url"] + ]; + } + + return $out; + } + + + + public function news($get){ + + if($get["npt"]){ + + [$req, $proxy] = $this->backend->get($get["npt"], "news"); + /*parse_str( + parse_url($req, PHP_URL_QUERY), + $search + );*/ + + try{ + + $html = + $this->get( + $proxy, + "https://www.google.com" . $req, + [] + ); + }catch(Exception $error){ + + throw new Exception("Failed to get HTML"); + } + + }else{ + $search = $get["s"]; + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $older = $get["older"]; + $newer = $get["newer"]; + $sort = $get["sort"]; + $proxy = $this->backend->get_ip(); + + $params = [ + "q" => $search, + "tbm" => "nws", + "hl" => "en", + "num" => "20" + ]; + + // country + if($country != "any"){ + + $params["gl"] = $country; + } + + // nsfw + $params["safe"] = $nsfw == "yes" ? "off" : "active"; + + $tbs = []; + + // get date + $older = $older === false ? null : date("m/d/Y", $older); + $newer = $newer === false ? null : date("m/d/Y", $newer); + + if( + $older !== null || + $newer !== null + ){ + + $tbs["cdr"] = "1"; + $tbs["cd_min"] = $newer; + $tbs["cd_max"] = $older; + } + + // relevance + if($sort == "date"){ + + $tbs["sbd"] = "1"; + } + + // append tbs + if(count($tbs) !== 0){ + + $params["tbs"] = ""; + + foreach($tbs as $key => $value){ + + $params["tbs"] .= $key . ":" . $value . ","; + } + + $params["tbs"] = rtrim($params["tbs"], ","); + } + + //$html = file_get_contents("scraper/google-news.html"); + + $html = + $this->get( + $proxy, + "https://www.google.com/search", + $params + ); + } + + $out = [ + "status" => "ok", + "npt" => null, + "news" => [] + ]; + + $this->fuckhtml->load($html); + + $this->detect_sorry(); + + // get images + $this->scrape_dimg($html); + + // parse styles + $this->parsestyles(); + + $center_col = + $this->fuckhtml + ->getElementById( + "center_col", + "div" + ); + + if($center_col === null){ + + throw new Exception("Could not grep result div"); + } + + $this->fuckhtml->load($center_col); + + // get next page + $npt = + $this->fuckhtml + ->getElementById( + "pnnext", + "a" + ); + + if($npt !== false){ + + $out["npt"] = + $this->backend->store( + $this->fuckhtml + ->getTextContent( + $npt["attributes"] + ["href"] + ), + "news", + $proxy + ); + } + + $as = + $this->fuckhtml + ->getElementsByAttributeName( + "jsname", + "a" + ); + + foreach($as as $a){ + + $this->fuckhtml->load($a); + + // get title + $title = + $this->fuckhtml + ->getElementsByAttributeValue( + "role", + "heading", + "div" + ); + + if(count($title) === 0){ + + continue; + } + + $title = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0] + ) + ); + + // get thumbnail + $image = + $this->fuckhtml + ->getElementsByAttributeName( + "id", + "img" + ); + + // check for padded title node, if found, we're inside a carousel + $probe = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "padding" => "16px 16px 40px 16px" + ] + ), + "div" + ); + + if(count($probe) !== 0){ + + $probe = true; + }else{ + + $probe = false; + } + + if( + count($image) !== 0 && + !isset($image[0]["attributes"]["width"]) + ){ + + $thumb = [ + "url" => + $this->getdimg( + $image[0]["attributes"]["id"] + ), + "ratio" => $probe === true ? "16:9" : "1:1" + ]; + }else{ + + $thumb = [ + "url" => null, + "ratio" => null + ]; + } + + $description = null; + + if($probe === false){ + + $desc_divs = + $this->fuckhtml + ->getElementsByAttributeName( + "style", + "div" + ); + + foreach($desc_divs as $desc){ + + if( + strpos( + $desc["attributes"]["style"], + "margin-top:" + ) !== false + ){ + + $description = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $desc + ) + ); + break; + } + } + } + + // get author + $author = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "overflow" => "hidden", + "text-align" => "left", + "text-overflow" => "ellipsis", + "white-space" => "nowrap", + "margin-bottom" => "8px" + ] + ), + "div" + ); + + if(count($author) !== 0){ + + $author = + $this->fuckhtml + ->getTextContent( + $author[0] + ); + }else{ + + $author = null; + } + + // get date + $date = null; + + $date_div = + $this->fuckhtml + ->getElementsByAttributeName( + "style", + "div" + ); + + foreach($date_div as $d){ + + $this->fuckhtml->load($d); + + $span = + $this->fuckhtml + ->getElementsByTagName( + "span" + ); + + if( + strpos( + $d["attributes"]["style"], + "bottom:" + ) !== false + ){ + + $date = + strtotime( + $this->fuckhtml + ->getTextContent( + $span[count($span) - 1] + ) + ); + break; + } + } + + $out["news"][] = [ + "title" => $title, + "author" => $author, + "description" => $description, + "date" => $date, + "thumb" => $thumb, + "url" => + $this->unshiturl( + $a["attributes"] + ["href"] + ) + ]; + } + + return $out; + } + + + + + public function image($get){ + + // generate parameters + if($get["npt"]){ + + [$params, $proxy] = + $this->backend->get( + $get["npt"], + "images" + ); + + $params = json_decode($params, true); + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $time = $get["time"]; + $size = $get["size"]; + $ratio = $get["ratio"]; + $color = $get["color"]; + $type = $get["type"]; + $format = $get["format"]; + $rights = $get["rights"]; + + $params = [ + "q" => $search, + "udm" => "2" // get images + ]; + + // country (image search uses cr instead of gl) + if($country != "any"){ + + $params["cr"] = "country" . strtoupper($country); + } + + // nsfw + $params["safe"] = $nsfw == "yes" ? "off" : "active"; + + // generate tbs + $tbs = []; + + // time + if($time != "any"){ + + $tbs["qdr"] = $time; + } + + // size + if($size != "any"){ + + $params["imgsz"] = $size; + } + + // ratio + if($ratio != "any"){ + + $params["imgar"] = $ratio; + } + + // color + if($color != "any"){ + + if( + $color == "color" || + $color == "trans" + ){ + + $params["imgc"] = $color; + }elseif($color == "bnw"){ + + $params["imgc"] = "gray"; + }else{ + + $tbs["ic"] = "specific"; + $tbs["isc"] = $color; + } + } + + // type + if($type != "any"){ + + $tbs["itp"] = $type; + } + + // format + if($format != "any"){ + + $params["as_filetype"] = $format; + } + + // rights (tbs) + if($rights != "any"){ + + $tbs["sur"] = $rights; + } + + // append tbs + if(count($tbs) !== 0){ + + $params["tbs"] = ""; + + foreach($tbs as $key => $value){ + + $params["tbs"] .= $key . ":" . $value . ","; + } + + $params["tbs"] = rtrim($params["tbs"], ","); + } + } + /* + $handle = fopen("scraper/google-img.html", "r"); + $html = fread($handle, filesize("scraper/google-img.html")); + fclose($handle);*/ + + try{ + $html = + $this->get( + $proxy, + "https://www.google.com/search", + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to get search page"); + } + + $this->fuckhtml->load($html); + + $this->detect_sorry(); + + // get javascript images + $this->scrape_imagearr($html); + + $out = [ + "status" => "ok", + "npt" => null, + "image" => [] + ]; + + $images = + $this->fuckhtml + ->getElementsByClassName( + "ivg-i", + "div" + ); + + foreach($images as $div){ + + $this->fuckhtml->load($div); + + $image = + $this->fuckhtml + ->getElementsByTagName("img")[0]; + + $out["image"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $image["attributes"]["alt"] + ) + ), + "source" => + $this->image_arr[ + $div["attributes"]["data-docid"] + ], + "url" => + $this->fuckhtml + ->getTextContent( + $div["attributes"]["data-lpage"] + ) + ]; + } + + // as usual, no way to check if there is a next page reliably + if(count($out["image"]) > 50){ + + if(!isset($params["start"])){ + + $params["start"] = 10; + }else{ + + $params["start"] += 10; + } + + $out["npt"] = + $this->backend + ->store( + json_encode($params), + "image", + $proxy + ); + } + + return $out; + } + + private function unshiturl($url, $return_size = false){ + + // decode + $url = + $this->fuckhtml + ->getTextContent($url); + + $url_parts = parse_url($url); + + if( + !isset( + $url_parts["host"] + ) + ){ + + // no host, we have a tracking url + parse_str($url_parts["query"], $query); + + if(isset($query["imgurl"])){ + + $url = $query["imgurl"]; + } + elseif(isset($query["q"])){ + + $url = $query["q"]; + } + } + + // rewrite URLs to remove extra tracking parameters + $domain = parse_url($url, PHP_URL_HOST); + + if( + preg_match( + '/wikipedia.org$/', + $domain + ) + ){ + + // rewrite wikipedia mobile URLs to desktop + $url = + $this->replacedomain( + $url, + preg_replace( + '/([a-z0-9]+)(\.m\.)/', + '$1.', + $domain + ) + ); + } + + elseif( + preg_match( + '/imdb\.com$|youtube\.[^.]+$/', + $domain + ) + ){ + + // rewrite imdb and youtube mobile URLs too + $url = + $this->replacedomain( + $url, + preg_replace( + '/^m\./', + "", + $domain + ) + ); + + } + + elseif( + preg_match( + '/play\.google\.[^.]+$/', + $domain + ) + ){ + + // remove referrers from play.google.com + $oldquery = parse_url($url, PHP_URL_QUERY); + if($oldquery !== null){ + + parse_str($oldquery, $query); + if(isset($query["referrer"])){ unset($query["referrer"]); } + if(isset($query["hl"])){ unset($query["hl"]); } + if(isset($query["gl"])){ unset($query["gl"]); } + + $query = http_build_query($query); + + $url = + str_replace( + $oldquery, + $query, + $url + ); + } + } + + elseif( + preg_match( + '/twitter\.com$/', + $domain + ) + ){ + // remove more referrers from twitter.com + $oldquery = parse_url($url, PHP_URL_QUERY); + if($oldquery !== null){ + + parse_str($oldquery, $query); + if(isset($query["ref_src"])){ unset($query["ref_src"]); } + + $query = http_build_query($query); + + $url = + str_replace( + $oldquery, + $query, + $url + ); + } + } + + elseif( + preg_match( + '/maps\.google\.[^.]+/', + $domain + ) + ){ + + if(stripos($url, "maps?") !== false){ + + //https://maps.google.com/maps?daddr=Johnny,+603+Rue+St+Georges,+Saint-J%C3%A9r%C3%B4me,+Quebec+J7Z+5B7 + $query = parse_url($url, PHP_URL_QUERY); + + if($query !== null){ + + parse_str($query, $query); + + if(isset($query["daddr"])){ + + $url = + "https://maps.google.com/maps?daddr=" . + urlencode($query["daddr"]); + } + } + } + } + + if($return_size){ + + return [ + "url" => $url, + "ref" => isset($query["imgrefurl"]) ? $query["imgrefurl"] : null, + "thumb_width" => isset($query["tbnw"]) ? (int)$query["tbnw"] : null, + "thumb_height" => isset($query["tbnh"]) ? (int)$query["tbnh"] : null, + "image_width" => isset($query["w"]) ? (int)$query["w"] : null, + "image_height" => isset($query["h"]) ? (int)$query["h"] : null + ]; + } + + return $url; + } + + private function replacedomain($url, $domain){ + + return + preg_replace( + '/(https?:\/\/)([^\/]+)/', + '$1' . $domain, + $url + ); + } + + private function titledots($title){ + + return trim($title, " .\t\n\r\0\x0B…"); + } + + private function hms2int($time){ + + $parts = explode(":", $time, 3); + $time = 0; + + if(count($parts) === 3){ + + // hours + $time = $time + ((int)$parts[0] * 3600); + array_shift($parts); + } + + if(count($parts) === 2){ + + // minutes + $time = $time + ((int)$parts[0] * 60); + array_shift($parts); + } + + // seconds + $time = $time + (int)$parts[0]; + + return $time; + } + + private function detect_sorry(){ + + $recaptcha = + $this->fuckhtml + ->getElementById( + "recaptcha", + "div" + ); + + if($recaptcha !== false){ + + throw new Exception("Google returned a captcha"); + } + } +} |