diff options
author | root <root@itinerariummentis.org> | 2024-06-18 06:57:46 +0000 |
---|---|---|
committer | root <root@itinerariummentis.org> | 2024-06-18 06:57:46 +0000 |
commit | b13fce93ac14686250db5b38dc5c08432ed849f9 (patch) | |
tree | 0f9810f2ed0db1c6fe2a499dbc0c0b032b255f99 /scraper/google.php | |
parent | 7fb8b2adfd840bba7ec2a345c8868508337b2412 (diff) | |
parent | 4e039f709571c50895501f1422659d102bef6b16 (diff) |
Merge remote-tracking branch 'refs/remotes/origin/master'
fuck
Diffstat (limited to 'scraper/google.php')
-rw-r--r-- | scraper/google.php | 5984 |
1 files changed, 3556 insertions, 2428 deletions
diff --git a/scraper/google.php b/scraper/google.php index 50bcc22..185ad0e 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -1,14 +1,9 @@ <?php -// todo: -// aliexpress tracking links -// enhanced msx notice +// @TODO check for consent.google.com page, if need be class google{ - private const is_class = "."; - private const is_id = "#"; - public function __construct(){ include "lib/fuckhtml.php"; @@ -21,7 +16,7 @@ class google{ public function getfilters($page){ $base = [ - "country" => [ // gl=<country> + "country" => [ // gl=<country> (image: cr=countryAF) "display" => "Country", "option" => [ "any" => "Instance's country", @@ -272,47 +267,6 @@ class google{ "yes" => "Yes", // safe=active "no" => "No" // safe=off ] - ], - "lang" => [ // lr=<lang> (prefix lang with "lang_") - "display" => "Language", - "option" => [ - "any" => "Any language", - "ar" => "Arabic", - "bg" => "Bulgarian", - "ca" => "Catalan", - "cs" => "Czech", - "da" => "Danish", - "de" => "German", - "el" => "Greek", - "en" => "English", - "es" => "Spanish", - "et" => "Estonian", - "fi" => "Finnish", - "fr" => "French", - "hr" => "Croatian", - "hu" => "Hungarian", - "id" => "Indonesian", - "is" => "Icelandic", - "it" => "Italian", - "iw" => "Hebrew", - "ja" => "Japanese", - "ko" => "Korean", - "lt" => "Lithuanian", - "lv" => "Latvian", - "nl" => "Dutch", - "no" => "Norwegian", - "pl" => "Polish", - "pt" => "Portuguese", - "ro" => "Romanian", - "ru" => "Russian", - "sk" => "Slovak", - "sl" => "Slovenian", - "sr" => "Serbian", - "sv" => "Swedish", - "tr" => "Turkish", - "zh-CN" => "Chinese (Simplified)", - "zh-TW" => "Chinese (Traditional)" - ] ] ]; @@ -322,13 +276,61 @@ class google{ return array_merge( $base, [ - "newer" => [ // &sort=review-date:r:20090301:20090430 + "lang" => [ // lr=<lang> (prefix lang with "lang_") + "display" => "Language", + "option" => [ + "any" => "Any language", + "ar" => "Arabic", + "bg" => "Bulgarian", + "ca" => "Catalan", + "cs" => "Czech", + "da" => "Danish", + "de" => "German", + "el" => "Greek", + "en" => "English", + "es" => "Spanish", + "et" => "Estonian", + "fi" => "Finnish", + "fr" => "French", + "hr" => "Croatian", + "hu" => "Hungarian", + "id" => "Indonesian", + "is" => "Icelandic", + "it" => "Italian", + "iw" => "Hebrew", + "ja" => "Japanese", + "ko" => "Korean", + "lt" => "Lithuanian", + "lv" => "Latvian", + "nl" => "Dutch", + "no" => "Norwegian", + "pl" => "Polish", + "pt" => "Portuguese", + "ro" => "Romanian", + "ru" => "Russian", + "sk" => "Slovak", + "sl" => "Slovenian", + "sr" => "Serbian", + "sv" => "Swedish", + "tr" => "Turkish", + "zh-CN" => "Chinese (Simplified)", + "zh-TW" => "Chinese (Traditional)" + ] + ], + "newer" => [ // tbs "display" => "Newer than", "option" => "_DATE" ], "older" => [ "display" => "Older than", "option" => "_DATE" + ], + "spellcheck" => [ + "display" => "Spellcheck", + "option" => [ + "yes" => "Yes", + "no" => "No" + ] ] ] ); @@ -338,7 +340,7 @@ class google{ return array_merge( $base, [ - "time" => [ // tbs=qrd:<size> + "time" => [ // tbs=qdr:<time> "display" => "Time posted", "option" => [ "any" => "Any time", @@ -348,19 +350,16 @@ class google{ "y" => "Past year" ] ], - "size" => [ + "size" => [ // imgsz "display" => "Size", "option" => [ - // tbs=isz:<size> "any" => "Any size", "l" => "Large", "m" => "Medium", "i" => "Icon", - // from here - // tbz:lt,islt:<size> "qsvga" => "Larger than 400x300", "vga" => "Larger than 640x480", - "qsvga" => "Larger than 800x600", + "svga" => "Larger than 800x600", "xga" => "Larger than 1024x768", "2mp" => "Larger than 2MP", "4mp" => "Larger than 4MP", @@ -374,24 +373,24 @@ class google{ "70mp" => "Larger than 70MP" ] ], - "ratio" => [ // tbs=iar:<size> + "ratio" => [ // imgar "display" => "Aspect ratio", "option" => [ "any" => "Any ratio", - "t" => "Tall", + "t|xt" => "Tall", "s" => "Square", "w" => "Wide", "xw" => "Panoramic" ] ], - "color" => [ // tbs=ic:<color> + "color" => [ // imgc "display" => "Color", "option" => [ "any" => "Any color", "color" => "Full color", - "gray" => "Black & white", + "bnw" => "Black & white", "trans" => "Transparent", - // from there, its ic:specific,isc:<color> + // from here, imgcolor "red" => "Red", "orange" => "Orange", "yellow" => "Yellow", @@ -410,14 +409,12 @@ class google{ "display" => "Type", "option" => [ "any" => "Any type", - "face" => "Faces", "clipart" => "Clip Art", "lineart" => "Line Drawing", - "stock" => "Stock", "animated" => "Animated" ] ], - "format" => [ // tbs=ift:<format> + "format" => [ // as_filetype "display" => "Format", "option" => [ "any" => "Any format", @@ -431,7 +428,7 @@ class google{ "craw" => "RAW" ] ], - "rights" => [ // tbs=il:<rights> + "rights" => [ // tbs=sur:<rights> "display" => "Usage rights", "option" => [ "any" => "Any license", @@ -447,16 +444,13 @@ class google{ return array_merge( $base, [ - "time" => [ - "display" => "Time posted", - "option" => [ // tbs=qdr - "any" => "Any time", - "h" => "Past hour", - "d" => "Past 24 hours", - "w" => "Past week", - "m" => "Past month", - "y" => "Past year" - ] + "newer" => [ // tbs + "display" => "Newer than", + "option" => "_DATE" + ], + "older" => [ + "display" => "Older than", + "option" => "_DATE" ], "duration" => [ "display" => "Duration", @@ -489,17 +483,13 @@ class google{ return array_merge( $base, [ - "time" => [ - "display" => "Time posted", - "option" => [ // tbs=qdr - "any" => "Any time", - "h" => "Past hour", - "d" => "Past 24 hours", - "w" => "Past week", - "m" => "Past month", - "y" => "Past year", - "a" => "Archives" // tbs=ar:1 - ] + "newer" => [ // tbs + "display" => "Newer than", + "option" => "_DATE" + ], + "older" => [ + "display" => "Older than", + "option" => "_DATE" ], "sort" => [ "display" => "Sort", @@ -517,18 +507,20 @@ class google{ private function get($proxy, $url, $get = []){ $headers = [ - "User-Agent: Mozilla/5.0 (Linux; U; Android 2.3.3; pt-pt; LG-P500h-parrot Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1 MMS/LG-Android-MMS-V1.0/1.2", + "User-Agent: " . config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip", "DNT: 1", - "Cookie: SOCS=CAESNQgCEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwMzE3LjA4X3AwGgJlbiAEGgYIgM7orwY", + //"Cookie: SOCS=CAESNQgCEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwMzE3LjA4X3AwGgJlbiAEGgYIgM7orwY", "Connection: keep-alive", "Upgrade-Insecure-Requests: 1", "Sec-Fetch-Dest: document", "Sec-Fetch-Mode: navigate", "Sec-Fetch-Site: none", - "Sec-Fetch-User: ?1" + "Sec-Fetch-User: ?1", + "Priority: u=1", + "TE: trailers" ]; $curlproc = curl_init(); @@ -543,11 +535,17 @@ class google{ curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); + // use http2 + curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + // follow redirects + curl_setopt($curlproc, CURLOPT_FOLLOWLOCATION, true); $this->backend->assign_proxy($curlproc, $proxy); @@ -564,847 +562,815 @@ class google{ - public function web($get){ + + private function parsepage($html, $pagetype, $search, $proxy, $params){ - if($get["npt"]){ - - [$req, $ip] = $this->backend->get($get["npt"], "web"); - parse_str( - parse_url($req, PHP_URL_QUERY), - $search - ); - - if(isset($search["q"])){ - - $search = $search["q"]; - }else{ - - $search = "a"; // lol - } - - try{ - $html = - $this->get( - $ip, - "https://www.google.com" . $req, - [] - ); - }catch(Exception $error){ - - throw new Exception("Failed to get HTML"); - } - }else{ - $search = $get["s"]; - $country = $get["country"]; - $nsfw = $get["nsfw"]; - $lang = $get["lang"]; - $older = $get["older"]; - $newer = $get["newer"]; - $ip = $this->backend->get_ip(); - - $params = [ - "q" => $search, - "hl" => "en", - "num" => 20 // get 20 results - ]; - - // country - if($country != "any"){ - - $params["gl"] = $country; - } - - // nsfw - $params["safe"] = $nsfw == "yes" ? "off" : "active"; - - // language - if($lang != "any"){ - - $params["lr"] = "lang_" . $lang; - } - - // &sort=review-date:r:20090301:20090430 - $older = $older === false ? false : date("Ymd", $older); - $newer = $newer === false ? false : date("Ymd", $newer); - - if( - $older !== false && - $newer === false - ){ - - $newer = date("Ymd", time()); - } + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + $this->fuckhtml->load($html); + + $this->detect_sorry(); + + // parse all <style> tags + $this->parsestyles(); + + // get javascript images + $this->scrape_dimg($html); + + // get html blobs + preg_match_all( + '/function\(\){window\.jsl\.dh\(\'([^\']+?)\',\'(.+?[^\'])\'\);/', + $html, + $blobs + ); + + $this->blobs = []; + if(isset($blobs[1])){ - if( - $older !== false || - $newer !== false - ){ + for($i=0; $i<count($blobs[1]); $i++){ - $params["sort"] = "review-date:r:" . $older . ":" . $newer; - } - - try{ - $html = - $this->get( - $ip, - "https://www.google.com/search", - $params + $this->blobs[$blobs[1][$i]] = + $this->fuckhtml + ->parseJsString( + $blobs[2][$i] ); - }catch(Exception $error){ - - throw new Exception("Failed to get HTML"); } - - //$html = file_get_contents("scraper/google.html"); } - return $this->parsepage($html, "web", $search, $ip); - } - - - - public function video($get){ + $this->scrape_imagearr($html); - if($get["npt"]){ - - [$req, $ip] = $this->backend->get($get["npt"], "videos"); - parse_str( - parse_url($req, PHP_URL_QUERY), - $search + // + // load result column + // + $result_div = + $this->fuckhtml + ->getElementById( + "center_col", + "div" ); + + if($result_div === false){ - if(isset($search["q"])){ - - $search = $search["q"]; - }else{ - - $search = "a"; // lol - } - - try{ - - $html = - $this->get( - $ip, - "https://www.google.com" . $req, - [] - ); - }catch(Exception $error){ - - throw new Exception("Failed to get HTML"); - } - - }else{ - $search = $get["s"]; - $country = $get["country"]; - $nsfw = $get["nsfw"]; - $lang = $get["lang"]; - $time = $get["time"]; - $duration = $get["duration"]; - $quality = $get["quality"]; - $captions = $get["captions"]; - $ip = $this->backend->get_ip(); - - $params = [ - "q" => $search, - "tbm" => "vid", - "hl" => "en", - "num" => "20" - ]; - - // country - if($country != "any"){ - - $params["gl"] = $country; - } + throw new Exception("Failed to grep result div"); + } + + $this->fuckhtml->load($result_div); + + // + // Get word corrections + // + $correction = + $this->fuckhtml + ->getElementById( + "fprs", + "p" + ); + + if($correction){ - // nsfw - $params["safe"] = $nsfw == "yes" ? "off" : "active"; + $this->fuckhtml->load($correction); - // language - if($lang != "any"){ - - $params["lr"] = "lang_" . $lang; - } + $a = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + $using = + $this->fuckhtml + ->getElementById( + "fprsl", + $a + ); - $tbs = []; + if($using){ - // time - if($time != "any"){ + $using = + $this->fuckhtml + ->getTextContent( + $using + ); - $tbs[] = "qdr:" . $time; - } - - // duration - if($duration != "any"){ + $spans = + $this->fuckhtml + ->getElementsByTagName( + "span" + ); - $tbs[] = "dur:" . $duration; - } - - // quality - if($quality != "any"){ + $type_span = + $this->fuckhtml + ->getTextContent( + $spans[0] + ); - $tbs[] = "hq:" . $quality; - } - - // captions - if($captions != "any"){ + $type = "not_many"; - $tbs[] = "cc:" . $captions; - } - - // append tbs - if(count($tbs) !== 0){ + if( + stripos( + $type_span, + "Showing results for" + ) !== false + ){ + + $type = "including"; + } - $params["tbs"] = - implode(",", $tbs); - } - - try{ - $html = - $this->get( - $ip, - "https://www.google.com/search", - $params + $correction = + $this->fuckhtml + ->getTextContent( + $a[count($a) - 1] ); - }catch(Exception $error){ - throw new Exception("Failed to get HTML"); + $out["spelling"] = [ + "type" => $type, + "using" => $using, + "correction" => $correction + ]; } - } - - $json = $this->parsepage($html, "videos", $search, $ip); - $out = [ - "status" => "ok", - "npt" => $json["npt"], - "video" => [], - "author" => [], - "livestream" => [], - "playlist" => [], - "reel" => [] - ]; - - foreach($json["web"] as $item){ - $out["video"][] = [ - "title" => $item["title"], - "description" => $item["description"], - "author" => [ - "name" => null, - "url" => null, - "avatar" => null - ], - "date" => isset($item["table"]["Posted"]) ? strtotime($item["table"]["Posted"]) : null, - "duration" => isset($item["table"]["Duration"]) ? $this->hms2int($item["table"]["Duration"]) : null, - "views" => null, - "thumb" => - $item["thumb"]["url"] === null ? - [ - "url" => null, - "ratio" => null - ] : - [ - "url" => $item["thumb"]["url"], - "ratio" => "16:9" - ], - "url" => $item["url"] - ]; + // reset + $this->fuckhtml->load($result_div); } - return $out; - } - - - - public function news($get){ + // + // get notices + // + $botstuff = + $this->fuckhtml + ->getElementById( + "botstuff" + ); - if($get["npt"]){ + // important for later + $last_page = false; + + if($botstuff){ - [$req, $ip] = $this->backend->get($get["npt"], "news"); - parse_str( - parse_url($req, PHP_URL_QUERY), - $search - ); + $this->fuckhtml->load($botstuff); - if(isset($search["q"])){ - - $search = $search["q"]; - }else{ - - $search = "a"; // lol - } + $cards = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "line-height" => "normal" + ] + ), + "div" + ); - try{ + foreach($cards as $card){ - $html = - $this->get( - $ip, - "https://www.google.com" . $req, - [] - ); - }catch(Exception $error){ + $this->fuckhtml->load($card); - throw new Exception("Failed to get HTML"); - } - - }else{ - $search = $get["s"]; - $country = $get["country"]; - $nsfw = $get["nsfw"]; - $lang = $get["lang"]; - $time = $get["time"]; - $sort = $get["sort"]; - $ip = $this->backend->get_ip(); - - $params = [ - "q" => $search, - "tbm" => "nws", - "hl" => "en", - "num" => "20" - ]; - - // country - if($country != "any"){ + $h2 = + $this->fuckhtml + ->getElementsByTagName( + "h2" + ); - $params["gl"] = $country; - } - - // nsfw - $params["safe"] = $nsfw == "yes" ? "off" : "active"; - - // language - if($lang != "any"){ + if(count($h2) !== 0){ + + $title = + $this->fuckhtml + ->getTextContent( + $h2[0] + ); + + $card["innerHTML"] = + str_replace( + $h2[0]["outerHTML"], + "", + $card["innerHTML"] + ); + }else{ + + $title = "Notice"; + } - $params["lr"] = "lang_" . $lang; - } - - $tbs = []; + $description = []; - // time - if($time != "any"){ + $as = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); - if($time == "a"){ + if(count($as) !== 0){ + + $first = true; + + foreach($as as $a){ + + $text_link = + $this->fuckhtml + ->getTextContent( + $a + ); + + if(stripos($text_link, "repeat the search") !== false){ + + $last_page = true; + break 2; + } + + $parts = + explode( + $a["outerHTML"], + $card["innerHTML"], + 2 + ); + + $card["innerHTML"] = $parts[1]; + + $value = + preg_replace( + '/ +/', + " ", + $this->fuckhtml + ->getTextContent( + $parts[0], + false, + false + ) + ); + + if(strlen(trim($value)) !== 0){ + + $description[] = [ + "type" => "text", + "value" => $value + ]; + + if($first){ + + $description[0]["value"] = + ltrim($description[0]["value"]); + } + } + + $first = false; + + $description[] = [ + "type" => "link", + "url" => + $this->fuckhtml + ->getTextContent( + $a["attributes"] + ["href"] + ), + "value" => $text_link + ]; + } + + $text = + $this->fuckhtml + ->getTextContent( + $card["innerHTML"], + false, + false + ); + + if(strlen(trim($text)) !== 0){ + + $description[] = [ + "type" => "text", + "value" => + rtrim( + $text + ) + ]; + } - $tbs[] = "ar:1"; }else{ - $tbs[] = "qdr:" . $time; + // @TODO: Check if this ever gets populated without giving me garbage + /* + $text = + $this->fuckhtml + ->getTextContent( + $card + ); + + if($text != ""){ + $description[] = [ + "type" => "text", + "value" => $text + ]; + }*/ } - } - - // relevance - if($sort == "date"){ - $tbs[] = "sbd:1"; - } + if(count($description) !== 0){ - // append tbs - if(count($tbs) !== 0){ - - $params["tbs"] = - implode(",", $tbs); + $out["answer"][] = [ + "title" => $title, + "description" => $description, + "url" => null, + "thumb" => null, + "table" => [], + "sublink" => [] + ]; + } } - $html = - $this->get( - $ip, - "https://www.google.com/search", - $params - ); + // reset + $this->fuckhtml->load($html); } - $json = $this->parsepage($html, "news", $search, $ip); - $out = [ - "status" => "ok", - "npt" => $json["npt"], - "news" => [] - ]; + // + // get "Related Searches" and "People also search for" + // + $relateds = + array_merge( + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "align-items" => "center", + "background-color" => "#28292a", + "border-radius" => "100px", + "box-sizing" => "border-box", + "display" => "flex", + "max-height" => "none", + "min-height" => "48px", + "padding-left" => "17px", + "padding-right" => "17px", + "position" => "relative" + ] + ) . " " . + $this->getstyle( + [ + "margin-left" => "8px", + "margin-right" => "8px" + ] + ), + "a" + ), + $this->fuckhtml + ->getElementsByClassName( + "wyccme", + "div" + ) + ); - foreach($json["web"] as $item){ - - $description = array_key_first($item["table"]); + foreach($relateds as $related){ - if($description !== null){ - - $date = $item["table"][$description]; - }else{ - - $date = null; - } + $text = + $this->fuckhtml + ->getTextContent( + $related + ); - $out["news"][] = [ - "title" => $item["title"], - "author" => $item["author"], - "description" => $description, - "date" => strtotime($date), - "thumb" => - $item["thumb"]["url"] === null ? - [ - "url" => null, - "ratio" => null - ] : - [ - "url" => $item["thumb"]["url"], - "ratio" => "16:9" - ], - "url" => $item["url"] - ]; - } - - return $out; - } - - - - private function parsepage($html, $pagetype, $search, $ip){ - /* - $handle = fopen("scraper/google.html", "r"); - $html = fread($handle, filesize("scraper/google.html")); - fclose($handle); - */ - - $out = [ - "status" => "ok", - "spelling" => [ - "type" => "no_correction", - "using" => null, - "correction" => null - ], - "npt" => null, - "answer" => [], - "web" => [], - "image" => [], - "video" => [], - "news" => [], - "related" => [] - ]; - - if($error = $this->detect_sorry($html)){ + if($text == "More results"){ continue; } - throw new Exception($error); + $out["related"][] = $text; } - $this->parsejavascript($html); - // - // parse accdef's + // Get text results // - $has_appended_accdef = false; + $results = + $this->fuckhtml + ->getElementsByClassName( + "g", + "div" + ); - preg_match_all( - '/window\.jsl\.dh\(\'(accdef_[0-9]+)\',\'(.*)\'\);/', - $html, - $accdefs_regex - ); + $this->skip_next = false; - $accdefs = []; - for($i=0; $i<count($accdefs_regex[0]); $i++){ - - // decode UTF-16 string - $answer = - $this->fuckhtml - ->parseJsString( - $accdefs_regex[2][$i] - ); - - $this->fuckhtml->load($answer); + foreach($results as $result){ - // get description - $description = - $this->fuckhtml - ->getElementsByClassName( - $this->findstyles( - [ - "padding" => "12px 16px 12px", - ], - self::is_class - ), - "div" - ); - - if(!isset($description[1])){ + if($this->skip_next){ - throw new Exception("Google returned an unsupported page format (will fix)"); - }else{ - - $description = $description[1]; + $this->skip_next = false; + continue; } - // get date (rare) - $date = - $this->fuckhtml - ->getElementsByTagName("sub"); + $this->fuckhtml->load($result); - if(count($date) !== 0){ - - $description = - str_replace( - $date[0]["outerHTML"], - "", - $description["innerHTML"] - ); - - $date = - strtotime( - $this->fuckhtml - ->getTextContent( - $date[0] - ) - ); - }else{ - - $date = null; - } - - // get information table - $table = []; + $web = [ + "title" => null, + "description" => null, + "url" => null, + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; - $tbody = + // Detect presence of sublinks + $g = $this->fuckhtml - ->getElementsByTagName("tbody"); + ->getElementsByClassName( + "g", + "div" + ); - if(count($tbody) !== 0){ - - $this->fuckhtml->load($tbody[0]); + $sublinks = []; + if(count($g) > 0){ - $trs = + $table = $this->fuckhtml - ->getElementsByTagName("tr"); + ->getElementsByTagName( + "table" + ); - foreach($trs as $tr){ + if(count($table) !== 0){ + + // found some sublinks! - $this->fuckhtml->load($tr); + $this->fuckhtml->load($table[0]); $tds = $this->fuckhtml - ->getElementsByTagName("td"); + ->getElementsByTagName( + "td" + ); - if(count($tds) === 2){ + foreach($tds as $td){ - $table[ + $this->fuckhtml->load($td); + + $a = $this->fuckhtml - ->getTextContent( - $tds[0] + ->getElementsByTagName( + "a" + ); + + if( + count($a) === 0 || + ( + isset($a[0]["attributes"]["class"]) && + $a[0]["attributes"]["class"] == "fl" ) - ] = - $this->fuckhtml - ->getTextContent( - $tds[1] + ){ + + continue; + } + + $td["innerHTML"] = + str_replace( + $a[0]["outerHTML"], + "", + $td["innerHTML"] ); + + $web["sublink"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $a[0] + ) + ), + "description" => + html_entity_decode( + $this->titledots( + $this->fuckhtml + ->getTextContent( + $td + ) + ) + ), + "url" => + $this->unshiturl( + $a[0] + ["attributes"] + ["href"] + ), + "date" => null + ]; } + + // reset + $this->fuckhtml->load($result); } - - // load back what we had - $this->fuckhtml->load($answer); + + // skip on next iteration + $this->skip_next = true; } - // get title & link - $a = - $this->fuckhtml - ->getElementsByTagName("a")[0]; - - $this->fuckhtml->load($a); - - $title = + // get title + $h3 = $this->fuckhtml - ->getElementsByTagName("span"); + ->getElementsByTagName( + "h3" + ); - if(count($title) === 0){ + if(count($h3) === 0){ continue; } - $accdefs[] = [ - "title" => - $this->titledots( - $this->fuckhtml - ->getTextContent( - $title[0] - ) - ), - "description" => + $web["title"] = + $this->titledots( $this->fuckhtml ->getTextContent( - $description - ), - "url" => - $this->unshiturl( - $a["attributes"]["href"] - ), - "date" => $date, - "type" => "web", - "thumb" => [ - "url" => null, - "ratio" => null - ], - "sublink" => [], - "table" => $table - ]; - } - - $this->fuckhtml->load($html); - - $containers = - $this->fuckhtml - ->getElementsByClassName( - $this->findstyles( - [ - "background-color" => "#fff", - "margin-bottom" => "10px", - "-webkit-box-shadow" => "0 1px 6px rgba(32,33,36,0.28)", - "border-radius" => "8px" - ], - self::is_class - ), - "div" - ); - - foreach($containers as $container){ + $h3[0] + ) + ); - $this->fuckhtml->load($container); + // get url + $as = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + $web["url"] = + $this->unshiturl( + $as[0] + ["attributes"] + ["href"] + ); + + if( + !preg_match( + '/^http/', + $web["url"] + ) + ){ + + // skip if invalid url is found + continue; + } - // detect spelling - $spelling = + // + // probe for twitter carousel + // + $carousel = $this->fuckhtml - ->getElementsByClassName( - $this->findstyles( - [ - "font-size" => "20px", - "line-height" => "26px", - "padding-top" => "2px", - "margin-bottom" => "1px" - ], - self::is_class - ), - "div" + ->getElementsByTagName( + "g-scrolling-carousel" ); - if(count($spelling) !== 0){ + if(count($carousel) !== 0){ + + $this->fuckhtml->load($carousel[0]); - $a = + $items = $this->fuckhtml - ->getElementsByTagName("a"); + ->getElementsByTagName( + "g-inner-card" + ); + + $has_thumbnail = false; - if(count($a) !== 0){ + foreach($items as $item){ - $scripts = - $this->fuckhtml - ->getElementsByTagName("script"); + $this->fuckhtml->load($item); - foreach($scripts as $script){ + if($has_thumbnail === false){ - $container["innerHTML"] = - str_replace( - $script["outerHTML"], - "", - $container["innerHTML"] + // get thumbnail + $thumb = + $this->fuckhtml + ->getElementsByTagName( + "img" ); + + if( + count($thumb) !== 0 && + isset($thumb[0]["attributes"]["id"]) + ){ + + $web["thumb"] = [ + "url" => + $this->getdimg( + $thumb[0]["attributes"]["id"] + ), + "ratio" => "16:9" + ]; + + $has_thumbnail = true; + } + + // or else, try getting a thumbnail from next container } - $container["innerHTML"] = + // cache div + $div = $this->fuckhtml - ->getTextContent( - str_replace( - $a[0]["outerHTML"], - "", - $container["innerHTML"] - ) + ->getElementsByTagName( + "div" ); - if( - preg_match( - '/^did you mean/i', - $container["innerHTML"] - ) - ){ + // get link + $links = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + // get description of carousel sublink + $description = + $this->fuckhtml + ->getElementsByAttributeValue( + "role", + "heading", + $div + ); + + if(count($description) !== 0){ - $out["spelling"] = [ - "type" => "not_many", - "using" => $search, - "correction" => + $description = + $this->titledots( $this->fuckhtml ->getTextContent( - $a[0] + $description[0] ) - ]; + ); + }else{ + + $description = null; } - elseif( - preg_match( - '/^showing results for/i', - $container["innerHTML"] - ) - ){ + $bottom = + $this->fuckhtml + ->getElementsByAttributeValue( + "style", + "z-index:2", + $div + ); + + $title = null; + $date = null; + if(count($bottom) !== 0){ - $out["spelling"] = [ - "type" => "including", - "using" => + $this->fuckhtml->load($bottom[0]); + + $spans = + $this->fuckhtml + ->getElementsByTagName( + "span" + ); + + $title = + $this->fuckhtml + ->getTextContent( + $spans[0] + ); + + $date = + strtotime( $this->fuckhtml ->getTextContent( - $a[0] - ), - "correction" => $search - ]; + $spans[count($spans) - 1] + ) + ); } + + $web["sublink"][] = [ + "title" => $title, + "description" => $description, + "url" => + $this->unshiturl( + $links[0] + ["attributes"] + ["href"] + ), + "date" => $date + ]; } + $out["web"][] = $web; continue; } - $title = + // + // get viewcount, time posted and follower count from <cite> tag + // + $cite = $this->fuckhtml - ->getElementsByClassName( - $this->findstyles( - [ - "color" => "#1967d2", - "font-size" => "20px", - "line-height" => "26px" - ], - self::is_class - ), - "div" + ->getElementsByTagName( + "cite" ); - if(count($title) !== 0){ + if(count($cite) !== 0){ - // - // Container is a web link - // - $web = [ - "title" => - $this->titledots( + $this->fuckhtml->load($cite[0]); + + $spans = + $this->fuckhtml + ->getElementsByTagName("span"); + + if(count($spans) === 0){ + + $cites = + explode( + "·", $this->fuckhtml ->getTextContent( - $title[0] + $cite[0] ) - ), - "description" => null, - "url" => null, - "date" => null, - "type" => "web", - "thumb" => [ - "url" => null, - "ratio" => null - ], - "sublink" => [], - "table" => [] - ]; - - // get link - $web["url"] = - $this->unshiturl( - $this->fuckhtml - ->getElementsByTagName("a") - [0] - ["attributes"] - ["href"] - ); - - // - // check if link contains a carousel - // - $carousels = $this->parsecarousels(); - if(count($carousels) !== 0){ + ); - $first = true; - foreach($carousels as $carousel_cat){ + foreach($cites as $cite){ + + $cite = trim($cite); - foreach($carousel_cat as $carousel){ + if( + preg_match( + '/(.+) (views|followers|likes)$/', + $cite, + $match + ) + ){ - if($first === true){ - - $first = false; - }elseif($carousel["image"] !== null){ - - $out["image"][] = [ - "title" => $carousel["title"], - "source" => [ - [ - "url" => $carousel["image"], - "width" => null, - "height" => null - ] - ], - "url" => $carousel["url"] - ]; - } + $web["table"][ucfirst($match[2])] = + $match[1]; + }elseif( + preg_match( + '/ago$/', + $cite + ) + ){ - $web["sublink"][] = [ - "title" => $carousel["title"], - "date" => $carousel["date"], - "description" => $carousel["description"], - "url" => $carousel["url"] - ]; + $web["date"] = + strtotime($cite); } } - - if($carousels[0][0]["image"] !== null){ - $web["thumb"] = [ - "url" => $carousels[0][0]["image"], - "ratio" => "16:9" - ]; - } - - $out["web"][] = $web; - continue; } - // - // no carousel entries, parse as normal link - // - $this->fuckhtml->load($container); + // reset + $this->fuckhtml->load($result); + } + + // + // attempt to fetch description cleanly + // + $description = + $this->fuckhtml + ->getElementsByAttributeValue( + "style", + "-webkit-line-clamp:2" + ); + + if(count($description) !== 0){ - // parse URL - $web["url"] = - $this->unshiturl( + $web["description"] = + $this->titledots( $this->fuckhtml - ->getElementsByTagName("a") - [0] - ["attributes"] - ["href"] + ->getTextContent( + $description[0] + ) ); + }else{ - $container = $container["innerHTML"]; - - $line_detect = + // use ANOTHER method where the description is a header of the result + $description = $this->fuckhtml - ->getElementsByClassName( - $this->findstyles( - [ - "height" => "1px", - "background-color" => "#dadce0", - "margin" => "0 16px" - ], - self::is_class - ), - "div" + ->getElementsByAttributeValue( + "data-attrid", + "wa:/description" ); - if(count($line_detect) !== 0){ - - // we found a line, this means we're dealing with a - // "featured snippet" - $featured = true; + if(count($description) !== 0){ - $description_container = + // get date off that shit + $date = $this->fuckhtml ->getElementsByClassName( - $this->findstyles( + $this->getstyle( [ - "white-space" => "pre-line", - "word-wrap" => "break-word" - ], - self::is_class + "font-size" => "12px", + "line-height" => "1.34", + "display" => "inline-block", + "font-family" => "Google Sans,arial,sans-serif", + "padding-right" => "0", + "white-space" => "nowrap" + ] ), - "div" - )[1]; - - // get date node for it - $date = - $this->fuckhtml - ->getElementsByTagName("sub"); + "span" + ); if(count($date) !== 0){ + + $description[0]["innerHTML"] = + str_replace( + $date[0]["outerHTML"], + "", + $description[0]["innerHTML"] + ); + $web["date"] = strtotime( $this->fuckhtml @@ -1413,176 +1379,346 @@ class google{ ) ); } - }else{ - - // we're dealing with a normal link - $featured = false; - $description_container = + $web["description"] = $this->fuckhtml - ->getElementsByClassName( - $this->findstyles( - [ - "padding" => "12px 16px 12px" - ], - self::is_class - ), - "div" - )[1]; - } - - // - // Get author if we're parsing news - // - if($pagetype == "news"){ + ->getTextContent( + $description[0] + ); + }else{ - $author = + // Yes.. You guessed it, use ANOTHER method to get descriptions + // off youtube containers + $description = $this->fuckhtml ->getElementsByClassName( - $this->findstyles( + $this->getstyle( [ - "position" => "absolute", - "width" => "100%", - "top" => "0", - "left" => "0", - "padding-top" => "1px", - "margin-bottom" => "-1px" - ], - self::is_class + "-webkit-box-orient" => "vertical", + "display" => "-webkit-box", + "font-size" => "14px", + "-webkit-line-clamp" => "2", + "line-height" => "22px", + "overflow" => "hidden", + "word-break" => "break-word", + "color" => "#bdc1c6" + ] ), "div" ); - if(count($author) !== 0){ + if(count($description) !== 0){ - $web["author"] = + // check for video duration + $duration = $this->fuckhtml - ->getTextContent( - $author[0] + ->getElementsByClassName( + $this->getstyle( + [ + "border-radius" => "10px", + "font-family" => "arial,sans-serif-medium,sans-serif", + "font-size" => "12px", + "line-height" => "16px", + "padding-block" => "2px", + "padding-inline" => "8px" + ] + ), + "div" + ); + + if(count($duration) !== 0){ + + $web["table"]["Duration"] = + $this->fuckhtml + ->getTextContent( + $duration[0] + ); + + // remove duration from description + $description[0]["innerHTML"] = + str_replace( + $duration[0]["outerHTML"], + "", + $description[0]["innerHTML"] + ); + } + + $web["description"] = + $this->titledots( + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $description[0] + ) + ) ); - }else{ - $web["author"] = null; + // get author + time posted + $info = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "color" => "var(" . $this->getcolorvar("#70757a") . ")", + "font-size" => "14px", + "line-height" => "20px", + "margin-top" => "12px" + ] + ), + "div" + ); + + if(count($info) !== 0){ + + $info = + explode( + "·", + $this->fuckhtml + ->getTextContent( + $info[0] + ) + ); + + switch(count($info)){ + + case 3: + $web["table"]["Author"] = trim($info[1]); + $web["date"] = strtotime(trim($info[2])); + break; + + case 2: + $web["date"] = strtotime(trim($info[1])); + break; + } + } } } + } + + // + // get categories of content within the search result + // + $cats = + $this->fuckhtml + ->getElementsByAttributeName( + "data-sncf", + "div" + ); + + foreach($cats as $cat){ - $description = - $description_container["innerHTML"]; - - $this->fuckhtml->load($description); + $this->fuckhtml->load($cat); - // - // get thumbnail before we call loadhtml again - // - $img = + // detect image category + $images = $this->fuckhtml - ->getElementsByTagName("img"); + ->getElementsByTagName( + "img" + ); - if(count($img) !== 0){ + if(count($images) !== 0){ - $skip = true; - - if( - isset($img[0]["attributes"]["alt"]) && - stripos($img[0]["attributes"]["alt"], "Video for") !== false - ){ + foreach($images as $image){ - // is a video thumbnail - $web["thumb"]["ratio"] = "16:9"; - }else{ - - // is a google thumbnail - $web["thumb"]["ratio"] = "1:1"; + if(isset($image["attributes"]["id"])){ + // we found an image + + if(isset($image["attributes"]["width"])){ + + $width = (int)$image["attributes"]["width"]; + + if($width == 110){ + + $ratio = "1:1"; + }elseif($width > 110){ + + $ratio = "16:9"; + }else{ + + $ratio = "9:16"; + } + }else{ + + $ratio = "1:1"; + } + + $web["thumb"] = [ + "url" => $this->getdimg($image["attributes"]["id"]), + "ratio" => $ratio + ]; + + continue 2; + } } - - $web["thumb"]["url"] = - $this->getimage( - $img[0]["attributes"]["id"] - ); - }else{ - - $skip = false; } - // - // get sublinks - // - $links = + // Detect rating + $spans_unfiltered = $this->fuckhtml - ->getElementsByTagName("a"); + ->getElementsByTagName( + "span" + ); + + $spans = + $this->fuckhtml + ->getElementsByAttributeName( + "aria-label", + $spans_unfiltered + ); - foreach($links as $link){ + foreach($spans as $span){ - if($skip === true){ + if( + preg_match( + '/^Rated/', + $span["attributes"]["aria-label"] + ) + ){ - $skip = false; - continue; - } - - $description = - str_replace( - $link["outerHTML"], - "", - $description - ); - - $sublink = [ - "title" => null, - "description" => null, - "url" => null, - "date" => null - ]; - - $sublink["title"] = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $link - ) - ); - - $sublink["url"] = - $this->unshiturl( - $link - ["attributes"] - ["href"] + // found rating + // scrape rating + preg_match( + '/([0-9.]+).*([0-9.]+)/', + $span["attributes"]["aria-label"], + $rating ); - - if(parse_url($sublink["url"], PHP_URL_HOST) !== null){ - $web["sublink"][] = $sublink; + if(isset($rating[1])){ + + $web["table"]["Rating"] = + $rating[1] . "/" . $rating[2]; + } + + $has_seen_reviews = 0; + foreach($spans_unfiltered as $span_unfiltered){ + + if( + preg_match( + '/([0-9,.]+) +([A-z]+)$/', + $this->fuckhtml + ->getTextContent( + $span_unfiltered + ), + $votes + ) + ){ + + $has_seen_reviews++; + $web["table"][ucfirst($votes[2])] = $votes[1]; + continue; + } + + $text = + $this->fuckhtml + ->getTextContent( + $span_unfiltered + ); + + if( + $text == " " || + $text == "" + ){ + + break; + } + + switch($has_seen_reviews){ + + case 1: + // scrape price + $web["table"]["Price"] = $text; + $has_seen_reviews++; + break; + + case 2: + // scrape platform + $web["table"]["Platform"] = $text; + $has_seen_reviews++; + break; + + case 3: + // Scrape type + $web["table"]["Medium"] = $text; + break; + } + } + + continue 2; } } - // - // Parse spans in description - // - $this->fuckhtml->load($description); + // check if its a table of small sublinks + $table = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "display" => "table", + "white-space" => "nowrap", + "margin" => "5px 0", + "line-height" => "1.58", + "color" => "var(" . $this->getcolorvar("#70757a") . ")" + ] + ), + "div" + ); - if($featured === false){ + if(count($table) !== 0){ + + $this->fuckhtml->load($table[0]); - $levels = + $rows = $this->fuckhtml ->getElementsByClassName( - $this->findstyles( + $this->getstyle( [ - "padding-bottom" => "8px" - ], - self::is_class + "display" => "flex", + "white-space" => "normal" + ] ), "div" ); - // oh my god yes, fucking great, sometimes there are NO levels - // hahahahahhahahahahahahahahahhahaa - if(count($levels) === 0){ + foreach($rows as $row){ - $levels = [$description]; - } - - foreach($levels as $level){ + $this->fuckhtml->load($row); - $this->fuckhtml->load($level); + $sublink = [ + "title" => null, + "description" => null, + "url" => null, + "date" => null + ]; + + $link = + $this->fuckhtml + ->getElementsByTagName( + "a" + )[0]; + + $sublink["title"] = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $link + ) + ); + + $sublink["url"] = + $this->unshiturl( + $link + ["attributes"] + ["href"] + ); + + $row["innerHTML"] = + str_replace( + $link["outerHTML"], + "", + $row["innerHTML"] + ); + + $this->fuckhtml->load($row); $spans = $this->fuckhtml @@ -1590,1906 +1726,2905 @@ class google{ "span" ); - $is_rating = -1; - foreach($spans as $span){ - $innertext = - trim( - $this->fuckhtml - ->getTextContent( - $span - ), - " ·." + $text = + $this->fuckhtml + ->getTextContent( + $span ); - if($innertext == ""){ continue; } - if( - strtolower($innertext) - == "rating" + preg_match( + '/answers?$/', + $text + ) ){ - $is_rating = 0; - - // clean up before we go - $description = - str_replace( - $span["outerHTML"], - "", - $description - ); - continue; - } - - // - // Parse rating object - // - if($is_rating >= 0){ - - // clean up description - $description = - str_replace( - $span["outerHTML"], - "", - $description - ); - - if($span["level"] !== 1){ continue; } - $is_rating++; - - // 10/10 (123) - if($is_rating === 1){ - - $innertext = explode(" ", $innertext, 2); - - $web["table"]["Rating"] = $innertext[0]; - - if(count($innertext) === 2){ - $web["table"]["Hits"] = - trim( - str_replace( - [ - "(", - ")" - ], - "", - $innertext[1] - ) - ); - - if($web["table"]["Hits"] == ""){ - - unset($web["table"]["Hits"]); - } - } - continue; - } - - // US$4.99 - // MYR 50.00 - // $38.34 - // JP¥6,480 - // Reviewed by your mom - if($is_rating === 2){ - - if( - preg_match( - '/^Review by (.+)/', - $innertext, - $match - ) - ){ - - $web["table"]["Author"] = $match[1]; - continue; - } - - $web["table"]["Price"] = $innertext; - continue; - } - - // Android / In stock - if($is_rating === 3){ - - $web["table"]["Support"] = $innertext; - continue; - } + $sublink["description"] = + $text; - // ignore the rest continue; } - // - // Parse standalone text - // - - // If we reach this point: - // 1. Ratings have been parsed - // 2. We're parsing a WEB link, not some shitty piece of shit - - // check for date - // if span has no text before it, assume it's a date - $desc_split = - explode( - $span["outerHTML"], - $description, - 2 - ); - - if( - $this->fuckhtml - ->getTextContent( - $desc_split[0] - ) == "" - ){ - - // has no text before - $date = strtotime($innertext); - if($date){ - - $web["date"] = $date; - } - - // cleanup - $description = - str_replace( - $span["outerHTML"], - "", - $description - ); - - continue; - } + $time = strtotime($text); - // Ready to parse table - if(count($desc_split) === 2){ - $this->fuckhtml->load($desc_split[1]); - - $web["table"][ - $this->fuckhtml - ->getTextContent( - trim($desc_split[0], ": ") - ) - ] = $innertext; + if($time !== false){ - // cleanup - $description = - str_replace( - $desc_split[0] . $span["outerHTML"], - "", - $description - ); + $sublink["date"] = $time; } } + + $web["sublink"][] = $sublink; } + + // reset + $this->fuckhtml->load($cat); + continue; } - $web["description"] = - trim( - $this->fuckhtml - ->getTextContent( - $description + // check if its an answer header + $answer_header = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "overflow" => "hidden", + "text-overflow" => "ellipsis" + ] ), - " ·." + "span" ); - if($web["description"] == ""){ + if(count($answer_header) !== 0){ + + $link = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); - $web["description"] = null; + $cat["innerHTML"] = + str_replace( + $link[0]["outerHTML"], + "", + $cat["innerHTML"] + ); + + $web["sublink"][] = [ + "title" => + $this->fuckhtml + ->getTextContent( + $link[0] + ), + "description" => + $this->titledots( + trim( + str_replace( + "\xc2\xa0", + " ", + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $cat + ) + ) + ), + " ·" + ) + ), + "url" => + $this->fuckhtml + ->getTextContent( + $link[0] + ["attributes"] + ["href"] + ), + "date" => null + ]; + + continue; } - $out["web"][] = $web; - - continue; - } - - // - // Detect wikipedia shit - // - $wiki_title = - $this->fuckhtml - ->getElementsByTagName("h3"); - - if(count($wiki_title) !== 0){ - - $description_after = []; - $description = []; - $table = []; - $sublink = []; - - $as = + // check if its list of small sublinks + $urls = $this->fuckhtml - ->getElementsByTagName("a"); + ->getElementsByTagName( + "a" + ); - foreach($as as $a){ + if(count($urls) !== 0){ - if( - isset($a["attributes"]["href"]) && - parse_url($a["attributes"]["href"], PHP_URL_HOST) == "maps.google.com" - ){ + // found small links + foreach($urls as $url){ - // detected maps embed, ignore - continue 2; - } - } - - // get carousels and remove them from container for image grepper - $carousels = $this->parsecarousels($container["innerHTML"]); - $this->fuckhtml->load($container); - - // add images to image tab, if applicable - for($i=0; $i<count($carousels); $i++){ - - foreach($carousels[$i] as $item){ + $target = + $this->fuckhtml + ->getTextContent( + $url + ["attributes"] + ["href"] + ); if( - $item["url"] !== null && - $item["ref"] !== null && - $item["image"] !== null && - $item["title"] !== null + !preg_match( + '/^http/', + $target + ) ){ - $out["image"][] = [ - "title" => $item["title"], - "source" => [ - [ - "url" => $item["url"], - "width" => $item["image_width"], - "height" => $item["image_height"] - ], - [ - "url" => $item["image"], - "width" => $item["thumb_width"], - "height" => $item["thumb_height"] - ] - ], - "url" => $item["ref"] - ]; - - unset($carousels[$i]); + continue; } - } - } - - $carousels = array_values($carousels); - - // interpret remaining carousels as title + carousel - $titles = - $this->fuckhtml - ->getElementsByClassName( - $this->findstyles( - [ - "font-weight" => "700", - "letter-spacing" => "0.75px", - "text-transform" => "uppercase" - ], - self::is_class - ) - ); - - for($i=0; $i<count($titles); $i++){ - - if(!isset($carousels[$i])){ - break; + $web["sublink"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $url + ) + ), + "description" => null, + "url" => $target, + "date" => null + ]; } - $description_after[] = [ - "type" => "title", - "value" => + continue; + } + + // we probed everything, assume this is the description + // if we didn't find one cleanly previously + if($web["description"] === null){ + $web["description"] = + $this->titledots( $this->fuckhtml ->getTextContent( - $titles[$i] + $cat ) - ]; + ); + } + } + + // check if description contains date + $description = explode("—", $web["description"], 2); + + if( + count($description) === 2 && + strlen($description[0]) <= 20 + ){ + + $date = strtotime($description[0]); + + if($date !== false){ - foreach($carousels[$i] as $carousel){ - - $description_after[] = [ - "type" => "link", - "url" => "web?s=" . urlencode($carousel["description"]) . "&scraper=google", - "value" => $carousel["description"] - ]; - - if($carousel["subtext"] !== null){ - - $description_after[] = [ - "type" => "quote", - "value" => $carousel["subtext"] - ]; - } - - $description_after[] = [ - "type" => "image", - "url" => $carousel["image"] - ]; - } + $web["date"] = $date; + $web["description"] = ltrim($description[1]); } + } + + // fetch youtube thumbnail + $thumbnail = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "border-radius" => "8px", + "height" => "fit-content", + "justify-content" => "center", + "margin-right" => "20px", + "margin-top" => "4px", + "position" => "relative", + "width" => "fit-content" + ] + ), + "div" + ); + + if(count($thumbnail) !== 0){ - $categories = - $this->fuckhtml - ->getElementsByClassName( - $this->findstyles( - [ - "padding" => "12px 16px 12px" - ], - self::is_class - ) - ); + // load thumbnail container + $this->fuckhtml->load($thumbnail[0]); $image = $this->fuckhtml - ->getElementsByTagName("img"); + ->getElementsByTagName( + "img" + ); - if(count($image) !== 0){ - - $image = $this->getimage($image[0]["attributes"]["id"]); - }else{ + if( + count($image) !== 0 && + isset($image[0]["attributes"]["id"]) + ){ - $image = null; + $web["thumb"] = [ + "url" => + $this->unshit_thumb( + $this->getdimg( + $image[0]["attributes"]["id"] + ) + ), + "ratio" => "16:9" + ]; } - $url = null; + // reset + $this->fuckhtml->load($result); + } + + $out["web"][] = $web; + } + + // reset + $this->fuckhtml->load($result_div); + + // + // Get instant answers + // + $answer_containers = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "padding-left" => "0px", + "padding-right" => "0px" + ] + ), + "div" + ); + + $date_class = + $this->getstyle( + [ + "font-size" => "12px", + "line-height" => "1.34", + "display" => "inline-block", + "font-family" => "Google Sans,arial,sans-serif", + "padding-right" => "0", + "white-space" => "nowrap" + ] + ); + + foreach($answer_containers as $container){ + + $this->fuckhtml->load($container); + + $web = [ + "title" => null, + "description" => null, + "url" => null, + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + + $answers = + $this->fuckhtml + ->getElementsByAttributeName( + "aria-controls", + "div" + ); + + $item_insert_pos = 1; + foreach($answers as $answer){ + + $out["related"][] = + $this->fuckhtml + ->getTextContent( + $answer + ); - for($i=0; $i<count($categories); $i++){ + if( + isset( + $this->blobs[ + $answer + ["attributes"] + ["aria-controls"] + ] + ) + ){ + + $this->fuckhtml->load( + $this->blobs[ + $answer + ["attributes"] + ["aria-controls"] + ] + ); - $this->fuckhtml->load($categories[$i]); + $divs = + $this->fuckhtml + ->getElementsByAttributeName( + "id", + "div" + ); - if($i === 0){ - // first node. this should be the header with the small - // information snippet + foreach($divs as $div){ - $url = + if( + !isset( + $this->blobs[ + $div + ["attributes"] + ["id"] + ] + ) + ){ + + continue; + } + + $this->fuckhtml->load( + $this->blobs[ + $div + ["attributes"] + ["id"] + ] + ); + + // get url + $as = $this->fuckhtml - ->getElementsByTagName("a"); + ->getElementsByTagName( + "a" + ); - if(count($url) !== 0){ + if(count($as) !== 0){ - $url = + $web["url"] = $this->unshiturl( - $url[0]["attributes"]["href"] + $as[0]["attributes"]["href"] ); - if(parse_url($url, PHP_URL_HOST) == "encrypted-tbn0.gstatic.com"){ + // skip entries that redirect to a search + if( + !preg_match( + '/^http/', + $web["url"] + ) + ){ - $image = $url; - $url = null; + continue 3; } - }else{ - - $url = null; } - $categories[$i]["innerHTML"] = - str_replace( - $wiki_title[0]["outerHTML"], - "", - $categories[$i]["innerHTML"] - ); - - $subtext = + // get title + $h3 = $this->fuckhtml - ->getTextContent( - $categories[$i]["innerHTML"] + ->getElementsByTagName( + "h3" ); - if(strlen($subtext) !== 0){ + if(count($h3) !== 0){ - $description[] = [ - "type" => "quote", - "value" => + $web["title"] = + $this->titledots( $this->fuckhtml ->getTextContent( - $categories[$i]["innerHTML"] + $h3[0] ) - ]; - } - - // detect audio file - $audio = - $this->fuckhtml - ->getElementsByTagName("audio"); - - if(count($audio) !== 0){ - - $description[] = [ - "type" => "audio", - "url" => - $this->fuckhtml - ->getTextContent( - $audio[0]["attributes"]["src"] - ) - ]; + ); } - }else{ - // check for separator elements IN THERE - $separators = + $description = $this->fuckhtml - ->getElementsByClassName( - $this->findstyles( - [ - "white-space" => "pre-line", - "word-wrap" => "break-word" - ], - self::is_class - ), + ->getElementsByAttributeValue( + "data-attrid", + "wa:/description", "div" ); - // detect container type - foreach($separators as $separator){ + if(count($description) !== 0){ - $this->fuckhtml->load($separator); + // check for date + $this->fuckhtml->load($description[0]); - // ignore wrong levels - if($separator["level"] !== 2){ - - continue; - } - - // - // Detect word definition - // - $wordwraps = + $date = $this->fuckhtml ->getElementsByClassName( - $this->findstyles( - [ - "padding-bottom" => "12px" - ], - self::is_class - ), - "div" + $date_class, + "span" ); - if(count($wordwraps) !== 0){ + if(count($date) !== 0){ - foreach($wordwraps as $word){ - - $this->fuckhtml->load($word); - - // detect title - $span = - $this->fuckhtml - ->getElementsByTagName( - "span" - ); - - if( - count($span) === 1 && + $description[0]["innerHTML"] = + str_replace( + $date[0]["outerHTML"], + "", + $description[0]["innerHTML"] + ); + + $web["date"] = + strtotime( $this->fuckhtml ->getTextContent( - str_replace( - $span[0]["outerHTML"], - "", - $word["innerHTML"] - ) - ) == "" - ){ - - $description[] = [ - "type" => "title", - "value" => - $this->fuckhtml - ->getTextContent( - $span[0] - ) - ]; - continue; - } - - // detect list element - $lists = - $this->fuckhtml - ->getElementsByTagName("ol"); - - if(count($lists) !== 0){ - foreach($lists as $list){ - - $this->fuckhtml->load($list); - - $items = - $this->fuckhtml - ->getElementsByTagName("li"); - - $w = 0; - foreach($items as $item){ - - $w++; - $this->fuckhtml->load($item); - - // get subnodes - $subnodes = - $this->fuckhtml - ->getElementsByClassName( - $this->findstyles( - [ - "white-space" => "pre-line", - "word-wrap" => "break-word" - ], - self::is_class - ), - "div" - ); - - foreach($subnodes as $subnode){ - - $this->fuckhtml->load($subnode); - - $spans = - $this->fuckhtml - ->getElementsByTagName("span"); - - if(count($spans) !== 0){ - - // append quote - $description[] = [ - "type" => "quote", - "value" => - $this->fuckhtml - ->getTextContent( - $subnode - ) - ]; - }else{ - - // append text - $description[] = [ - "type" => "text", - "value" => - $w . ". " . - $this->fuckhtml - ->getTextContent( - $subnode - ) - ]; - } - } - } - } - }else{ - - // parse without list - // get subnodes - $subnodes = - $this->fuckhtml - ->getElementsByClassName( - $this->findstyles( - [ - "white-space" => "pre-line", - "word-wrap" => "break-word" - ], - self::is_class - ), - "div" - ); - - foreach($subnodes as $subnode){ - - $this->fuckhtml->load($subnode); - - $spans = - $this->fuckhtml - ->getElementsByTagName("span"); - - if(count($spans) !== 0){ - - // append quote - $description[] = [ - "type" => "quote", - "value" => - $this->fuckhtml - ->getTextContent( - $subnode - ) - ]; - }else{ - - // append text - $description[] = [ - "type" => "text", - "value" => - $this->fuckhtml - ->getTextContent( - $subnode - ) - ]; - } - } - } - } - }else{ - - // - // Parse table - // - $spans = - $this->fuckhtml - ->getElementsByTagName("span"); - - foreach($spans as $span){ - - if(!isset($span["attributes"]["class"])){ - - // found table - $row = - explode( - ":", - $this->fuckhtml - ->getTextContent( - $separator - ), - 2 - ); - - if(count($row) === 2){ - - $table[rtrim($row[0])] = - ltrim($row[1]); - - } - continue 2; - } - } - - // - // Parse normal description - // - $links_rem = - $this->fuckhtml - ->getElementsByTagName("a"); - - foreach($links_rem as $rem){ - - $separator["innerHTML"] = - str_replace( - $rem["outerHTML"], - "", - $separator["innerHTML"] - ); - } - - $description[] = [ - "type" => "text", - "value" => - rtrim( - $this->fuckhtml - ->getTextContent( - $separator - ), - " .," + $date[0] ) - ]; + ); } + + $web["description"] = + ltrim( + $this->fuckhtml + ->getTextContent( + $description[0] + ), + ": " + ); } } + + foreach($out["web"] as $item){ + + if($item["url"] == $web["url"]){ - // detect huge buttons - $buttons = + continue 2; + } + } + + array_splice($out["web"], $item_insert_pos, 0, [$web]); + $item_insert_pos++; + } + } + } + + // reset + $this->fuckhtml->load($result_div); + + // + // Scrape word definition + // + $definition_container = + $this->fuckhtml + ->getElementsByClassName( + "lr_container", + "div" + ); + + if(count($definition_container) !== 0){ + + $this->fuckhtml->load($definition_container[0]); + + // get header + $header = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-attrid", + "EntryHeader", + "div" + ); + + if(count($header) !== 0){ + + $description = []; + + $this->fuckhtml->load($header[0]); + + $title_div = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "font-family" => "Google Sans,arial,sans-serif", + "font-size" => "28px", + "line-height" => "36px" + ] + ) + ); + + if(count($title_div) !== 0){ + + $title = $this->fuckhtml - ->getElementsByClassName( - $this->findstyles( - [ - "display" => "table-cell", - "vertical-align" => "middle", - "height" => "52px", - "text-align" => "center" - ], - self::is_class - ), - "a" + ->getTextContent( + $title_div[0] ); + }else{ - if(count($buttons) !== 0){ - - foreach($buttons as $button){ - - if(isset($button["attributes"]["href"])){ - - $sublink[ + $title = "Word definition"; + } + + $subtext_div = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "font-family" => "arial,sans-serif", + "font-size" => "14px", + "line-height" => "22px" + ] + ), + "span" + ); + + if(count($subtext_div) !== 0){ + + $description[] = [ + "type" => "quote", + "value" => + $this->fuckhtml + ->getTextContent( + $subtext_div[0] + ) + ]; + } + + // get audio + $audio = + $this->fuckhtml + ->getElementsByTagName( + "audio" + ); + + if(count($audio) !== 0){ + + $this->fuckhtml->load($audio[0]); + + $source = + $this->fuckhtml + ->getElementsByTagName( + "source" + ); + + if(count($source) !== 0){ + + $description[] = [ + "type" => "audio", + "url" => + preg_replace( + '/^\/\//', + "https://", $this->fuckhtml ->getTextContent( - $button + $source[0] + ["attributes"] + ["src"] ) - ] = - $this->unshiturl( - $button["attributes"]["href"] - ); - } - } + ) + ]; } + } - // append description_after (contains carousel info) - $description = array_merge( - $description, - $description_after - ); + // remove header to avoid confusion + $definition_container[0]["innerHTML"] = + str_replace( + $header[0]["outerHTML"], + "", + $definition_container[0]["innerHTML"] + ); - $out["answer"][] = [ - "title" => + // reset + $this->fuckhtml->load($definition_container[0]); + + $vmods = + $this->fuckhtml + ->getElementsByClassName( + "vmod", + "div" + ); + + foreach($vmods as $category){ + + if( + !isset( + $category + ["attributes"] + ["data-topic"] + ) || + $category + ["attributes"] + ["class"] != "vmod" + ){ + + continue; + } + + $this->fuckhtml->load($category); + + // get category type + $type = $this->fuckhtml - ->getTextContent( - $wiki_title[0] - ), + ->getElementsByTagName( + "i" + ); + + if(count($type) !== 0){ + + $description[] = [ + "type" => "title", + "value" => + $this->fuckhtml + ->getTextContent( + $type[0] + ) + ]; + } + + // get heading text + $headings = + $this->fuckhtml + ->getElementsByClassName( + "xpdxpnd", + "div" + ); + + foreach($headings as $heading){ + + $description[] = [ + "type" => "quote", + "value" => + $this->fuckhtml + ->getTextContent( + $heading + ) + ]; + } + + $definitions = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-attrid", + "SenseDefinition", + "div" + ); + + $i = 1; + $text = []; + + foreach($definitions as $definition){ + + $text[] = + $i . ". " . + $this->fuckhtml + ->getTextContent( + $definition + ); + + $i++; + } + + if(count($text) !== 0){ + + $description[] = [ + "type" => "text", + "value" => + implode("\n", $text) + ]; + } + } + + $out["answer"][] = [ + "title" => $title, "description" => $description, - "url" => $url, - "thumb" => $image, - "table" => $table, - "sublink" => $sublink + "url" => null, + "thumb" => null, + "table" => [], + "sublink" => [] ]; - - continue; } - // - // Detect related searches containers - // - $container_title = - $this->fuckhtml - ->getElementsByClassName( - $this->findstyles( - [ - "color" => "#000", - "font-size" => "16px", - "font-weight" => "bold", - "margin" => "0", - "padding" => "12px 16px 0px 16px" - ], - self::is_class - ), - "div" - ); - - if(count($container_title) !== 0){ + // reset + $this->fuckhtml->load($result_div); + } + + // + // scrape elements with a g-section-with-header + // includes: images, news carousels + // + + $g_sections = + $this->fuckhtml + ->getElementsByTagName( + "g-section-with-header" + ); + + if(count($g_sections) !== 0){ + foreach($g_sections as $g_section){ - // get carousel entries - $carousels = $this->parsecarousels($container["innerHTML"]); - $this->fuckhtml->load($container); + // parse elements with a g-section-with-header + $this->fuckhtml->load($g_section); - foreach($carousels as $carousel){ + $div_title = + $this->fuckhtml + ->getElementsByClassName( + "a-no-hover-decoration", + "a" + ); + + if(count($div_title) !== 0){ - foreach($carousel as $item){ - - if($item["url"] !== null){ - - $out["related"][] = $item["url"]; - } - } + // title detected, skip + continue; } - - $container_title = - strtolower( - $this->fuckhtml - ->getTextContent( - $container_title[0] + + // no title detected: detect news container + $news = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "outline-offset" => "-1px", + "display" => "flex", + "flex-direction" => "column", + "flex-grow" => "1" + ] ) ); - switch($container_title){ + foreach($news as $new){ - case "related searches": - case "people also search for": - // - // Parse related searches - // - $as = - $this->fuckhtml - ->getElementsByTagName("a"); + $this->fuckhtml->load($new); + + $image = + $this->fuckhtml + ->getElementsByAttributeName( + "id", + "img" + ); + + if( + count($image) !== 0 && + !( + isset($image[0]["attributes"]["style"]) && + strpos( + $image[0]["attributes"]["style"], + "height:18px" + ) !== false + ) + ){ - foreach($as as $a){ - - $out["related"][] = - $this->fuckhtml - ->getTextContent($a); - } - break; + $thumb = [ + "url" => + $this->getdimg( + $image[0] + ["attributes"] + ["id"] + ), + "ratio" => "1:1" + ]; + } - case "people also ask": - // get related queries - $divs = + $title = + $this->titledots( $this->fuckhtml - ->getElementsByTagName("div"); + ->getTextContent( + $this->fuckhtml + ->getElementsByAttributeValue( + "role", + "heading", + "div" + )[0] + ) + ); + + $date_div = + $this->fuckhtml + ->getElementsByAttributeName( + "style", + "div" + ); + + if(count($date_div) !== 0){ - foreach($divs as $div){ + foreach($date_div as $div){ - // add accdef's here - if($has_appended_accdef === false){ - - $out["web"] = array_merge($out["web"], $accdefs); - $has_appended_accdef = true; - } - - // add accdef's questions - if(isset($div["attributes"]["role"])){ - - $out["related"][] = - $this->fuckhtml - ->getTextContent($div); + if( + strpos( + $div["attributes"]["style"], + "bottom:" + ) !== false + ){ + $date = + strtotime( + $this->fuckhtml + ->getTextContent( + $div + ) + ); - continue; + break; } } - break; + }else{ + + $date = null; + } + + $out["news"][] = [ + "title" => $title, + "description" => null, + "date" => $date, + "thumb" => $thumb, + "url" => + $this->fuckhtml + ->getTextContent( + $new + ["attributes"] + ["href"] + ) + ]; } - - continue; } - // - // Parse news - // - $title = - $this->fuckhtml - ->getElementsByClassName( - $this->findstyles( - [ - "font-size" => "16px", - "line-height" => "20px", - "font-weight" => "400" - ], - self::is_class - ), - "div" - ); + // reset + $this->fuckhtml->load($result_div); + } + + // + // Parse images (carousel, left hand-side) + // + $image_carousels = + $this->fuckhtml + ->getElementsByAttributeValue( + "id", + "media_result_group", + "div" + ); + + if(count($image_carousels) !== 0){ - if(count($title) !== 0){ + foreach($image_carousels as $image_carousel){ - $carousels = $this->parsecarousels(); - $this->fuckhtml->load($container); + $this->fuckhtml->load($image_carousel); - if(count($carousels) === 0){ - - // no carousels found - continue; - } + // get related searches in image carousel + $relateds = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "display" => "inline-block", + "margin-right" => "6px", + "outline" => "none", + "padding" => "6px 0" + ], + "a" + ) + ); - $title = - strtolower( + foreach($relateds as $related){ + + $text = $this->fuckhtml ->getTextContent( - $title[0] - ) + $related + ); + + if($text != ""){ + + $out["related"][] = $text; + } + } + + $div = + $this->fuckhtml + ->getElementsByTagName( + "div" ); - if( - preg_match( - '/^latest from|^top stories/', - $title - ) - ){ + // get loaded images + $images = + $this->fuckhtml + ->getElementsByClassName( + "ivg-i", + $div + ); + + foreach($images as $image){ + + $this->fuckhtml->load($image); - // Found news article - foreach($carousels[0] as $carousel){ + $img_tags = + $this->fuckhtml + ->getElementsByTagName( + "img" + ); + + if( + !isset($image["attributes"]["data-docid"]) || + !isset($this->image_arr[$image["attributes"]["data-docid"]]) + ){ - if($carousel["image"] !== null){ - - $thumb = [ - "url" => $carousel["image"], - "ratio" => "16:9" - ]; - }else{ + continue; + } + + // search for the right image tag + $image_tag = false; + foreach($img_tags as $img){ + + if( + isset( + $img + ["attributes"] + ["alt"] + ) && + trim( + $img + ["attributes"] + ["alt"] + ) != "" + ){ - $thumb = [ - "url" => null, - "ratio" => null - ]; + $image_tag = $img; + break; } + } + + if($image_tag === false){ - $out["news"][] = [ - "title" => $carousel["title"], - "description" => $carousel["description"], - "date" => $carousel["date"], - "thumb" => $thumb, - "url" => $carousel["url"] - ]; + continue; } + + $out["image"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $image_tag + ["attributes"] + ["alt"] + ) + ), + "source" => + $this->image_arr[ + $image + ["attributes"] + ["data-docid"] + ], + "url" => + $this->fuckhtml + ->getTextContent( + $image + ["attributes"] + ["data-lpage"] + ) + ]; } - elseif( - $title == "images" - ){ + // get unloaded javascript images + $images_js_sel = + $this->fuckhtml + ->getElementsByAttributeName( + "id", + $div + ); + + $loaded = []; + + foreach($images_js_sel as $sel){ - foreach($carousels as $carousel){ + if( + !isset($this->blobs[$sel["attributes"]["id"]]) || + in_array((string)$sel["attributes"]["id"], $loaded, true) + ){ - foreach($carousel as $item){ - - $out["image"][] = [ - "title" => $item["title"], - "source" => [ - [ - "url" => $item["url"], - "width" => $item["image_width"], - "height" => $item["image_height"] - ], - [ - "url" => $item["image"], - "width" => $item["thumb_width"], - "height" => $item["thumb_height"] - ] - ], - "url" => $item["ref"] - ]; - } + // not an unloaded javascript image + continue; } + + $loaded[] = $sel["attributes"]["id"]; + + // get yet another javascript component + $this->fuckhtml->load($this->blobs[$sel["attributes"]["id"]]); + + // get js node: contains title & url + $js_node = + $this->fuckhtml + ->getElementsByTagName( + "div" + )[0]; + + if(!isset($this->blobs[$js_node["attributes"]["id"]])){ + + // did not find refer id + continue; + } + + // load second javascript component + $this->fuckhtml->load($this->blobs[$js_node["attributes"]["id"]]); + + // get title from image alt text. + // data-src from this image is cropped, ignore it.. + $img = + $this->fuckhtml + ->getElementsByTagName( + "img" + )[0]; + + $out["image"][] = [ + "title" => + $this->fuckhtml + ->getTextContent( + $img["attributes"]["alt"] + ), + "source" => + $this->image_arr[ + $js_node["attributes"]["data-docid"] + ], + "url" => + $this->fuckhtml + ->getTextContent( + $js_node["attributes"]["data-lpage"] + ) + ]; } - - continue; } - // - // Detect nodes with only text + links - // + // reset + $this->fuckhtml->load($result_div); + } + + // + // Parse videos + // + $this->fuckhtml->load($result_div); + + $videos = + $this->fuckhtml + ->getElementsByAttributeName( + "data-vid", + "div" + ); + + foreach($videos as $video){ + + $this->fuckhtml->load($video); - // ignore elements with <style> tags - $style = + // get url + $url = $this->fuckhtml - ->getElementsByTagName("style"); + ->getTextContent( + $video + ["attributes"] + ["data-surl"] + ); - if(count($style) !== 0){ + foreach($out["web"] as $link){ - continue; + if($link["url"] == $url){ + + // ignore if we already have the video in $out["web"] + continue 2; + } } - $as = + // get thumbnail + $image = $this->fuckhtml - ->getElementsByTagName("a"); + ->getElementsByAttributeName( + "id", + "img" + ); - $description = []; + if(count($image) !== 0){ + + $thumb = [ + "url" => $this->getdimg($image[0]["attributes"]["id"]), + "ratio" => "16:9" + ]; + }else{ + + $thumb = [ + "url" => null, + "ratio" => null + ]; + } - $pcitems = + // get title + $title = $this->fuckhtml ->getElementsByClassName( - "pcitem", + $this->getstyle( + [ + "font-family" => "arial,sans-serif", + "font-size" => "16px", + "font-weight" => "400", + "line-height" => "24px" + ] + ), "div" ); - if(count($pcitems) !== 0){ + if(count($title) === 0){ - // ignore elements with carousels in them + // ?? no title continue; } - foreach($as as $a){ - - // - // Detect next page - // - if( - isset($a["attributes"]["aria-label"]) && - strtolower($a["attributes"]["aria-label"]) == "next page" - ){ - - $out["npt"] = - $this->backend->store( - $this->fuckhtml - ->getTextContent( - $a["attributes"]["href"] - ), - $pagetype, - $ip - ); - continue 2; - } + $title = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0] + ) + ); + + // get duration + $duration_div = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "border-radius" => "10px", + "font-family" => "arial,sans-serif-medium,sans-serif", + "font-size" => "12px", + "line-height" => "16px", + "padding-block" => "2px", + "padding-inline" => "8px" + ] + ), + "div" + ); + + if(count($duration_div) !== 0){ - // - // Parse as text node - // - $container["innerHTML"] = - explode( - $a["outerHTML"], - $container["innerHTML"], - 2 + $duration = + $this->hms2int( + $this->fuckhtml + ->getTextContent( + $duration_div[0] + ) ); + }else{ - $before = + // check if its a livestream + $duration = $this->fuckhtml - ->getTextContent( - $container["innerHTML"][0], - false, - false + ->getElementsByClassName( + $this->getstyle( + [ + "background-color" => "#d93025", + "border-radius" => "10px", + "color" => "#fff", + "font-family" => "arial,sans-serif-medium,sans-serif", + "font-size" => "12px", + "line-height" => "16px", + "padding-block" => "2px", + "padding-inline" => "8px" + ] + ), + "span" ); - // set after - if(count($container["innerHTML"]) === 2){ + if(count($duration) !== 0){ - $container["innerHTML"] = - $container["innerHTML"][1]; + $duration = "_LIVE"; }else{ - $container["innerHTML"] = ""; + $duration = null; } + } + + // get date + $date_div = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "color" => "var(" . $this->getcolorvar("#70757a") . ")", + "font-size" => "14px" + ] + ), + "div" + ); + + if(count($date_div) !== 0){ + + $date = strtotime( + $this->fuckhtml + ->getTextContent( + $date_div[0] + ) + ); - if($before != ""){ + if($date === false){ - $description[] = [ - "type" => "text", - "value" => $before - ]; + // failed to parse date + $date = null; } + }else{ - // add link - $description[] = [ - "type" => "link", - "url" => - $this->unshiturl( - $a["attributes"] - ["href"] - ), - "value" => - $this->fuckhtml - ->getTextContent( - $a - ) - ]; - } - - if($container["innerHTML"] != ""){ - - $description[] = [ - "type" => "text", - "value" => - $this->fuckhtml - ->getTextContent( - $container["innerHTML"] - ) - ]; + $date = null; } - $out["answer"][] = [ - "title" => "Notice", - "description" => $description, - "url" => null, - "thumb" => null, - "table" => [], - "sublink" => [] + $out["video"][] = [ + "title" => $title, + "description" => null, + "date" => $date, + "duration" => $duration, + "views" => null, + "thumb" => $thumb, + "url" => $url ]; } // - // remove duplicate web links cause instant answers - // sometimes contains duplicates + // Parse featured results (which contain images, fuck the rest desu) // - $c = count($out["web"]); - $links = []; + $this->fuckhtml->load($html); + $top = + $this->fuckhtml + ->getElementsByAttributeValue( + "aria-label", + "Featured results", + "div" + ); - for($i=0; $i<$c; $i++){ + if(count($top) !== 0){ + + $this->fuckhtml->load($top[0]); + + // get images + $grid = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "border-radius" => "20px", + "display" => "grid", + "grid-gap" => "2px", + "grid-template-rows" => "repeat(2,minmax(0,1fr))", + "overflow" => "hidden", + "bottom" => "0", + "left" => "0", + "right" => "0", + "top" => "0", + "position" => "absolute", + ] + ), + "div" + ); - foreach($links as $link){ + if(count($grid) !== 0){ + + // we found image grid + $this->fuckhtml->load($grid[0]); - if($out["web"][$i]["url"] == $link){ + $images_div = + $this->fuckhtml + ->getElementsByAttributeName( + "data-attrid", + "div" + ); + + foreach($images_div as $image_div){ - unset($out["web"][$i]); - continue 2; + $this->fuckhtml->load($image_div); + + $image = + $this->fuckhtml + ->getElementsByTagName( + "img" + ); + + if( + count($image) === 0 || + !isset($image_div["attributes"]["data-docid"]) || + !isset($this->image_arr[$image_div["attributes"]["data-docid"]]) + ){ + + // ?? no image, continue + continue; + } + + $out["image"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $image[0]["attributes"]["alt"] + ) + ), + "source" => + $this->image_arr[ + $image_div["attributes"]["data-docid"] + ], + "url" => + $this->fuckhtml + ->getTextContent( + $image_div["attributes"]["data-lpage"] + ) + ]; } } - - $links[] = $out["web"][$i]["url"]; } - $out["web"] = array_values($out["web"]); - return $out; - } - - - - - public function image($get){ - - // generate parameters - if($get["npt"]){ - - [$params, $proxy] = - $this->backend->get( - $get["npt"], - "images" - ); - - $params = json_decode($params, true); - }else{ - - $search = $get["s"]; - if(strlen($search) === 0){ - - throw new Exception("Search term is empty!"); + // + // craft $npt token + // + if( + $last_page === false && + count($out["web"]) !== 0 + ){ + if(!isset($params["start"])){ + + $params["start"] = 20; + }else{ + + $params["start"] += 20; } - $proxy = $this->backend->get_ip(); - $country = $get["country"]; - $nsfw = $get["nsfw"]; - $lang = $get["lang"]; - $time = $get["time"]; - $size = $get["size"]; - $ratio = $get["ratio"]; - $color = $get["color"]; - $type = $get["type"]; - $format = $get["format"]; - $rights = $get["rights"]; + $out["npt"] = + $this->backend + ->store( + json_encode($params), + $pagetype, + $proxy + ); + } + + + // + // Parse right handside + // + $this->fuckhtml->load($html); + + $rhs = + $this->fuckhtml + ->getElementById( + "rhs" + ); + + if($rhs === null){ - $params = [ - "q" => $search, - "tbm" => "isch" - ]; + return $out; + } + + $this->fuckhtml->load($rhs); + + // get images gallery + $image_gallery = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-rc", + "ivg-i", + "div" + ); + + if(count($image_gallery) !== 0){ - // country - if($country != "any"){ - - $params["gl"] = $country; - } + $this->fuckhtml->load($image_gallery[0]); - // nsfw - $params["safe"] = $nsfw == "yes" ? "off" : "active"; + // get images + $images_div = + $this->fuckhtml + ->getElementsByClassName( + "ivg-i", + "div" + ); - // language - if($lang != "any"){ + foreach($images_div as $image_div){ - $params["lr"] = "lang_" . $lang; - } - - $tbs = []; - - // time - if($time != "any"){ + $this->fuckhtml->load($image_div); - $tbs[] = "qrd:" . $time; - } - - // size - if($size != "any"){ + $image = + $this->fuckhtml + ->getElementsByTagName( + "img" + ); if( - in_array( - $size, - ["l", "s", "i"] + count($image) === 0 || + !isset( + $this->image_arr[ + $image_div + ["attributes"] + ["data-docid"] + ] ) ){ - $tbs[] = "isz:" . $size; - }else{ + continue; + } + + foreach($out["image"] as $existing_image){ - $tbs[] = "tbz:lt"; - $tbs[] = "islt:" . $size; + // might already exist + if( + $existing_image["source"][1]["url"] == + $this->image_arr[ + $image_div + ["attributes"] + ["data-docid"] + ][1]["url"] + ){ + + continue 2; + } } - } - - // ratio - if($ratio != "any"){ - $tbs[] = "iar:" . $ratio; + $out["image"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $image[0] + ["attributes"] + ["alt"] + ) + ), + "source" => + $this->image_arr[ + $image_div + ["attributes"] + ["data-docid"] + ], + "url" => + $this->fuckhtml + ->getTextContent( + $image_div + ["attributes"] + ["data-lpage"] + ) + ]; } - // color - if($color != "any"){ + // reset + $this->fuckhtml->load($rhs); + } + + // get header container + $header = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "padding" => "0 0 16px 20px", + "display" => "flex" + ] + ), + "div" + ); + + // stop parsing wikipedia heads if there isn't a header + $description = []; + $title = "About"; + + if(count($header) !== 0){ + + $this->fuckhtml->load($header[0]); + + $title_tag = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-attrid", + "title", + "div" + ); + + if(count($title_tag) !== 0){ + $title = + $this->fuckhtml + ->getTextContent( + $title_tag[0] + ); - if( - in_array( - $color, - ["color", "gray", "trans"] - ) - ){ - - $tbs[] = "ic:" . $color; - }else{ + $header[0]["innerHTML"] = + str_replace( + $title_tag[0]["outerHTML"], + "", + $header[0]["innerHTML"] + ); + + // if header still contains text, add it as a subtitle in description + $subtitle = + $this->fuckhtml + ->getTextContent( + $header[0] + ); + + if(strlen($subtitle) !== 0){ - $tbs[] = "ic:specific"; - $tbs[] = "isc:" . $color; + $description[] = [ + "type" => "quote", + "value" => $subtitle + ]; } } - // type - if($type != "any"){ - - $tbs[] = "itp:" . $type; - } + // reset + $this->fuckhtml->load($rhs); + } + + // get description elements + $url = null; + + $text = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-attrid", + "description", + "div" + ); + + if(count($text) !== 0){ - // format - if($format != "any"){ - - $tbs[] = "ift:" . $format; - } + $this->fuckhtml->load($text[0]); - // rights - if($rights != "any"){ - - $tbs[] = "il:" . $rights; - } + $a = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); - // append tbs - if(count($tbs) !== 0){ + if(count($a) !== 0){ + // get link and remove it from description - $params["tbs"] = - implode(",", $tbs); + $a = $a[count($a) - 1]; + + $text[0]["innerHTML"] = + str_replace( + $a["outerHTML"], + "", + $text[0]["innerHTML"] + ); + + $url = + $this->fuckhtml + ->getTextContent( + $a + ["attributes"] + ["href"] + ); } + + $description[] = [ + "type" => "text", + "value" => + html_entity_decode( + preg_replace( + '/^Description/', + "", + $this->fuckhtml + ->getTextContent( + $text[0] + ) + ) + ) + ]; + + // reset + $this->fuckhtml->load($rhs); } - /* - $handle = fopen("scraper/google-img.html", "r"); - $html = fread($handle, filesize("scraper/google-img.html")); - fclose($handle);*/ + // get reviews (google play, steam, etc) + $review_container = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "align-items" => "start", + "display" => "flex" + ] + ), + "div" + ); - // scrape images - try{ - $html = - $this->get( - $proxy, - "https://www.google.com/search", - $params + if(count($review_container) !== 0){ + + $this->fuckhtml->load($review_container[0]); + + $as = + $this->fuckhtml + ->getElementsByTagName( + "a" ); - }catch(Exception $error){ - throw new Exception("Failed to get search page"); + if(count($as) !== 0){ + + $description[] = [ + "type" => "title", + "value" => "Ratings" + ]; + + foreach($as as $a){ + + $this->fuckhtml->load($a); + + $spans = + $this->fuckhtml + ->getElementsByTagName( + "span" + ); + + if(count($spans) >= 2){ + + $value = + trim( + $this->fuckhtml + ->getTextContent( + $spans[1] + ), + "· " + ); + + if( + $value == "" && + isset($spans[2]) + ){ + + $value = + $this->fuckhtml + ->getTextContent( + $spans[2] + ); + } + + $description[] = [ + "type" => "link", + "url" => + $this->fuckhtml + ->getTextContent( + $a["attributes"] + ["href"] + ), + "value" => $value + ]; + + $description[] = [ + "type" => "text", + "value" => + ": " . + $this->fuckhtml + ->getTextContent( + $spans[0] + ) . "\n" + ]; + } + } + } + + // reset + $this->fuckhtml->load($rhs); } - if($error = $this->detect_sorry($html)){ + // abort if we didnt find any description + if(count($description) === 0){ - throw new Exception($error); + return $out; } - $out = [ - "status" => "ok", - "npt" => null, - "image" => [] - ]; - - $images = + // get table elements + $table = []; + $table_elems = $this->fuckhtml ->getElementsByClassName( - "islrtb isv-r", + $this->getstyle( + [ + "margin-top" => "7px" + ] + ), "div" ); - foreach($images as $image){ + foreach($table_elems as $elem){ - $this->fuckhtml->load($image); - $img = - $this->fuckhtml - ->getElementsByTagName("img")[0]; - - $og_width = (int)$image["attributes"]["data-ow"]; - $og_height = (int)$image["attributes"]["data-oh"]; - $thumb_width = (int)$image["attributes"]["data-tw"]; + $this->fuckhtml->load($elem); - $ratio = $og_width / $og_height; + $spans = + $this->fuckhtml + ->getElementsByTagName( + "span" + ); - if(isset($img["attributes"]["data-src"])){ - - $src = &$img["attributes"]["data-src"]; - }else{ + if(count($spans) === 0){ - $src = &$img["attributes"]["src"]; + // ?? invalid + continue; } - $thumb_height = floor($thumb_width / $ratio); + $elem["innerHTML"] = + str_replace( + $spans[0]["outerHTML"], + "", + $elem["innerHTML"] + ); - $out["image"][] = [ - "title" => - $this->titledots( - $this->fuckhtml - ->getTextContent( - $image["attributes"]["data-pt"] - ) + $key = + rtrim( + $this->fuckhtml + ->getTextContent( + $spans[0] ), - "source" => [ - [ - "url" => - $this->fuckhtml - ->getTextContent( - $image["attributes"]["data-ou"] - ), - "width" => $og_width, - "height" => $og_height - ], - [ - "url" => - $this->fuckhtml - ->getTextContent( - $src - ), - "width" => $thumb_width, - "height" => $thumb_height - ] - ], - "url" => + ": " + ); + + if($key == ""){ + + continue; + } + + $table[$key] = + preg_replace( + '/ +/', + " ", $this->fuckhtml ->getTextContent( - $image["attributes"]["data-ru"] + $elem ) - ]; + ); + + // reset + $this->fuckhtml->load($rhs); } - // get next page - // https://www.google.com/search - // ?q=higurashi - // &tbm=isch - // &async=_id%3Aislrg_c%2C_fmt%3Ahtml - // &asearch=ichunklite - // &ved=0ahUKEwidjYXJqJSAAxWrElkFHZ07CDwQtDIIQygA - if(count($out["image"]) !== 100){ + // get sublink elements + $sublinks = []; + + // get the website div + $as = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-attrid", + "visit_official_site", + "a" + ); + + if(count($as) !== 0){ - // no more results - return $out; + $sublinks["Website"] = + str_replace( + "http://", + "https://", + $this->fuckhtml + ->getTextContent( + $as[0] + ["attributes"] + ["href"] + ) + ); } - if($get["npt"]){ + // get social media links + $as = + $this->fuckhtml + ->getElementsByTagName( + "g-link" + ); + + foreach($as as $a){ - // update nextpage information - $params["start"] = (int)$params["start"] + count($out["image"]); - $params["ijn"] = (int)$params["ijn"] + 1; + $this->fuckhtml->load($a); - $out["npt"] = - $this->backend->store( - json_encode($params), - "images", - $proxy + $link = + $this->fuckhtml + ->getElementsByTagName( + "a" ); - }else{ - // scrape nextpage information - $this->fuckhtml->load($html); + if(count($link) === 0){ + + continue; + } - $ved = + $sublink_title = $this->fuckhtml - ->getElementById("islrg", "div"); + ->getTextContent( + $a + ); - if($ved){ + if($sublink_title == "X (Twitter)"){ - $ved = - $this->fuckhtml - ->getTextContent( - $ved["attributes"]["data-ved"] - ); - - // &vet=1{$ved}..i (10ahUKEwidjYXJqJSAAxWrElkFHZ07CDwQtDIIQygA..i) - - /* - These 2 are handled by us - start = start + number of results - ijn = current page number - */ - // &start=100 - // &ijn=1 - - // &imgvl=CAEY7gQgBSj3Aji8VTjXVUC4AUC3AUgAYNdV - preg_match( - '/var e=\'([A-z0-9]+)\';/', - $html, - $imgvl + $sublink_title = "Twitter"; + } + + $sublinks[$sublink_title] = + $this->fuckhtml + ->getTextContent( + $link[0] + ["attributes"] + ["href"] ); + } + + // reset + $this->fuckhtml->load($rhs); + + // get those round containers + $containers = + $this->fuckhtml + ->getElementsByClassName( + "tpa-ci" + ); + + foreach($containers as $container){ + + $this->fuckhtml->load($container); + + $as = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($as) === 0){ - if(isset($imgvl[1])){ - $imgvl = $imgvl[1]; - - $params["async"] = "_id:islrg_c,_fmt:html"; - $params["asearch"] = "ichunklite"; - $params["ved"] = $ved; - $params["vet"] = "1" . $ved . "..i"; - $params["start"] = 100; - $params["ijn"] = 1; - $params["imgvl"] = $imgvl; - - $out["npt"] = - $this->backend->store( - json_encode($params), - "images", - $proxy - ); - } + continue; } + + $sublinks[ + $this->fuckhtml + ->getTextContent( + $as[0] + ) + ] = + $this->fuckhtml + ->getTextContent( + $as[0] + ["attributes"] + ["href"] + ); } + $out["answer"][] = [ + "title" => $title, + "description" => $description, + "url" => $url, + "thumb" => null, + "table" => $table, + "sublink" => $sublinks + ]; + return $out; } - private function hms2int($time){ + + private function scrape_dimg($html){ - $parts = explode(":", $time, 3); - $time = 0; + // get images loaded through javascript + $this->dimg = []; - if(count($parts) === 3){ + preg_match_all( + '/function\(\){google\.ldi=({.*?});/', + $html, + $dimg + ); + + if(isset($dimg[1])){ - // hours - $time = $time + ((int)$parts[0] * 3600); - array_shift($parts); + foreach($dimg[1] as $i){ + + $tmp = json_decode($i, true); + foreach($tmp as $key => $value){ + + $this->dimg[$key] = + $this->unshit_thumb( + $value + ); + } + } } - if(count($parts) === 2){ + // get additional javascript base64 images + preg_match_all( + '/var s=\'(data:image\/[^\']+)\';var ii=\[((?:\'[^\']+\',?)+)\];/', + $html, + $dimg + ); + + if(isset($dimg[1])){ - // minutes - $time = $time + ((int)$parts[0] * 60); - array_shift($parts); + for($i=0; $i<count($dimg[1]); $i++){ + + $delims = explode(",", $dimg[2][$i]); + $string = + $this->fuckhtml + ->parseJsString( + $dimg[1][$i] + ); + + foreach($delims as $delim){ + + $this->dimg[trim($delim, "'")] = $string; + } + } } + } + + + private function scrape_imagearr($html){ + // get image links arrays + preg_match_all( + '/\[0,"([^"]+)",\["([^"]+)\",([0-9]+),([0-9]+)\],\["([^"]+)",([0-9]+),([0-9]+)\]/', + $html, + $image_arr + ); - // seconds - $time = $time + (int)$parts[0]; + $this->image_arr = []; + if(isset($image_arr[1])){ + + for($i=0; $i<count($image_arr[1]); $i++){ + + $this->image_arr[$image_arr[1][$i]] = + [ + [ + "url" => + $this->fuckhtml + ->parseJsString( + $image_arr[5][$i] + ), + "width" => (int)$image_arr[7][$i], + "height" => (int)$image_arr[6][$i] + ], + [ + "url" => + $this->unshit_thumb( + $this->fuckhtml + ->parseJsString( + $image_arr[2][$i] + ) + ), + "width" => (int)$image_arr[4][$i], + "height" => (int)$image_arr[3][$i] + ] + ]; + } + } + } + + + private function getdimg($dimg){ - return $time; + return isset($this->dimg[$dimg]) ? $this->dimg[$dimg] : null; } - private function parsejavascript($html){ + + private function unshit_thumb($url){ + // https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQINE2vbnNLHXqoZr3RVsaEJFyOsj1_BiBnJch-e1nyz3oia7Aj5xVj + // https://i.ytimg.com/vi/PZVIyA5ER3Y/mqdefault.jpg?sqp=-oaymwEFCJQBEFM&rs=AMzJL3nXeaCpdIar-ltNwl82Y82cIJfphA - $this->fuckhtml->load($html); + $parts = parse_url($url); - $styles = + if( + isset($parts["host"]) && + preg_match( + '/tbn.*\.gstatic\.com/', + $parts["host"] + ) + ){ + + parse_str($parts["query"], $params); + + if(isset($params["q"])){ + + return "https://" . $parts["host"] . "/images?q=" . $params["q"]; + } + } + + return $url; + } + + + private function parsestyles(){ + + $styles = []; + + $style_div = $this->fuckhtml - ->getElementsByTagName("style"); + ->getElementsByTagName( + "style" + ); - $this->computedstyle = []; - $this->ask = []; + $raw_styles = ""; - foreach($styles as $style){ + foreach($style_div as $style){ - $this->computedstyle = - array_merge( - $this->computedstyle, - $this->parsestyles($style["innerHTML"]) - ); + $raw_styles .= $style["innerHTML"]; } - // get images in javascript var - preg_match( - '/google\.ldi=({[^}]+})/', - $html, - $this->js_image + // filter out media/keyframe queries + $raw_styles = + preg_replace( + '/@\s*(?!font-face)[^{]+\s*{[\S\s]+?}\s*}/', + "", + $raw_styles + ); + + // get styles + preg_match_all( + '/(.+?){([\S\s]*?)}/', + $raw_styles, + $matches ); - if(count($this->js_image) !== 0){ + for($i=0; $i<count($matches[1]); $i++){ - $this->js_image = json_decode($this->js_image[1], true); - }else{ - - $this->js_image = []; - } - - // additional js_images present in <script> tags - // ugh i fucking hate you - $scripts = - $this->fuckhtml - ->getElementsByTagName("script"); - - foreach($scripts as $script){ + // get style values + preg_match_all( + '/([^:;]+):([^;]*?(?:\([^)]+\)[^;]*?)?)(?:;|$)/', + $matches[2][$i], + $values_regex + ); - if(!isset($script["innerHTML"])){ + $values = []; + for($k=0; $k<count($values_regex[1]); $k++){ - continue; + $values[trim($values_regex[1][$k])] = + trim($values_regex[2][$k]); } - preg_match_all( - '/var s=\'(data:image[^\']+)\';var i=\[(\'[^\;]*\')];/', - $script["innerHTML"], - $image_grep - ); + $names = explode(",", $matches[1][$i]); - if(count($image_grep[0]) !== 0){ + // h1,h2,h3 will each get their own array index + foreach($names as $name){ - $items = explode(",", $image_grep[2][0]); - $value = - $this->fuckhtml - ->getTextContent( - $image_grep[1][0] - ); + $name = trim($name, "}\t\n\r\0\x0B"); - foreach($items as $item){ + foreach($values as $key => $value){ - $this->js_image[trim($item, "' ")] = $value; + $styles[$name][$key] = $value; } } + } + + foreach($styles as $key => $values){ - // even more javascript crap - // "People also ask" node is loaded trough javascript - preg_match_all( - '/window\.jsl\.dh\(\'([^\']+)\',\'(.+)\'\);/', - $script["innerHTML"], - $ask_grep - ); + $styles[$key]["_c"] = count($values); + } + + $this->styles = $styles; + + // get CSS colors + $this->css_colors = []; + + if(isset($this->styles[":root"])){ - for($i=0; $i<count($ask_grep[0]); $i++){ + foreach($this->styles[":root"] as $key => $value){ - $this->ask[trim($ask_grep[1][$i])] = - $this->fuckhtml->parseJsString( - $ask_grep[2][$i] - ); + $this->css_colors[$value] = $key; } } } - private function findstyles($rules, $is){ + + + private function getstyle($styles){ - $c = count($rules); + $styles["_c"] = count($styles); - foreach($this->computedstyle as $classname => $styles){ + foreach($this->styles as $style_key => $style_values){ - if($classname[0] != $is){ + if(count(array_intersect_assoc($style_values, $styles)) === $styles["_c"] + 1){ - // not a class, skip - continue; - } - - $i = 0; - foreach($styles as $stylename => $stylevalue){ + $style_key = + explode(" ", $style_key); - if( - isset($rules[$stylename]) && - $rules[$stylename] == $stylevalue - ){ - - $i++; - }else{ - - continue 2; - } - } - - if($c === $i){ + $style_key = $style_key[count($style_key) - 1]; - return ltrim($classname, $is); + return + ltrim( + str_replace( + [".", "#"], + " ", + $style_key + ) + ); } } - // fail, did not find classname. return false; } - private function parsestyles($style){ + + + private function getcolorvar($color){ - // get style tags - preg_match_all( - '/([^{]+){([^}]*)}/', - $style, - $tags_regex - ); + if(isset($this->css_colors[$color])){ + + return $this->css_colors[$color]; + } - $tags = []; + return null; + } + + + + public function web($get){ - for($i=0; $i<count($tags_regex[0]); $i++){ + if($get["npt"]){ + + [$params, $proxy] = $this->backend->get($get["npt"], "web"); + $params = json_decode($params, true); + + $search = $params["q"]; + + }else{ + $search = $get["s"]; + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $lang = $get["lang"]; + $older = $get["older"]; + $newer = $get["newer"]; + $spellcheck = $get["spellcheck"]; + $proxy = $this->backend->get_ip(); - $tagnames = explode(",", trim($tags_regex[1][$i])); + $offset = 0; + + $params = [ + "q" => $search, + "hl" => "en", + "num" => 20 // get 20 results + ]; - foreach($tagnames as $tagname){ + // country + if($country != "any"){ - $tagname = trim($tagname); + $params["gl"] = $country; + } + + // nsfw + $params["safe"] = $nsfw == "yes" ? "off" : "active"; + + // language + if($lang != "any"){ - if(!isset($tags[$tagname])){ - $tags[$tagname] = []; - } + $params["lr"] = "lang_" . $lang; + } + + // generate tbs + $tbs = []; + + // get date + $older = $older === false ? null : date("m/d/Y", $older); + $newer = $newer === false ? null : date("m/d/Y", $newer); + + if( + $older !== null || + $newer !== null + ){ - $values = explode(";", $tags_regex[2][$i]); + $tbs["cdr"] = "1"; + $tbs["cd_min"] = $newer; + $tbs["cd_max"] = $older; + } + + // spellcheck filter + if($spellcheck == "no"){ - foreach($values as $value){ - - $value = explode(":", $value, 2); - - if(count($value) !== 2){ - - continue; - } + $params["nfpr"] = "1"; + } + + if(count($tbs) !== 0){ + + $params["tbs"] = ""; + + foreach($tbs as $key => $value){ - $tags[$tagname][trim($value[0])] = - trim($value[1]); + $params["tbs"] .= $key . ":" . $value . ","; } + + $params["tbs"] = rtrim($params["tbs"], ","); + } + } + + try{ + $html = + $this->get( + $proxy, + "https://www.google.com/search", + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to get HTML"); + } + + //$html = file_get_contents("scraper/google.html"); + + return $this->parsepage($html, "web", $search, $proxy, $params); + } + + + + public function video($get){ + + if($get["npt"]){ + + [$params, $proxy] = $this->backend->get($get["npt"], "web"); + $params = json_decode($params, true); + + $search = $params["q"]; + + }else{ + $search = $get["s"]; + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $older = $get["older"]; + $newer = $get["newer"]; + $duration = $get["duration"]; + $quality = $get["quality"]; + $captions = $get["captions"]; + $proxy = $this->backend->get_ip(); + + $params = [ + "q" => $search, + "tbm" => "vid", + "hl" => "en", + "num" => "20" + ]; + + // country + if($country != "any"){ + + $params["gl"] = $country; + } + + // nsfw + $params["safe"] = $nsfw == "yes" ? "off" : "active"; + + $tbs = []; + + // get date + $older = $older === false ? null : date("m/d/Y", $older); + $newer = $newer === false ? null : date("m/d/Y", $newer); + + if( + $older !== null || + $newer !== null + ){ + + $tbs["cdr"] = "1"; + $tbs["cd_min"] = $newer; + $tbs["cd_max"] = $older; + } + + // duration + if($duration != "any"){ + + $tbs[] = "dur:" . $duration; + } + + // quality + if($quality != "any"){ + + $tbs[] = "hq:" . $quality; + } + + // captions + if($captions != "any"){ + + $tbs[] = "cc:" . $captions; + } + + // append tbs + if(count($tbs) !== 0){ + + $params["tbs"] = + implode(",", $tbs); } } - return $tags; + try{ + $html = + $this->get( + $proxy, + "https://www.google.com/search", + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to get HTML"); + } + + //$html = file_get_contents("scraper/google-video.html"); + + $response = $this->parsepage($html, "videos", $search, $proxy, $params); + $out = [ + "status" => "ok", + "npt" => $response["npt"], + "video" => [], + "author" => [], + "livestream" => [], + "playlist" => [], + "reel" => [] + ]; + + foreach($response["web"] as $result){ + + $out["video"][] = [ + "title" => $result["title"], + "description" => $result["description"], + "author" => [ + "name" => isset($result["table"]["Author"]) ? $result["table"]["Author"] : null, + "url" => null, + "avatar" => null + ], + "date" => $result["date"], + "duration" => isset($result["table"]["Duration"]) ? $this->hms2int($result["table"]["Duration"]) : null, + "views" => null, + "thumb" => $result["thumb"], + "url" => $result["url"] + ]; + } + + return $out; } - private function getimage($id){ + + + public function news($get){ - if(isset($this->js_image[$id])){ + if($get["npt"]){ + + [$req, $proxy] = $this->backend->get($get["npt"], "news"); + /*parse_str( + parse_url($req, PHP_URL_QUERY), + $search + );*/ + + try{ + + $html = + $this->get( + $proxy, + "https://www.google.com" . $req, + [] + ); + }catch(Exception $error){ + + throw new Exception("Failed to get HTML"); + } + + }else{ + $search = $get["s"]; + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $older = $get["older"]; + $newer = $get["newer"]; + $sort = $get["sort"]; + $proxy = $this->backend->get_ip(); - $return = $this->fuckhtml->parseJsString($this->js_image[$id]); + $params = [ + "q" => $search, + "tbm" => "nws", + "hl" => "en", + "num" => "20" + ]; + + // country + if($country != "any"){ + + $params["gl"] = $country; + } + + // nsfw + $params["safe"] = $nsfw == "yes" ? "off" : "active"; + + $tbs = []; + + // get date + $older = $older === false ? null : date("m/d/Y", $older); + $newer = $newer === false ? null : date("m/d/Y", $newer); if( - $return != "" && - $return != "" + $older !== null || + $newer !== null ){ - if( - preg_match( - '/^\/\//', - $return - ) - ){ + $tbs["cdr"] = "1"; + $tbs["cd_min"] = $newer; + $tbs["cd_max"] = $older; + } + + // relevance + if($sort == "date"){ + + $tbs["sbd"] = "1"; + } - return 'https:' . $return; + // append tbs + if(count($tbs) !== 0){ + + $params["tbs"] = ""; + + foreach($tbs as $key => $value){ + + $params["tbs"] .= $key . ":" . $value . ","; } - return $return; + $params["tbs"] = rtrim($params["tbs"], ","); } - return null; + //$html = file_get_contents("scraper/google-news.html"); + + $html = + $this->get( + $proxy, + "https://www.google.com/search", + $params + ); } - } - - private function parsecarousels(&$item_to_remove = false){ - $carousels = + $out = [ + "status" => "ok", + "npt" => null, + "news" => [] + ]; + + $this->fuckhtml->load($html); + + $this->detect_sorry(); + + // get images + $this->scrape_dimg($html); + + // parse styles + $this->parsestyles(); + + $center_col = $this->fuckhtml - ->getElementsByClassName( - $this->findstyles( - [ - "padding" => "16px", - "position" => "relative" - ], - self::is_class - ) + ->getElementById( + "center_col", + "div" ); - $return = []; + if($center_col === null){ + + throw new Exception("Could not grep result div"); + } + + $this->fuckhtml->load($center_col); + + // get next page + $npt = + $this->fuckhtml + ->getElementById( + "pnnext", + "a" + ); - for($i=0; $i<count($carousels); $i++){ + if($npt !== false){ + + $out["npt"] = + $this->backend->store( + $this->fuckhtml + ->getTextContent( + $npt["attributes"] + ["href"] + ), + "news", + $proxy + ); + } + + $as = + $this->fuckhtml + ->getElementsByAttributeName( + "jsname", + "a" + ); + + foreach($as as $a){ + + $this->fuckhtml->load($a); + + // get title + $title = + $this->fuckhtml + ->getElementsByAttributeValue( + "role", + "heading", + "div" + ); - if(!isset($carousels[$i]["outerHTML"])){ + if(count($title) === 0){ continue; } - $this->fuckhtml->load($carousels[$i]); + $title = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0] + ) + ); - if($item_to_remove !== false){ - - $item_to_remove = - str_replace( - $carousels[$i]["outerHTML"], - "", - $item_to_remove - ); - } + // get thumbnail + $image = + $this->fuckhtml + ->getElementsByAttributeName( + "id", + "img" + ); - $pcitems = + // check for padded title node, if found, we're inside a carousel + $probe = $this->fuckhtml ->getElementsByClassName( - "pcitem", + $this->getstyle( + [ + "padding" => "16px 16px 40px 16px" + ] + ), "div" ); - foreach($pcitems as $pcitem){ - - $this->fuckhtml->load($pcitem); + if(count($probe) !== 0){ - $out = [ - "url" => null, - "ref" => null, - "image" => null, - "thumb_width" => null, - "thumb_height" => null, - "image_width" => null, - "image_height" => null, - "title" => null, - "description" => null, - "subtext" => null, - "date" => null - ]; - - $url = - $this->unshiturl( - $this->fuckhtml - ->getElementsByTagName("a") - [0] - ["attributes"] - ["href"], - true - ); + $probe = true; + }else{ - // set ref - $out["ref"] = $url["ref"]; + $probe = false; + } + + if( + count($image) !== 0 && + !isset($image[0]["attributes"]["width"]) + ){ - // set url - $out["url"] = $url["url"]; + $thumb = [ + "url" => + $this->getdimg( + $image[0]["attributes"]["id"] + ), + "ratio" => $probe === true ? "16:9" : "1:1" + ]; + }else{ - // set sizes - $out["thumb_width"] = $url["thumb_width"]; - $out["thumb_height"] = $url["thumb_height"]; - $out["image_width"] = $url["image_width"]; - $out["image_height"] = $url["image_height"]; + $thumb = [ + "url" => null, + "ratio" => null + ]; + } + + $description = null; + + if($probe === false){ - // get image - $out["image"] = + $desc_divs = $this->fuckhtml - ->getElementsByTagName( - "img" + ->getElementsByAttributeName( + "style", + "div" ); - if(count($out["image"]) !== 0){ + foreach($desc_divs as $desc){ - // get title from image - if(isset($out["image"][0]["attributes"]["alt"])){ + if( + strpos( + $desc["attributes"]["style"], + "margin-top:" + ) !== false + ){ - $out["title"] = + $description = $this->titledots( $this->fuckhtml ->getTextContent( - $out["image"][0]["attributes"]["alt"] + $desc ) ); + break; } - - // get image url - if(isset($out["image"][0]["attributes"]["id"])){ - - $out["image"] = $this->getimage($out["image"][0]["attributes"]["id"]); - } - - elseif(isset($out["image"][0]["attributes"]["data-ll"])){ - - $out["image"] = - $this->fuckhtml - ->getTextContent( - $out["image"][0]["attributes"]["data-ll"] - ); - }else{ - - // failed to get image information - $out["image"] = null; - } - - if($out["image"] == ''){ - - // found arrow image base64, skip entry - continue; - } - }else{ - - // Could not find any image in node - $out["image"] = null; } + } + + // get author + $author = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "overflow" => "hidden", + "text-align" => "left", + "text-overflow" => "ellipsis", + "white-space" => "nowrap", + "margin-bottom" => "8px" + ] + ), + "div" + ); + + if(count($author) !== 0){ - // get title from spans - $title = + $author = $this->fuckhtml - ->getElementsByClassName( - $this->findstyles( - [ - "color" => "#1967d2" - ], - self::is_class - ), - "span" + ->getTextContent( + $author[0] ); + }else{ - if(count($title) !== 0){ - - $out["title"] = - $this->fuckhtml - ->getTextContent( - $title[0] - ); - } + $author = null; + } + + // get date + $date = null; + + $date_div = + $this->fuckhtml + ->getElementsByAttributeName( + "style", + "div" + ); + + foreach($date_div as $d){ - // get textnodes - $textnodes = + $this->fuckhtml->load($d); + + $span = $this->fuckhtml - ->getElementsByClassName( - $this->findstyles( - [ - "white-space" => "pre-line", - "word-wrap" => "break-word" - ], - self::is_class - ) + ->getElementsByTagName( + "span" ); - $subtext = null; - - if(count($textnodes) !== 0){ + if( + strpos( + $d["attributes"]["style"], + "bottom:" + ) !== false + ){ - // get date $date = - $this->fuckhtml - ->getTextContent( - $textnodes[count($textnodes) - 1], - true - ); - - if(str_replace("\n", " ", $date) == $title){ - - $date = null; - }else{ - - if(strpos($date, "\n") !== false){ - - $date = explode("\n", $date); - $date = $date[count($date) - 1]; - } - elseif(strpos($date, "•") !== false){ - - $date = explode("•", $date); - $date = ltrim($date[count($date) - 1]); - }else{ - - $date = null; - } - } - - if($date !== null){ - - $date = strtotime($date); - } - - // get description - $description = - $this->fuckhtml - ->getTextContent( - $textnodes[0] - ); - - if($out["title"] === null){ - - if($date === null){ - - $out["title"] = $description; - $description = null; - }else{ - - $out["title"] = parse_url($out["url"], PHP_URL_HOST); - } - } - - if(isset($textnodes[1])){ - - $out["subtext"] = + strtotime( $this->fuckhtml ->getTextContent( - $textnodes[1] - ); - } + $span[count($span) - 1] + ) + ); + break; + } + } + + $out["news"][] = [ + "title" => $title, + "author" => $author, + "description" => $description, + "date" => $date, + "thumb" => $thumb, + "url" => + $this->unshiturl( + $a["attributes"] + ["href"] + ) + ]; + } + + return $out; + } + + + + + public function image($get){ + + // generate parameters + if($get["npt"]){ + + [$params, $proxy] = + $this->backend->get( + $get["npt"], + "images" + ); + + $params = json_decode($params, true); + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $time = $get["time"]; + $size = $get["size"]; + $ratio = $get["ratio"]; + $color = $get["color"]; + $type = $get["type"]; + $format = $get["format"]; + $rights = $get["rights"]; + + $params = [ + "q" => $search, + "udm" => "2" // get images + ]; + + // country (image search uses cr instead of gl) + if($country != "any"){ + + $params["cr"] = "country" . strtoupper($country); + } + + // nsfw + $params["safe"] = $nsfw == "yes" ? "off" : "active"; + + // generate tbs + $tbs = []; + + // time + if($time != "any"){ + + $tbs["qdr"] = $time; + } + + // size + if($size != "any"){ + + $params["imgsz"] = $size; + } + + // ratio + if($ratio != "any"){ + + $params["imgar"] = $ratio; + } + + // color + if($color != "any"){ + + if( + $color == "color" || + $color == "trans" + ){ + + $params["imgc"] = $color; + }elseif($color == "bnw"){ + $params["imgc"] = "gray"; }else{ - $date = null; - $description = null; + $tbs["ic"] = "specific"; + $tbs["isc"] = $color; } + } + + // type + if($type != "any"){ - $out["date"] = $date; - $out["description"] = $this->titledots($description); + $tbs["itp"] = $type; + } + + // format + if($format != "any"){ - if($out["url"] === null){ - - $out["url"] = $out["title"]; - } + $params["as_filetype"] = $format; + } + + // rights (tbs) + if($rights != "any"){ - if($out["title"] == $out["description"]){ + $tbs["sur"] = $rights; + } + + // append tbs + if(count($tbs) !== 0){ + + $params["tbs"] = ""; + + foreach($tbs as $key => $value){ - $out["description"] = null; + $params["tbs"] .= $key . ":" . $value . ","; } - $return[$i][] = $out; + $params["tbs"] = rtrim($params["tbs"], ","); } } + /* + $handle = fopen("scraper/google-img.html", "r"); + $html = fread($handle, filesize("scraper/google-img.html")); + fclose($handle);*/ - return $return; - } - - private function unshiturl($url, $return_size = false){ + try{ + $html = + $this->get( + $proxy, + "https://www.google.com/search", + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to get search page"); + } - // get parameters from URL - $url = + $this->fuckhtml->load($html); + + $this->detect_sorry(); + + // get javascript images + $this->scrape_imagearr($html); + + $out = [ + "status" => "ok", + "npt" => null, + "image" => [] + ]; + + $images = $this->fuckhtml - ->getTextContent($url); + ->getElementsByClassName( + "ivg-i", + "div" + ); - $newurl = parse_url($url, PHP_URL_QUERY); + foreach($images as $div){ + + $this->fuckhtml->load($div); + + $image = + $this->fuckhtml + ->getElementsByTagName("img")[0]; + + $out["image"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $image["attributes"]["alt"] + ) + ), + "source" => + $this->image_arr[ + $div["attributes"]["data-docid"] + ], + "url" => + $this->fuckhtml + ->getTextContent( + $div["attributes"]["data-lpage"] + ) + ]; + } - if($newurl == ""){ + // as usual, no way to check if there is a next page reliably + if(count($out["image"]) > 50){ - // probably telephone number - return $url; + if(!isset($params["start"])){ + + $params["start"] = 10; + }else{ + + $params["start"] += 10; + } + + $out["npt"] = + $this->backend + ->store( + json_encode($params), + "image", + $proxy + ); } - $url = $newurl; - unset($newurl); + return $out; + } + + private function unshiturl($url, $return_size = false){ - parse_str($url, $query); + // decode + $url = + $this->fuckhtml + ->getTextContent($url); + + $url_parts = parse_url($url); - if(isset($query["imgurl"])){ + if( + !isset( + $url_parts["host"] + ) + ){ - $url = $query["imgurl"]; - } - elseif(isset($query["q"])){ + // no host, we have a tracking url + parse_str($url_parts["query"], $query); - $url = $query["q"]; + if(isset($query["imgurl"])){ + + $url = $query["imgurl"]; + } + elseif(isset($query["q"])){ + + $url = $query["q"]; + } } // rewrite URLs to remove extra tracking parameters @@ -3567,7 +4702,6 @@ class google{ $domain ) ){ - // remove more referrers from twitter.com $oldquery = parse_url($url, PHP_URL_QUERY); if($oldquery !== null){ @@ -3577,14 +4711,9 @@ class google{ $query = http_build_query($query); - if($query != ""){ - - $query .= "?" . $query; - } - $url = str_replace( - '?' . $oldquery, + $oldquery, $query, $url ); @@ -3644,47 +4773,46 @@ class google{ private function titledots($title){ - return rtrim($title, ". \t\n\r\0\x0B"); + return trim($title, " .\t\n\r\0\x0B…"); } - private function detect_sorry($html){ + private function hms2int($time){ - $this->fuckhtml->load($html); - $detect_sorry = - $this->fuckhtml - ->getElementsByTagName("title"); + $parts = explode(":", $time, 3); + $time = 0; - if( - isset($detect_sorry[0]) && - $detect_sorry[0]["innerHTML"] == "302 Moved" - ){ - - // may be consent.google.com in europe or /sorry captcha page - $url = - $this->fuckhtml - ->getElementsByTagName("a"); + if(count($parts) === 3){ - if( - strpos( - parse_url( - $this->fuckhtml - ->getTextContent( - $url[0]["attributes"]["href"] - ), - PHP_URL_PATH - ), - "/sorry" - ) === 0 - ){ - - // found /sorry - return "Google blocked this 4get instance. Please setup a proxy!"; - } + // hours + $time = $time + ((int)$parts[0] * 3600); + array_shift($parts); + } + + if(count($parts) === 2){ - // found consent.google, should not happen anymore - return "Google served a GPDR consent form. This should not happen, please report if you encounter this message"; + // minutes + $time = $time + ((int)$parts[0] * 60); + array_shift($parts); } - return false; + // seconds + $time = $time + (int)$parts[0]; + + return $time; + } + + private function detect_sorry(){ + + $recaptcha = + $this->fuckhtml + ->getElementById( + "recaptcha", + "div" + ); + + if($recaptcha !== false){ + + throw new Exception("Google returned a captcha"); + } } } |