diff options
author | lolcat <will@lolcat.ca> | 2023-08-27 01:45:59 -0400 |
---|---|---|
committer | lolcat <will@lolcat.ca> | 2023-08-27 01:45:59 -0400 |
commit | 1fd4c2de6d2552f4619c6e67aac60bc635465cd1 (patch) | |
tree | 12c015d718606e5d245008ea7a42899aafae0dd5 /scraper/google.php | |
parent | 12a6278a5fb0a207d2d24ae97957c37b5bd9b4d7 (diff) |
added yandex web and video search, removed fb search
Diffstat (limited to 'scraper/google.php')
-rw-r--r-- | scraper/google.php | 505 |
1 files changed, 502 insertions, 3 deletions
diff --git a/scraper/google.php b/scraper/google.php index 7ed3577..d0e90ca 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -824,8 +824,6 @@ class google{ $html = fread($handle, filesize("scraper/google.html")); fclose($handle); - $this->fuckhtml->load($html); - $out = [ "status" => "ok", "spelling" => [ @@ -841,6 +839,507 @@ class google{ "news" => [], "related" => [] ]; + + $this->parsejavascript($html); + + $containers = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "background-color" => "#fff", + "margin-bottom" => "10px", + "-webkit-box-shadow" => "0 1px 6px rgba(32,33,36,0.28)", + "border-radius" => "8px" + ], + self::is_class + ), + "div" + ); + + foreach($containers as $container){ + + $this->fuckhtml->load($container); + + $title = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "color" => "#1967d2", + "font-size" => "20px", + "line-height" => "26px" + ], + self::is_class + ), + "div" + ); + + if(count($title) !== 0){ + + /* + Container is a web link + */ + $web = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0] + ) + ), + "description" => null, + "url" => + $this->decodeurl( + $this->fuckhtml + ->getElementsByTagName("a") + [0] + ["attributes"] + ["href"] + ), + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + + $container = $container["innerHTML"]; + + $description_container = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "padding" => "12px 16px 12px" + ], + self::is_class + ), + "div" + )[1]; + + $description = + $description_container["innerHTML"]; + + // get sublinks + $this->fuckhtml->load($description); + + $links = + $this->fuckhtml + ->getElementsByTagName("a"); + + $skip = true; + foreach($links as $link){ + + $description = + str_replace( + $link["outerHTML"], + "", + $description + ); + + if($skip){ + + $skip = false; + continue; + } + + $sublink = [ + "title" => null, + "description" => null, + "url" => null, + "date" => null + ]; + + $sublink["title"] = + $this->fuckhtml + ->getTextContent( + $link + ); + + $sublink["url"] = + $this->decodeurl( + $link + ["attributes"] + ["href"] + ); + + $web["sublink"][] = $sublink; + } + + // get thumbnail before we call loadhtml again + $img = + $this->fuckhtml + ->getElementsByTagName("img"); + + if(count($img) !== 0){ + + if( + isset($img[0]["attributes"]["alt"]) && + stripos($img[0]["attributes"]["alt"], "Video for") !== false + ){ + + // is a video thumbnail + $web["thumb"]["ratio"] = "16:9"; + }else{ + + // is a google thumbnail + $web["thumb"]["ratio"] = "1:1"; + } + + $web["thumb"]["url"] = + $this->getimage( + $img[0]["attributes"]["id"] + ); + } + + // get table elements + $this->fuckhtml->load($description); + + $levels = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "padding-bottom" => "8px" + ], + self::is_class + ), + "div" + ); + + $additional_info = []; + foreach($levels as $level){ + + $this->fuckhtml->load($level); + + $spans = + $this->fuckhtml + ->getElementsByTagName( + "span" + ); + + $is_rating = -2; + + foreach($spans as $span){ + + // clean up description + $description = + str_replace( + $span["outerHTML"], + "", + $description + ); + + $innertext = + $this->fuckhtml + ->getTextContent( + $span + ); + + if($innertext == ""){ continue; } + + if( + strtolower($innertext) + == "rating" + ){ + + $is_rating = -1; + continue; + } + + /* + Parse rating object + */ + + if($is_rating >= -1){ + + if($span["level"] !== 1){ continue; } + + $is_rating++; + + // 10/10 (123) + if($is_rating === 0){ + + $innertext = explode(" ", $innertext, 2); + + $web["table"]["Rating"] = $innertext[0]; + $web["table"]["Hits"] = + trim( + str_replace( + [ + "(", + ")" + ], + "", + $innertext[1] + ) + ); + continue; + } + + // US$4.99 + // MYR 50.00 + // $38.34 + // JP¥6,480 + if($is_rating === 2){ + + $web["table"]["Price"] = $innertext; + continue; + } + + // Android / In stock + if($is_rating === 4){ + + $web["table"]["Support"] = $innertext; + continue; + } + + // ignore the rest + continue; + } + + /* + Parse standalone text + */ + $additional_info[] = $innertext; + } + } + + for($i=0; $i<count($additional_info); $i++){ + + // @TODO + // generate better node names + $web["table"]["Info node #$i"] = $additional_info[$i]; + } + + $this->fuckhtml->load($description); + + // get date node + $span = + $this->fuckhtml + ->getElementsByTagName( + "span" + ); + + if(count($span) !== 0){ + + $description = + str_replace( + $span[0]["outerHTML"], + "", + $description + ); + + $span = + strtotime( + $this->fuckhtml + ->getTextContent( + $span[0] + ) + ); + + if($span){ + + $web["date"] = $span; + } + } + + $web["description"] = + trim( + $this->fuckhtml + ->getTextContent( + $description + ), + " ·." + ); + + $out["web"][] = $web; + + continue; + } + + // check for container title header + $container_title = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "font-weight" => "bold", + "font-size" => "16px", + "color" => "#000", + "margin" => "0", + "padding" => "12px 16px 0 16px" + ], + self::is_class + ), + "div" + ); + + if(count($container_title) !== 0){ + + $container_title = + strtolower( + $this->fuckhtml + ->getTextContent( + $container_title[0] + ) + ); + + if( + $container_title == "related searches" || + $container_title == "people also search for" + ){ + + /* + Parse related searches + */ + $as = + $this->fuckhtml + ->getElementsByTagName("a"); + + foreach($as as $a){ + + $out["related"][] = + $this->fuckhtml + ->getTextContent($a); + } + } + + continue; + } + + /* + Parse image carousel + */ + $title_container = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "padding" => "12px 16px 12px" + ], + self::is_class + ), + "div" + ); + + if(count($title_container) !== 0){ + + $title_container = + strtolower( + $this->fuckhtml + ->getTextContent( + $title_container[0] + ) + ); + + if($title_container == "imagesview all"){ + + /* + Image carousel + */ + $pcitem = + $this->fuckhtml + ->getElementsByClassName( + "pcitem", + "div" + ); + + foreach($pcitem as $item){ + + $this->fuckhtml->load($item); + + $link = + $this->fuckhtml + ->getElementsByTagName( + "a" + )[0]; + + parse_str( + parse_url( + $this->fuckhtml + ->getTextContent( + $link + ["attributes"] + ["href"] + ), + PHP_URL_QUERY + ), + $link + ); + + if(isset($link["tbm"])){ + + continue; + } + + $image = + $this->fuckhtml + ->getElementsByTagName("img")[0]; + + $title = + $this->fuckhtml + ->getTextContent( + $image + ["attributes"] + ["alt"] + ); + + $image = + $this->getimage( + $image + ["attributes"] + ["id"] + ); + + $out["image"][] = [ + "title" => $title, + "source" => [ + [ + "url" => $link["imgurl"], + "width" => (int)$link["w"], + "height" => (int)$link["h"] + ], + [ + "url" => $image, + "width" => (int)$link["tbnw"], + "height" => (int)$link["tbnh"] + ] + ], + "url" => $link["imgrefurl"] + ]; + } + } + } + + /* + Get next page + */ + $as = + $this->fuckhtml + ->getElementsByTagName("a"); + + foreach($as as $a){ + + if( + isset($a["attributes"]["aria-label"]) && + strtolower($a["attributes"]["aria-label"]) == "next page" + ){ + + $out["npt"] = + $this->fuckhtml + ->getTextContent( + $a["attributes"]["href"] + ); + } + } + } + + return $out; } @@ -1163,7 +1662,7 @@ class google{ return $time; } - private function loadjavascriptcrap($html){ + private function parsejavascript($html){ $this->fuckhtml->load($html); |