diff options
author | gesang <gesang@itinerariummentis.org> | 2024-06-23 16:17:21 +0000 |
---|---|---|
committer | gesang <gesang@itinerariummentis.org> | 2024-06-23 16:17:21 +0000 |
commit | 121ab66eecd5798b10461274b8f210f1caa3009a (patch) | |
tree | 3d8ee2c5c95398845681f509325ad900a67a7789 /scraper | |
parent | a5f137dee7ed176a5b4577f85a5d6c0abc96d280 (diff) |
ipv6
Diffstat (limited to 'scraper')
-rw-r--r-- | scraper/googlealt.php | 278 |
1 files changed, 234 insertions, 44 deletions
diff --git a/scraper/googlealt.php b/scraper/googlealt.php index 77c90a9..aa523db 100644 --- a/scraper/googlealt.php +++ b/scraper/googlealt.php @@ -531,11 +531,10 @@ class googlealt{ } curl_setopt($curlproc, CURLOPT_URL, $url); - curl_setopt($curlproc, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V6); curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); - + curl_setopt($curlproc, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V6); // use http2 curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); @@ -2592,6 +2591,12 @@ class googlealt{ foreach($relateds as $related){ + if(!isset($related["innerHTML"])){ + + // found an image + continue; + } + $text = $this->fuckhtml ->getTextContent( @@ -3193,41 +3198,52 @@ class googlealt{ $this->fuckhtml->load($header[0]); - $title_tag = - $this->fuckhtml - ->getElementsByAttributeValue( - "data-attrid", - "title", - "div" - ); - - if(count($title_tag) !== 0){ - $title = + // g-snackbar-action present: we found a button instead + if( + count( $this->fuckhtml - ->getTextContent( - $title_tag[0] - ); - - $header[0]["innerHTML"] = - str_replace( - $title_tag[0]["outerHTML"], - "", - $header[0]["innerHTML"] - ); + ->getElementsByTagName( + "g-snackbar-action" + ) + ) !== 0 + ){ - // if header still contains text, add it as a subtitle in description - $subtitle = + $title_tag = $this->fuckhtml - ->getTextContent( - $header[0] + ->getElementsByAttributeValue( + "data-attrid", + "title", + "div" ); - if(strlen($subtitle) !== 0){ + if(count($title_tag) !== 0){ + $title = + $this->fuckhtml + ->getTextContent( + $title_tag[0] + ); - $description[] = [ - "type" => "quote", - "value" => $subtitle - ]; + $header[0]["innerHTML"] = + str_replace( + $title_tag[0]["outerHTML"], + "", + $header[0]["innerHTML"] + ); + + // if header still contains text, add it as a subtitle in description + $subtitle = + $this->fuckhtml + ->getTextContent( + $header[0] + ); + + if(strlen($subtitle) !== 0){ + + $description[] = [ + "type" => "quote", + "value" => $subtitle + ]; + } } } @@ -3387,9 +3403,117 @@ class googlealt{ $this->fuckhtml->load($rhs); } - // abort if we didnt find any description + // initialize sublinks + $sublinks = []; + + // get description from business if(count($description) === 0){ + $data_attrid = + $this->fuckhtml + ->getElementsByAttributeName( + "data-attrid" + ); + + $summary = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-attrid", + "kc:/local:one line summary", + $data_attrid + ); + + if(count($summary) !== 0){ + + $description[] = [ + "type" => "quote", + "value" => + $this->fuckhtml + ->getTextContent( + $summary[0] + ) + ]; + + // remove summary so it doesnt get parsed as a table + $rhs["innerHTML"] = + str_replace( + $summary[0]["outerHTML"], + "", + $rhs["innerHTML"] + ); + + $this->fuckhtml->load($rhs); + } + + $address = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-attrid", + "kc:/location/location:address", + $data_attrid + ); + + if(count($address) !== 0){ + + $description[] = [ + "type" => "text", + "value" => + $this->fuckhtml + ->getTextContent( + $address[0] + ) + ]; + } + + // get title + $title_div = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-attrid", + "title", + $data_attrid + ); + + if(count($title_div) !== 0){ + + $title = + $this->fuckhtml + ->getTextContent( + $title_div[0] + ); + } + + // get phone number + $phone = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-attrid", + "kc:/local:alt phone", + $data_attrid + ); + + if(count($phone) !== 0){ + + $this->fuckhtml->load($phone[0]); + + $sublinks["Call"] = + "tel:" . + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByAttributeName( + "aria-label", + "span" + )[0] + ); + + $this->fuckhtml->load($rhs); + } + } + + if(count($description) === 0){ + + // still no description? abort return $out; } @@ -3438,7 +3562,55 @@ class googlealt{ ": " ); - if($key == ""){ + if( + $key == "" || + $key == "Phone" + ){ + + continue; + } + + if($key == "Hours"){ + + $hours = []; + + $this->fuckhtml->load($elem); + + $trs = + $this->fuckhtml + ->getElementsByTagName( + "tr" + ); + + foreach($trs as $tr){ + + $this->fuckhtml->load($tr); + + $tds = + $this->fuckhtml + ->getElementsByTagName( + "td" + ); + + if(count($tds) === 2){ + + $hours[] = + $this->fuckhtml + ->getTextContent( + $tds[0] + ) . ": " . + $this->fuckhtml + ->getTextContent( + $tds[1] + ); + } + } + + if(count($hours) !== 0){ + + $hours = implode("\n", $hours); + $table["Hours"] = $hours; + } continue; } @@ -3452,14 +3624,10 @@ class googlealt{ $elem ) ); - - // reset - $this->fuckhtml->load($rhs); } - - // get sublink elements - $sublinks = []; + // reset + $this->fuckhtml->load($rhs); // get the website div $as = @@ -3483,6 +3651,28 @@ class googlealt{ ["href"] ) ); + }else{ + + // get website through button + $button = + $this->fuckhtml + ->getElementsByClassName( + "ab_button", + "a" + ); + + if(count($button) !== 0){ + + $sublinks["Website"] = + $this->unshiturl( + $this->fuckhtml + ->getTextContent( + $button[0] + ["attributes"] + ["href"] + ) + ); + } } // get social media links @@ -3911,7 +4101,7 @@ class googlealt{ $html = $this->get( $proxy, - "https://ipv6.google.com/search", + "https://www.google.com/search", $params ); }catch(Exception $error){ @@ -4008,7 +4198,7 @@ class googlealt{ $html = $this->get( $proxy, - "https://ipv6.google.com/search", + "https://www.google.com/search", $params ); }catch(Exception $error){ @@ -4067,7 +4257,7 @@ class googlealt{ $html = $this->get( $proxy, - "https://ipv6.google.com" . $req, + "https://www.google.com" . $req, [] ); }catch(Exception $error){ @@ -4140,7 +4330,7 @@ class googlealt{ $html = $this->get( $proxy, - "https://ipv6.google.com/search", + "https://www.google.com/search", $params ); } @@ -4521,7 +4711,7 @@ class googlealt{ $html = $this->get( $proxy, - "https://ipv6.google.com/search", + "https://www.google.com/search", $params ); }catch(Exception $error){ |