summary | refs | log | tree | commit | diff
path: root/scraper/googlealt.php
diff options
context:
space:
mode:
Diffstat (limited to 'scraper/googlealt.php')
-rw-r--r-- scraper/googlealt.php 278
1 files changed, 234 insertions, 44 deletions
diff --git a/scraper/googlealt.php b/scraper/googlealt.php
index 77c90a9..aa523db 100644
--- a/scraper/googlealt.php
+++ b/scraper/googlealt.php
@@ -531,11 +531,10 @@ class googlealt{
}
curl_setopt($curlproc, CURLOPT_URL, $url);
- curl_setopt($curlproc, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V6);
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
-
+ curl_setopt($curlproc, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V6);
// use http2
curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
@@ -2592,6 +2591,12 @@ class googlealt{
foreach($relateds as $related){
+ if(!isset($related["innerHTML"])){
+
+ // found an image
+ continue;
+ }
+
$text =
$this->fuckhtml
->getTextContent(
@@ -3193,41 +3198,52 @@ class googlealt{
$this->fuckhtml->load($header[0]);
- $title_tag =
- $this->fuckhtml
- ->getElementsByAttributeValue(
- "data-attrid",
- "title",
- "div"
- );
-
- if(count($title_tag) !== 0){
- $title =
+ // g-snackbar-action present: we found a button instead
+ if(
+ count(
$this->fuckhtml
- ->getTextContent(
- $title_tag[0]
- );
-
- $header[0]["innerHTML"] =
- str_replace(
- $title_tag[0]["outerHTML"],
- "",
- $header[0]["innerHTML"]
- );
+ ->getElementsByTagName(
+ "g-snackbar-action"
+ )
+ ) !== 0
+ ){
- // if header still contains text, add it as a subtitle in description
- $subtitle =
+ $title_tag =
$this->fuckhtml
- ->getTextContent(
- $header[0]
+ ->getElementsByAttributeValue(
+ "data-attrid",
+ "title",
+ "div"
);
- if(strlen($subtitle) !== 0){
+ if(count($title_tag) !== 0){
+ $title =
+ $this->fuckhtml
+ ->getTextContent(
+ $title_tag[0]
+ );
- $description[] = [
- "type" => "quote",
- "value" => $subtitle
- ];
+ $header[0]["innerHTML"] =
+ str_replace(
+ $title_tag[0]["outerHTML"],
+ "",
+ $header[0]["innerHTML"]
+ );
+
+ // if header still contains text, add it as a subtitle in description
+ $subtitle =
+ $this->fuckhtml
+ ->getTextContent(
+ $header[0]
+ );
+
+ if(strlen($subtitle) !== 0){
+
+ $description[] = [
+ "type" => "quote",
+ "value" => $subtitle
+ ];
+ }
}
}
@@ -3387,9 +3403,117 @@ class googlealt{
$this->fuckhtml->load($rhs);
}
- // abort if we didnt find any description
+ // initialize sublinks
+ $sublinks = [];
+
+ // get description from business
if(count($description) === 0){
+ $data_attrid =
+ $this->fuckhtml
+ ->getElementsByAttributeName(
+ "data-attrid"
+ );
+
+ $summary =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "data-attrid",
+ "kc:/local:one line summary",
+ $data_attrid
+ );
+
+ if(count($summary) !== 0){
+
+ $description[] = [
+ "type" => "quote",
+ "value" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $summary[0]
+ )
+ ];
+
+ // remove summary so it doesnt get parsed as a table
+ $rhs["innerHTML"] =
+ str_replace(
+ $summary[0]["outerHTML"],
+ "",
+ $rhs["innerHTML"]
+ );
+
+ $this->fuckhtml->load($rhs);
+ }
+
+ $address =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "data-attrid",
+ "kc:/location/location:address",
+ $data_attrid
+ );
+
+ if(count($address) !== 0){
+
+ $description[] = [
+ "type" => "text",
+ "value" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $address[0]
+ )
+ ];
+ }
+
+ // get title
+ $title_div =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "data-attrid",
+ "title",
+ $data_attrid
+ );
+
+ if(count($title_div) !== 0){
+
+ $title =
+ $this->fuckhtml
+ ->getTextContent(
+ $title_div[0]
+ );
+ }
+
+ // get phone number
+ $phone =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "data-attrid",
+ "kc:/local:alt phone",
+ $data_attrid
+ );
+
+ if(count($phone) !== 0){
+
+ $this->fuckhtml->load($phone[0]);
+
+ $sublinks["Call"] =
+ "tel:" .
+ $this->fuckhtml
+ ->getTextContent(
+ $this->fuckhtml
+ ->getElementsByAttributeName(
+ "aria-label",
+ "span"
+ )[0]
+ );
+
+ $this->fuckhtml->load($rhs);
+ }
+ }
+
+ if(count($description) === 0){
+
+ // still no description? abort
return $out;
}
@@ -3438,7 +3562,55 @@ class googlealt{
": "
);
- if($key == ""){
+ if(
+ $key == "" ||
+ $key == "Phone"
+ ){
+
+ continue;
+ }
+
+ if($key == "Hours"){
+
+ $hours = [];
+
+ $this->fuckhtml->load($elem);
+
+ $trs =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "tr"
+ );
+
+ foreach($trs as $tr){
+
+ $this->fuckhtml->load($tr);
+
+ $tds =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "td"
+ );
+
+ if(count($tds) === 2){
+
+ $hours[] =
+ $this->fuckhtml
+ ->getTextContent(
+ $tds[0]
+ ) . ": " .
+ $this->fuckhtml
+ ->getTextContent(
+ $tds[1]
+ );
+ }
+ }
+
+ if(count($hours) !== 0){
+
+ $hours = implode("\n", $hours);
+ $table["Hours"] = $hours;
+ }
continue;
}
@@ -3452,14 +3624,10 @@ class googlealt{
$elem
)
);
-
- // reset
- $this->fuckhtml->load($rhs);
}
-
- // get sublink elements
- $sublinks = [];
+ // reset
+ $this->fuckhtml->load($rhs);
// get the website div
$as =
@@ -3483,6 +3651,28 @@ class googlealt{
["href"]
)
);
+ }else{
+
+ // get website through button
+ $button =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "ab_button",
+ "a"
+ );
+
+ if(count($button) !== 0){
+
+ $sublinks["Website"] =
+ $this->unshiturl(
+ $this->fuckhtml
+ ->getTextContent(
+ $button[0]
+ ["attributes"]
+ ["href"]
+ )
+ );
+ }
}
// get social media links
@@ -3911,7 +4101,7 @@ class googlealt{
$html =
$this->get(
$proxy,
- "https://ipv6.google.com/search",
+ "https://www.google.com/search",
$params
);
}catch(Exception $error){
@@ -4008,7 +4198,7 @@ class googlealt{
$html =
$this->get(
$proxy,
- "https://ipv6.google.com/search",
+ "https://www.google.com/search",
$params
);
}catch(Exception $error){
@@ -4067,7 +4257,7 @@ class googlealt{
$html =
$this->get(
$proxy,
- "https://ipv6.google.com" . $req,
+ "https://www.google.com" . $req,
[]
);
}catch(Exception $error){
@@ -4140,7 +4330,7 @@ class googlealt{
$html =
$this->get(
$proxy,
- "https://ipv6.google.com/search",
+ "https://www.google.com/search",
$params
);
}
@@ -4521,7 +4711,7 @@ class googlealt{
$html =
$this->get(
$proxy,
- "https://ipv6.google.com/search",
+ "https://www.google.com/search",
$params
);
}catch(Exception $error){