summary refs log tree commit diff
path: root/scraper/google.php
diff options
context:
space:
mode:
author: lolcat <will@lolcat.ca> 2023-07-27 23:06:49 -0400
committer: lolcat <will@lolcat.ca> 2023-07-27 23:06:49 -0400
commit: 7c771c82c8e03b337f9f03ae2d4afc25d3f0faca (patch)
tree: 7a4deca784caee3aebaf29b99f7f1f087a2bb9c9 /scraper/google.php
parent: 16ee0b368fcf24b48574172726e32a19c275d691 (diff)
bug fixes (without google support)
Diffstat (limited to 'scraper/google.php')
-rw-r--r-- scraper/google.php 346
1 files changed, 175 insertions, 171 deletions
diff --git a/scraper/google.php b/scraper/google.php
index df10754..28ede6d 100644
--- a/scraper/google.php
+++ b/scraper/google.php
@@ -1565,18 +1565,17 @@ class google{
}
/*
- Fallback to parsing it as an embed
+ Detect if it's a Wikipedia thing
*/
+ $h3 =
+ $this->fuckhtml
+ ->getElementsByTagName("h3");
+
- $table = [
- "title" => null,
- "description" => [],
- "url" => null,
- "thumb" => null,
- "table" => [],
- "sublink" => []
- ];
+ /*
+ Fallback to parsing the word definitions
+ */
$parts =
$this->fuckhtml
->getElementsByClassName(
@@ -1596,12 +1595,17 @@ class google{
$head = $parts[0];
- $h3 =
- $this->fuckhtml
- ->getElementsByTagName("h3");
-
if(count($h3) !== 0){
+ $table = [
+ "title" => null,
+ "description" => [],
+ "url" => null,
+ "thumb" => null,
+ "table" => [],
+ "sublink" => []
+ ];
+
$h3 = $h3[0];
$table["title"] =
@@ -1626,201 +1630,201 @@ class google{
$head
)
];
- }
-
- $audio =
- $this->fuckhtml
- ->getElementsByTagName("audio");
-
- if(count($audio) !== 0){
- $table["description"][] = [
- "type" => "audio",
- "url" =>
- str_replace(
- "http://",
- "https://",
- $this->fuckhtml
- ->getTextContent(
- $audio[0]["attributes"]["src"]
- )
- )
- ];
- }
-
- if(count($parts) >= 2){
-
- $this->fuckhtml->load($parts[1]);
-
- $parts =
+ $audio =
$this->fuckhtml
- ->getElementsByClassName(
- $this->findstyles(
- [
- "padding-bottom" => "12px"
- ],
- self::is_class
- ),
- "div"
- );
+ ->getElementsByTagName("audio");
- foreach($parts as $part){
+ if(count($audio) !== 0){
- $this->fuckhtml->load($part);
+ $table["description"][] = [
+ "type" => "audio",
+ "url" =>
+ str_replace(
+ "http://",
+ "https://",
+ $this->fuckhtml
+ ->getTextContent(
+ $audio[0]["attributes"]["src"]
+ )
+ )
+ ];
+ }
+
+ if(count($parts) >= 2){
+
+ $this->fuckhtml->load($parts[1]);
- $lists =
+ $parts =
$this->fuckhtml
- ->getElementsByTagName("ol");
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "padding-bottom" => "12px"
+ ],
+ self::is_class
+ ),
+ "div"
+ );
- if(count($lists) !== 0){
+ foreach($parts as $part){
- foreach($lists as $list){
-
- $this->fuckhtml->load($list);
-
- $list_items =
- $this->fuckhtml
- ->getElementsByTagName("li");
-
- $index = 0;
+ $this->fuckhtml->load($part);
+
+ $lists =
+ $this->fuckhtml
+ ->getElementsByTagName("ol");
+
+ if(count($lists) !== 0){
- if(count($list_items) !== 0){
+ foreach($lists as $list){
- foreach($list_items as $list_item){
-
- $index++;
-
- $this->fuckhtml->load($list_item);
-
- $list_subitems =
- $this->fuckhtml
- ->getElementsByTagName("div");
+ $this->fuckhtml->load($list);
+
+ $list_items =
+ $this->fuckhtml
+ ->getElementsByTagName("li");
+
+ $index = 0;
+
+ if(count($list_items) !== 0){
- foreach($list_subitems as $subitem){
+ foreach($list_items as $list_item){
- if($subitem["level"] !== 1){ continue; }
+ $index++;
- $this->fuckhtml->load($subitem);
+ $this->fuckhtml->load($list_item);
- $spans =
+ $list_subitems =
$this->fuckhtml
- ->getElementsByTagName("span");
+ ->getElementsByTagName("div");
- if(count($spans) !== 0){
+ foreach($list_subitems as $subitem){
- $type = "quote";
- }else{
+ if($subitem["level"] !== 1){ continue; }
- $type = "text";
- }
-
- $value =
- $this->fuckhtml
- ->getTextContent(
- $subitem
- );
-
- if($type == "text"){
+ $this->fuckhtml->load($subitem);
+
+ $spans =
+ $this->fuckhtml
+ ->getElementsByTagName("span");
+
+ if(count($spans) !== 0){
+
+ $type = "quote";
+ }else{
+
+ $type = "text";
+ }
+
+ $value =
+ $this->fuckhtml
+ ->getTextContent(
+ $subitem
+ );
- $value = $index . ". " . $value;
+ if($type == "text"){
+
+ $value = $index . ". " . $value;
+ }
+
+ $table["description"][] = [
+ "type" => $type,
+ "value" => $value
+ ];
}
-
- $table["description"][] = [
- "type" => $type,
- "value" => $value
- ];
}
}
}
- }
-
- continue;
- }
-
- // get title
- $spans =
- $this->fuckhtml
- ->getElementsByTagName("span");
-
- if(count($spans) !== 0){
-
- foreach($spans as $span){
- $part["innerHTML"] =
- str_replace(
- $span["outerHTML"],
- "",
- $part["innerHTML"]
- );
+ continue;
}
- if(
+ // get title
+ $spans =
$this->fuckhtml
- ->getTextContent(
- $part
- )
- == ""
- ){
-
- $table["description"][] = [
- "type" => "title",
- "value" =>
- $this->fuckhtml
- ->getTextContent(
- $spans[0]
- )
- ];
-
- continue;
- }
- }
-
- // fallback to getting non-numbered list
- $nlist =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->findstyles(
- [
- "white-space" => "pre-line",
- "word-wrap" => "break-word"
- ],
- self::is_class
- ),
- "div"
- );
-
- if(count($nlist) !== 0){
+ ->getElementsByTagName("span");
- foreach($nlist as $nlist_item){
+ if(count($spans) !== 0){
- $text =
- $this->fuckhtml
- ->getTextContent($nlist_item);
-
- if($text == ""){
+ foreach($spans as $span){
- continue;
+ $part["innerHTML"] =
+ str_replace(
+ $span["outerHTML"],
+ "",
+ $part["innerHTML"]
+ );
}
- $this->fuckhtml->load($nlist_item);
-
- $spans =
+ if(
$this->fuckhtml
- ->getElementsByTagName("span");
-
- if(count($spans) !== 0){
+ ->getTextContent(
+ $part
+ )
+ == ""
+ ){
- // is a quote node
- $type = "quote";
- }else{
+ $table["description"][] = [
+ "type" => "title",
+ "value" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $spans[0]
+ )
+ ];
- $type = "text";
+ continue;
}
+ }
+
+ // fallback to getting non-numbered list
+ $nlist =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "white-space" => "pre-line",
+ "word-wrap" => "break-word"
+ ],
+ self::is_class
+ ),
+ "div"
+ );
+
+ if(count($nlist) !== 0){
- $table["description"][] = [
- "type" => $type,
- "value" => $text
- ];
+ foreach($nlist as $nlist_item){
+
+ $text =
+ $this->fuckhtml
+ ->getTextContent($nlist_item);
+
+ if($text == ""){
+
+ continue;
+ }
+
+ $this->fuckhtml->load($nlist_item);
+
+ $spans =
+ $this->fuckhtml
+ ->getElementsByTagName("span");
+
+ if(count($spans) !== 0){
+
+ // is a quote node
+ $type = "quote";
+ }else{
+
+ $type = "text";
+ }
+
+ $table["description"][] = [
+ "type" => $type,
+ "value" => $text
+ ];
+ }
}
}
}