diff options
Diffstat (limited to 'scraper')
-rw-r--r-- | scraper/google.php | 346 |
1 files changed, 175 insertions, 171 deletions
diff --git a/scraper/google.php b/scraper/google.php index df10754..28ede6d 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -1565,18 +1565,17 @@ class google{ } /* - Fallback to parsing it as an embed + Detect if its a wikipedia thing */ + $h3 = + $this->fuckhtml + ->getElementsByTagName("h3"); + - $table = [ - "title" => null, - "description" => [], - "url" => null, - "thumb" => null, - "table" => [], - "sublink" => [] - ]; + /* + Fallback to parsing the word definitions + */ $parts = $this->fuckhtml ->getElementsByClassName( @@ -1596,12 +1595,17 @@ class google{ $head = $parts[0]; - $h3 = - $this->fuckhtml - ->getElementsByTagName("h3"); - if(count($h3) !== 0){ + $table = [ + "title" => null, + "description" => [], + "url" => null, + "thumb" => null, + "table" => [], + "sublink" => [] + ]; + $h3 = $h3[0]; $table["title"] = @@ -1626,201 +1630,201 @@ class google{ $head ) ]; - } - - $audio = - $this->fuckhtml - ->getElementsByTagName("audio"); - - if(count($audio) !== 0){ - $table["description"][] = [ - "type" => "audio", - "url" => - str_replace( - "http://", - "https://", - $this->fuckhtml - ->getTextContent( - $audio[0]["attributes"]["src"] - ) - ) - ]; - } - - if(count($parts) >= 2){ - - $this->fuckhtml->load($parts[1]); - - $parts = + $audio = $this->fuckhtml - ->getElementsByClassName( - $this->findstyles( - [ - "padding-bottom" => "12px" - ], - self::is_class - ), - "div" - ); + ->getElementsByTagName("audio"); - foreach($parts as $part){ + if(count($audio) !== 0){ - $this->fuckhtml->load($part); + $table["description"][] = [ + "type" => "audio", + "url" => + str_replace( + "http://", + "https://", + $this->fuckhtml + ->getTextContent( + $audio[0]["attributes"]["src"] + ) + ) + ]; + } + + if(count($parts) >= 2){ + + $this->fuckhtml->load($parts[1]); - $lists = + $parts = $this->fuckhtml - ->getElementsByTagName("ol"); + ->getElementsByClassName( + $this->findstyles( + [ + "padding-bottom" => "12px" + ], + self::is_class + ), + "div" + ); - if(count($lists) !== 0){ + foreach($parts as $part){ - foreach($lists as $list){ - - $this->fuckhtml->load($list); - - $list_items = - $this->fuckhtml - ->getElementsByTagName("li"); - - $index = 0; + $this->fuckhtml->load($part); + + $lists = + $this->fuckhtml + ->getElementsByTagName("ol"); + + if(count($lists) !== 0){ - if(count($list_items) !== 0){ + foreach($lists as $list){ - foreach($list_items as $list_item){ - - $index++; - - $this->fuckhtml->load($list_item); - - $list_subitems = - $this->fuckhtml - ->getElementsByTagName("div"); + $this->fuckhtml->load($list); + + $list_items = + $this->fuckhtml + ->getElementsByTagName("li"); + + $index = 0; + + if(count($list_items) !== 0){ - foreach($list_subitems as $subitem){ + foreach($list_items as $list_item){ - if($subitem["level"] !== 1){ continue; } + $index++; - $this->fuckhtml->load($subitem); + $this->fuckhtml->load($list_item); - $spans = + $list_subitems = $this->fuckhtml - ->getElementsByTagName("span"); + ->getElementsByTagName("div"); - if(count($spans) !== 0){ + foreach($list_subitems as $subitem){ - $type = "quote"; - }else{ + if($subitem["level"] !== 1){ continue; } - $type = "text"; - } - - $value = - $this->fuckhtml - ->getTextContent( - $subitem - ); - - if($type == "text"){ + $this->fuckhtml->load($subitem); + + $spans = + $this->fuckhtml + ->getElementsByTagName("span"); + + if(count($spans) !== 0){ + + $type = "quote"; + }else{ + + $type = "text"; + } + + $value = + $this->fuckhtml + ->getTextContent( + $subitem + ); - $value = $index . ". " . $value; + if($type == "text"){ + + $value = $index . ". " . $value; + } + + $table["description"][] = [ + "type" => $type, + "value" => $value + ]; } - - $table["description"][] = [ - "type" => $type, - "value" => $value - ]; } } } - } - - continue; - } - - // get title - $spans = - $this->fuckhtml - ->getElementsByTagName("span"); - - if(count($spans) !== 0){ - - foreach($spans as $span){ - $part["innerHTML"] = - str_replace( - $span["outerHTML"], - "", - $part["innerHTML"] - ); + continue; } - if( + // get title + $spans = $this->fuckhtml - ->getTextContent( - $part - ) - == "" - ){ - - $table["description"][] = [ - "type" => "title", - "value" => - $this->fuckhtml - ->getTextContent( - $spans[0] - ) - ]; - - continue; - } - } - - // fallback to getting non-numbered list - $nlist = - $this->fuckhtml - ->getElementsByClassName( - $this->findstyles( - [ - "white-space" => "pre-line", - "word-wrap" => "break-word" - ], - self::is_class - ), - "div" - ); - - if(count($nlist) !== 0){ + ->getElementsByTagName("span"); - foreach($nlist as $nlist_item){ + if(count($spans) !== 0){ - $text = - $this->fuckhtml - ->getTextContent($nlist_item); - - if($text == ""){ + foreach($spans as $span){ - continue; + $part["innerHTML"] = + str_replace( + $span["outerHTML"], + "", + $part["innerHTML"] + ); } - $this->fuckhtml->load($nlist_item); - - $spans = + if( $this->fuckhtml - ->getElementsByTagName("span"); - - if(count($spans) !== 0){ + ->getTextContent( + $part + ) + == "" + ){ - // is a quote node - $type = "quote"; - }else{ + $table["description"][] = [ + "type" => "title", + "value" => + $this->fuckhtml + ->getTextContent( + $spans[0] + ) + ]; - $type = "text"; + continue; } + } + + // fallback to getting non-numbered list + $nlist = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "white-space" => "pre-line", + "word-wrap" => "break-word" + ], + self::is_class + ), + "div" + ); + + if(count($nlist) !== 0){ - $table["description"][] = [ - "type" => $type, - "value" => $text - ]; + foreach($nlist as $nlist_item){ + + $text = + $this->fuckhtml + ->getTextContent($nlist_item); + + if($text == ""){ + + continue; + } + + $this->fuckhtml->load($nlist_item); + + $spans = + $this->fuckhtml + ->getElementsByTagName("span"); + + if(count($spans) !== 0){ + + // is a quote node + $type = "quote"; + }else{ + + $type = "text"; + } + + $table["description"][] = [ + "type" => $type, + "value" => $text + ]; + } } } } |