diff options
author | lolcat <will@lolcat.ca> | 2023-11-09 08:50:53 -0500 |
---|---|---|
committer | lolcat <will@lolcat.ca> | 2023-11-09 08:50:53 -0500 |
commit | 9fd993b47b8c1b6e3c9d9f0e8d43a85860230f59 (patch) | |
tree | 591390482d9cbad144d387439247dd18eef2cdd8 | |
parent | 165d80f80bc88552de22f45becac25a07187fa8c (diff) |
fixed brave news
-rw-r--r-- | scraper/brave.php | 153 |
1 file changed, 38 insertions, 115 deletions
diff --git a/scraper/brave.php b/scraper/brave.php index 91e3f9e..bd1cd80 100644 --- a/scraper/brave.php +++ b/scraper/brave.php @@ -1139,131 +1139,54 @@ class brave{ $proxy ); - $news = - $this->fuckhtml - ->getElementsByClassName( - "snippet inline gap-standard", - "div" - ); + preg_match( + '/const data ?= ?(\[{.*}]);/', + $html, + $json + ); - foreach($news as $article){ - - $data = [ - "title" => null, - "author" => null, - "description" => null, - "date" => null, - "thumb" => - [ - "url" => null, - "ratio" => null - ], - "url" => null - ]; - - $this->fuckhtml->load($article); - $elems = - $this->fuckhtml - ->getElementsByTagName("*"); - - // get title - $data["title"] = - $this->fuckhtml - ->getTextContent( - $this->fuckhtml - ->getElementsByClassName( - "snippet-title", - $elems - ) - [0] - ["innerHTML"] - ); - - // get description - $data["description"] = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $this->fuckhtml - ->getElementsByClassName( - "snippet-description", - $elems - ) - [0] - ["innerHTML"] - ) - ); - - // get date - $date = - explode( - "•", - $this->fuckhtml - ->getTextContent( - $this->fuckhtml - ->getElementsByClassName( - "snippet-url", - $elems - )[0] - ) - ); - - if( - count($date) !== 1 && - trim($date[1]) != "" - ){ - - $data["date"] = - strtotime( - $date[1] - ); - } + if(!isset($json[1])){ - // get URL - $data["url"] = - $this->fuckhtml->getTextContent( - $this->unshiturl( - $this->fuckhtml - ->getElementsByClassName( - "result-header", - $elems - ) - [0] - ["attributes"] - ["href"] - ) - ); + throw new Exception("Failed to grep javascript object"); + } + + $json = $this->fuckhtml->parseJsObject($json[1], true); + + if($json === null){ - // get thumbnail - $thumb = - $this->fuckhtml - ->getElementsByTagName( - "img" - ); + throw new Exception("Failed to parse javascript object"); + } + + foreach( + $json[1]["data"]["body"]["response"]["news"]["results"] + as $news + ){ if( - count($thumb) === 2 && - trim( - $thumb[1] - 
["attributes"] - ["src"] - ) != "" + !isset($news["thumbnail"]["src"]) || + $news["thumbnail"]["src"] == "void 0" ){ - - $data["thumb"] = [ - "url" => - $this->fuckhtml->getTextContent( - $this->unshiturl( - $thumb[1] - ["attributes"] - ["src"] - ) - ), + + $thumb = [ + "url" => null, + "ratio" => null + ]; + }else{ + + $thumb = [ + "url" => $this->unshiturl($news["thumbnail"]["src"]), "ratio" => "16:9" ]; } - $out["news"][] = $data; + $out["news"][] = [ + "title" => $news["title"], + "author" => null, + "description" => $news["description"], + "date" => !isset($news["age"]) || $news["age"] == "void 0" ? null : strtotime($news["age"]), + "thumb" => $thumb, + "url" => $news["url"] + ]; } return $out; |