summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author lolcat <will@lolcat.ca> 2023-11-09 08:50:53 -0500
committer lolcat <will@lolcat.ca> 2023-11-09 08:50:53 -0500
commit 9fd993b47b8c1b6e3c9d9f0e8d43a85860230f59 (patch)
tree 591390482d9cbad144d387439247dd18eef2cdd8
parent 165d80f80bc88552de22f45becac25a07187fa8c (diff)
fixed brave news
-rw-r--r-- scraper/brave.php 153
1 files changed, 38 insertions, 115 deletions
diff --git a/scraper/brave.php b/scraper/brave.php
index 91e3f9e..bd1cd80 100644
--- a/scraper/brave.php
+++ b/scraper/brave.php
@@ -1139,131 +1139,54 @@ class brave{
$proxy
);
- $news =
- $this->fuckhtml
- ->getElementsByClassName(
- "snippet inline gap-standard",
- "div"
- );
+ preg_match(
+ '/const data ?= ?(\[{.*}]);/',
+ $html,
+ $json
+ );
- foreach($news as $article){
-
- $data = [
- "title" => null,
- "author" => null,
- "description" => null,
- "date" => null,
- "thumb" =>
- [
- "url" => null,
- "ratio" => null
- ],
- "url" => null
- ];
-
- $this->fuckhtml->load($article);
- $elems =
- $this->fuckhtml
- ->getElementsByTagName("*");
-
- // get title
- $data["title"] =
- $this->fuckhtml
- ->getTextContent(
- $this->fuckhtml
- ->getElementsByClassName(
- "snippet-title",
- $elems
- )
- [0]
- ["innerHTML"]
- );
-
- // get description
- $data["description"] =
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $this->fuckhtml
- ->getElementsByClassName(
- "snippet-description",
- $elems
- )
- [0]
- ["innerHTML"]
- )
- );
-
- // get date
- $date =
- explode(
- "•",
- $this->fuckhtml
- ->getTextContent(
- $this->fuckhtml
- ->getElementsByClassName(
- "snippet-url",
- $elems
- )[0]
- )
- );
-
- if(
- count($date) !== 1 &&
- trim($date[1]) != ""
- ){
-
- $data["date"] =
- strtotime(
- $date[1]
- );
- }
+ if(!isset($json[1])){
- // get URL
- $data["url"] =
- $this->fuckhtml->getTextContent(
- $this->unshiturl(
- $this->fuckhtml
- ->getElementsByClassName(
- "result-header",
- $elems
- )
- [0]
- ["attributes"]
- ["href"]
- )
- );
+ throw new Exception("Failed to grep javascript object");
+ }
+
+ $json = $this->fuckhtml->parseJsObject($json[1], true);
+
+ if($json === null){
- // get thumbnail
- $thumb =
- $this->fuckhtml
- ->getElementsByTagName(
- "img"
- );
+ throw new Exception("Failed to parse javascript object");
+ }
+
+ foreach(
+ $json[1]["data"]["body"]["response"]["news"]["results"]
+ as $news
+ ){
if(
- count($thumb) === 2 &&
- trim(
- $thumb[1]
- ["attributes"]
- ["src"]
- ) != ""
+ !isset($news["thumbnail"]["src"]) ||
+ $news["thumbnail"]["src"] == "void 0"
){
-
- $data["thumb"] = [
- "url" =>
- $this->fuckhtml->getTextContent(
- $this->unshiturl(
- $thumb[1]
- ["attributes"]
- ["src"]
- )
- ),
+
+ $thumb = [
+ "url" => null,
+ "ratio" => null
+ ];
+ }else{
+
+ $thumb = [
+ "url" => $this->unshiturl($news["thumbnail"]["src"]),
"ratio" => "16:9"
];
}
- $out["news"][] = $data;
+ $out["news"][] = [
+ "title" => $news["title"],
+ "author" => null,
+ "description" => $news["description"],
+ "date" => !isset($news["age"]) || $news["age"] == "void 0" ? null : strtotime($news["age"]),
+ "thumb" => $thumb,
+ "url" => $news["url"]
+ ];
}
return $out;