summary | refs | log | tree | commit | diff
path: root/scraper
diff options
context:
space:
mode:
Diffstat (limited to 'scraper')
-rw-r--r-- scraper/brave.php 1861
1 files changed, 547 insertions, 1314 deletions
diff --git a/scraper/brave.php b/scraper/brave.php
index 0a73158..3a1c0ce 100644
--- a/scraper/brave.php
+++ b/scraper/brave.php
@@ -74,6 +74,13 @@ class brave{
"older" => [
"display" => "Older than",
"option" => "_DATE"
+ ],
+ "spellcheck" => [
+ "display" => "Spellcheck",
+ "option" => [
+ "no" => "No",
+ "yes" => "Yes"
+ ]
]
];
break;
@@ -203,7 +210,7 @@ class brave{
$q = json_decode($this->nextpage->get($get["npt"], "web"), true);
$search = $q["q"];
- $q["spellcheck"] = 0;
+ $q["spellcheck"] = "0";
$nsfw = $q["nsfw"];
unset($q["nsfw"]);
@@ -215,6 +222,7 @@ class brave{
// get _GET data instead
$search = $get["s"];
+
if(strlen($search) === 0){
throw new Exception("Search term is empty!");
@@ -229,6 +237,7 @@ class brave{
$country = $get["country"];
$older = $get["older"];
$newer = $get["newer"];
+ $spellcheck = $get["spellcheck"];
$q = [
"q" => $search
@@ -264,12 +273,18 @@ class brave{
$q["tf"] = "{$newer}to{$older}";
}
+
+ // spellcheck
+ if($spellcheck == "no"){
+
+ $q["spellcheck"] = "0";
+ }
}
- /*
+
$handle = fopen("scraper/brave.html", "r");
$html = fread($handle, filesize("scraper/brave.html"));
fclose($handle);
- */
+ /*
try{
$html =
$this->get(
@@ -283,7 +298,7 @@ class brave{
throw new Exception("Could not fetch search page");
}
-
+ */
$out = [
"status" => "ok",
"spelling" => [
@@ -308,1458 +323,510 @@ class brave{
*/
$nextpage =
$this->fuckhtml
- ->getElementsByClassName(
- "btn ml-15",
- "a"
- );
-
- if(count($nextpage) !== 0){
-
- preg_match(
- '/offset=([0-9]+)/',
- $this->fuckhtml->getTextContent($nextpage[0]["attributes"]["href"]),
- $nextpage
- );
-
- $q["offset"] = (int)$nextpage[1];
- $q["nsfw"] = $nsfw;
- $q["country"] = $country;
-
- $out["npt"] =
- $this->nextpage->store(
- json_encode($q),
- "web"
- );
- }
-
- /*
- Get discussions (and append them to web results)
- */
-
- // they're loaded using javascript!!
- $discussion =
- $this->fuckhtml
->getElementById(
- "js-discussions",
- "script"
+ "pagination",
+ "div"
);
- if(
- $discussion &&
- isset($discussion["attributes"]["data"])
- ){
+ if($nextpage){
- $discussion =
- json_decode(
- $this->fuckhtml
- ->getTextContent(
- $discussion["attributes"]["data"]
- ),
- true
- );
+ $this->fuckhtml->load($nextpage);
+
+ $nextpage =
+ $this->fuckhtml
+ ->getElementsByClassName("btn", "a");
- foreach($discussion["results"] as $result){
+ if(count($nextpage) !== 0){
- $data = [
- "title" => $this->titledots($result["title"]),
- "description" => null,
- "url" => $result["url"],
- "date" => null,
- "type" => "web",
- "thumb" => [
- "url" => null,
- "ratio" => null
- ],
- "sublink" => [],
- "table" => []
- ];
+ $nextpage =
+ $nextpage[count($nextpage) - 1];
- // description
- $data["description"] =
- $this->limitstrlen(
- $this->limitwhitespace(
- $this->titledots(
- $this->fuckhtml->getTextContent(
- $result["description"]
- )
- )
+ if(
+ strtolower(
+ $this->fuckhtml
+ ->getTextContent(
+ $nextpage
)
- );
-
- if($result["age"] != ""){
- $data["date"] = strtotime($result["age"]);
- }
-
- // populate table
-
- if($result["data"]["num_answers"] != ""){
- $data["table"]["Replies"] = (int)$result["data"]["num_answers"];
- }
-
- if($result["data"]["score"] != ""){
-
- $score = explode("|", $result["data"]["score"]);
+ ) == "next"
+ ){
- if(count($score) === 2){
-
- $score = ((int)$score[1]) . " (" . trim($score[0]) . ")";
- }else{
+ preg_match(
+ '/offset=([0-9]+)/',
+ $this->fuckhtml->getTextContent($nextpage["attributes"]["href"]),
+ $nextpage
+ );
- $score = (int)$score[0];
- }
-
- $data["table"]["Votes"] = $score;
- }
-
- if($result["thumbnail"] != ""){
+ $q["offset"] = (int)$nextpage[1];
+ $q["nsfw"] = $nsfw;
+ $q["country"] = $country;
- $data["thumb"]["url"] = $result["thumbnail"];
- $data["thumb"]["ratio"] = "16:9";
+ $out["npt"] =
+ $this->nextpage->store(
+ json_encode($q),
+ "web"
+ );
}
-
- $out["web"][] = $data;
}
}
- /*
- Get related searches
- */
- $faq =
+ $this->fuckhtml->load($html);
+
+ $script_disc =
$this->fuckhtml
- ->getElementById("js-faq", "script");
+ ->getElementsByTagName(
+ "script"
+ );
- if(
- $faq &&
- isset($faq["attributes"]["data"])
- ){
+ $grep = [];
+ foreach($script_disc as $discs){
- $faq =
- json_decode(
- $this->fuckhtml
- ->getTextContent(
- $faq["attributes"]["data"]
- ),
- true
- );
+ preg_match(
+ '/const data ?= ?(\[{.*}]);/',
+ $discs["innerHTML"],
+ $grep
+ );
- foreach($faq["items"] as $related){
+ if(isset($grep[1])){
- $out["related"][] = $related["question"];
+ break;
}
}
- /*
- Get spelling autocorrect
- */
- $altered =
- $this->fuckhtml
- ->getElementById("altered-query", "div");
-
- if($altered){
-
- $this->fuckhtml->load($altered);
-
- $altered =
- $this->fuckhtml
- ->getElementsByTagName("a");
+ if(!isset($grep[1])){
- if(count($altered) === 2){
-
- $out["spelling"] = [
- "type" => "including",
- "using" =>
- $this->fuckhtml
- ->getTextContent($altered[0]),
- "correction" =>
- $this->fuckhtml
- ->getTextContent($altered[1])
- ];
- }
-
- $this->fuckhtml->load($html);
+ throw new Exception("Could not get data JS");
}
+
+ $data =
+ $this->fuckhtml
+ ->parseJsObject(
+ $grep[1]
+ );
+ unset($grep);
+
+ $data = $data[1]["data"]["body"]["response"];
/*
Get web results
*/
- $resulthtml =
- $this->fuckhtml
- ->getElementById(
- "results",
- "div"
- );
-
- $this->fuckhtml->load($resulthtml);
- $items = 0;
- foreach(
- $this->fuckhtml
- ->getElementsByClassName("snippet fdb")
- as $result
- ){
+ if(!isset($data["web"]["results"])){
- $data = [
- "title" => null,
- "description" => null,
- "url" => null,
- "date" => null,
- "type" => "web",
- "thumb" => [
- "url" => null,
- "ratio" => null
- ],
- "sublink" => [],
- "table" => []
- ];
+ return $out;
+ }
+
+ //$has_so_answer = false;
+
+ foreach($data["web"]["results"] as $result){
- if(
- isset($result["attributes"]["data-type"]) &&
- $result["attributes"]["data-type"] == "ad"
- ){
+ if(isset($result["thumbnail"])){
+
+ $thumb = [
+ "ratio" => $result["thumbnail"]["logo"] == "false" ? "16:9" : "1:1",
+ "url" => $result["thumbnail"]["original"]
+ ];
+ }else{
- // is an ad, skip
- continue;
+ $thumb = [
+ "ratio" => null,
+ "url" => null
+ ];
}
- $this->fuckhtml->load($result);
-
- /*
- Get title
- */
- $title =
- $this->fuckhtml
- ->getElementsByClassName(
- "snippet-title",
- "span"
- );
-
- if(count($title) === 0){
+ $sublink = [];
+ if(isset($result["cluster"])){
- // encountered AI summarizer
- // or misspelling indicator @TODO
- continue;
+ foreach($result["cluster"] as $cluster){
+
+ $sublink[] = [
+ "title" => $this->titledots($cluster["title"]),
+ "description" =>
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $cluster["description"]
+ )
+ ),
+ "url" => $cluster["url"],
+ "date" => null
+ ];
+ }
}
- if(isset($title[0]["attributes"]["title"])){
+ // parse table elements
+ $table = [];
+
+ // product
+ $ref = null;
+
+ if(isset($result["product"])){
- $data["title"] =
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $title[0]["attributes"]["title"]
- )
- );
- }else{
+ $ref = &$result["product"];
+ }elseif(isset($result["creative_work"])){
- $data["title"] =
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $title[0]
- )
- );
+ $ref = &$result["creative_work"];
}
- /*
- Get description
- */
- $description =
- $this->fuckhtml
- ->getElementsByClassName(
- "snippet-description",
- "p"
- );
-
- if(count($description) !== 0){
- $data["description"] =
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $description[0]
- )
- );
-
- // also check for thumbnail in here
- $img =
- $this->fuckhtml
- ->getElementsByClassName(
- "thumb",
- "img"
- );
+ if($ref !== null){
- if(count($img) !== 0){
+ if(isset($ref["offers"])){
- $data["thumb"] = [
- "url" => $this->unshiturl($img[0]["attributes"]["src"]),
- "ratio" => "16:9"
- ];
- }else{
-
- // might be a video thumbnail wrapper?
- $wrapper =
- $this->fuckhtml
- ->getElementsByClassName(
- "video-thumb",
- "a"
- );
-
- if(count($wrapper) !== 0){
-
- // we found a video
- $this->fuckhtml->load($wrapper[0]);
+ foreach($ref["offers"] as $offer){
- $img =
- $this->fuckhtml
- ->getElementsByTagName("img");
+ $price = null;
- $data["thumb"] = [
- "url" => $this->unshiturl($img[0]["attributes"]["src"]),
- "ratio" => "16:9"
- ];
+ if(isset($offer["price"])){
+
+ if((float)$offer["price"] == 0){
+
+ $price = "Free";
+ }else{
+
+ $price = $offer["price"];
+ }
+ }
- // get the video length, if its there
- $duration =
- $this->fuckhtml
- ->getElementsByClassName(
- "duration",
- "div"
- );
+ if($price !== "Free"){
+ if(isset($offer["priceCurrency"])){
+
+ $price .= " " . $offer["priceCurrency"];
+ }
+ }
- if(count($duration) !== 0){
+ if($price !== null){
- $data["table"]["Duration"] = $duration[0]["innerHTML"];
+ $table["Price"] = trim($price);
}
-
- // reset html load
- $this->fuckhtml->load($result);
}
}
- }else{
-
- // is a steam/shop listing
- $description_alt =
- $this->fuckhtml
- ->getElementsByClassName(
- "text-sm",
- "div"
- );
-
- if(count($description_alt) !== 0){
+ if(isset($ref["rating"])){
- switch($description_alt[0]["attributes"]["class"]){
+ $rating = null;
+ if(isset($ref["rating"]["ratingValue"])){
+
+ $rating = $ref["rating"]["ratingValue"];
- case "text-sm text-gray":
- case "description text-sm":
+ if(isset($ref["rating"]["bestRating"])){
- $data["description"] =
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $description_alt[0]
- )
- );
- break;
+ $rating .= "/" . $ref["rating"]["bestRating"];
+ }
}
- // get table sublink
- $sublink =
- $this->fuckhtml
- ->getElementsByClassName(
- "r-attr text-sm",
- "div"
- );
-
- if(count($sublink) !== 0){
+ if(isset($ref["rating"]["reviewCount"])){
- $this->tablesublink($sublink, $data);
- }
-
- // check for thumb element
- $data["thumb"] = $this->getimagelinkfromstyle("thumb");
- }else{
-
- // ok... finally...
- // maybe its the instant answer thingy
- $answer =
- $this->fuckhtml
- ->getElementsByClassName("answer");
-
- if(count($answer) !== 0){
+ $isnull = $rating === null ? false : true;
- $data["description"] =
- $this->titledots(
- $this->fuckhtml
- ->getTextContent($answer[0])
- );
- }
- }
- }
-
- // finally, fix brave's date format sucking balls
- $data["description"] = explode(" - ", $data["description"], 2);
-
- if(count($data["description"]) === 0){
-
- // nothing to do
- $data["description"] = $data["description"][0];
- }else{
-
- // attempt to parse
- $time = strtotime($data["description"][0]);
-
- if($time !== false){
-
- // got response
- $data["date"] = $time;
-
- array_shift($data["description"]);
- }
-
- // merge back
- $data["description"] =
- implode(" - ", $data["description"]);
- }
-
- /*
- Check content type
- */
- $content_type =
- $this->fuckhtml
- ->getElementsByClassName(
- "content-type",
- "span"
- );
-
- if(count($content_type) !== 0){
-
- $data["type"] =
- strtolower($this->fuckhtml->getTextContent($content_type[0]));
- }
-
- /*
- Check subtext table thingy
- */
- $table_items =
- array_merge(
- $this->fuckhtml
- ->getElementsByClassName(
- "item-attributes",
- "div"
- ),
- $this->fuckhtml
- ->getElementsByClassName(
- "r",
- "div"
- )
- );
-
- /*
- DIV: item-attributes
- */
- if(count($table_items) !== 0){
-
- foreach($table_items as $table){
-
- $this->fuckhtml->load($table);
-
- $span =
- $this->fuckhtml
- ->getElementsByClassName(
- "text-sm",
- "*"
- );
-
- foreach($span as $item){
+ if($isnull){
+
+ $rating .= " (";
+ }
- $item =
- explode(
- ":",
- $this->fuckhtml->getTextContent(preg_replace('/\n/', " ", $item["innerHTML"])),
- 2
- );
+ $rating .= number_format($ref["rating"]["reviewCount"]) . " hits";
- if(count($item) === 2){
+ if($isnull){
- $data["table"][trim($item[0])] = trim($this->limitwhitespace($item[1]));
+ $rating .= ")";
}
}
+
+ if($rating !== null){
+
+ $table["Rating"] = $rating;
+ }
}
-
- $this->fuckhtml->load($result);
}
- // get video sublinks
- $table_items =
- $this->fuckhtml
- ->getElementsByClassName(
- "snippet-description published-time",
- "p"
- );
-
- if(count($table_items) !== 0){
+ // video
+ if(isset($result["video"])){
- $table_items =
- explode(
- '<span class="mr-15"></span>',
- $table_items[0]["innerHTML"],
- 2
- );
- if(count($table_items) === 2){
-
- $item2 = [];
-
- $item2[] = explode(":", $this->fuckhtml->getTextContent($table_items[0]));
+ foreach($result["video"] as $key => $value){
- if(trim($table_items[1]) != ""){
- $item2[] = explode(":", $this->fuckhtml->getTextContent($table_items[1]));
- }
-
- foreach($item2 as $it){
+ if(is_string($result["video"][$key]) === false){
- $data["table"][trim($it[0])] = trim($it[1]);
+ continue;
}
+
+ $table[ucfirst($key)] = $value;
}
}
/*
- Get URL
+ Get StackOverflow answers
*/
- $data["url"] =
- $this->fuckhtml->getTextContent(
- $this->fuckhtml
- ->getElementsByTagName("a")
- [0]
- ["attributes"]
- ["href"]
- );
-
- /*
- Get sublinks
- */
- $sublinks_elems =
- $this->fuckhtml
- ->getElementsByClassName(
- "snippet",
- "div"
- );
-
- $sublinks = [];
-
- foreach($sublinks_elems as $sublink){
-
- $this->fuckhtml->load($sublink);
-
- $a =
- $this->fuckhtml
- ->getElementsByTagName("a")[0];
-
- $title =
- $this->fuckhtml
- ->getTextContent($a);
-
- $url = $a["attributes"]["href"];
-
- $description =
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $this->fuckhtml
- ->getElementsByTagName("p")[0]
- )
- );
-
- $sublinks[] = [
- "title" => $title,
- "date" => null,
- "description" => $description,
- "url" => $url
- ];
- }
-
+ // commented out since it also returns alot of garbage
/*
- Get smaller sublinks
- */
- $sublinks_elems =
- $this->fuckhtml
- ->getElementsByClassName(
- "deep-link",
- "a"
- );
-
- foreach($sublinks_elems as $sublink){
-
- $sublinks[] = [
- "title" => $this->fuckhtml->getTextContent($sublink),
- "date" => null,
- "description" => null,
- "url" => $sublink["attributes"]["href"]
- ];
- }
-
- // append sublinks to $data !!
- $data["sublink"] = $sublinks;
-
- // append first result to start of $out["web"]
- // other results are after
- if($items === 0){
-
- $out["web"] = [$data, ...$out["web"]];
- }else{
-
- $out["web"][] = $data;
- }
- $items++;
- }
-
- /*
- Get news
- */
- $this->fuckhtml->load($resulthtml);
- $news_carousel = $this->fuckhtml->getElementById("news-carousel");
-
- $this->fuckhtml->load($news_carousel);
-
- if($news_carousel){
-
- $a =
- $this->fuckhtml
- ->getElementsByClassName(
- "card fdb",
- "a"
- );
-
- foreach($a as $news){
-
- $this->fuckhtml->load($news);
-
- $out["news"][] = [
- "title" =>
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $this->fuckhtml
- ->getElementsByClassName(
- "title",
- "div"
- )[0]
- )
- ),
- "description" => null,
- "date" =>
- strtotime(
- $this->fuckhtml
- ->getTextContent(
- $this->fuckhtml
- ->getElementsByClassName(
- "card-footer__timestamp",
- "span"
- )[0]
- )
- ),
- "thumb" => $this->getimagelinkfromstyle("img-bg"),
- "url" => $this->fuckhtml->getTextContent($news["attributes"]["href"])
- ];
- }
- }
-
-
-
- /*
- Get videos
- */
- $this->fuckhtml->load($resulthtml);
- $news_carousel = $this->fuckhtml->getElementById("video-carousel");
-
- $this->fuckhtml->load($news_carousel);
-
- if($news_carousel){
-
- $a =
- $this->fuckhtml
- ->getElementsByClassName(
- "card fdb",
- "a"
- );
-
- foreach($a as $video){
-
- $this->fuckhtml->load($video);
+ if(
+ $has_so_answer === false &&
+ isset($result["qa"])
+ ){
- $date = null;
+ $has_so_answer = true;
+ $answer = $this->stackoverflow_parse($result["qa"]["answer"]["text"]);
- $date_o =
- $this->fuckhtml
- ->getElementsByClassName(
- "text-gray text-xs",
- "span"
- );
-
- if(count($date_o) !== 0){
+ if(isset($result["qa"]["answer"]["author"])){
- $date =
- strtotime(
- $this->fuckhtml
- ->getTextContent(
- $date_o[0]
- )
- );
+ $answer[] = [
+ "type" => "quote",
+ "value" => "Answer from " . $result["qa"]["answer"]["author"]
+ ];
}
- $out["video"][] = [
+ $out["answer"][] = [
"title" =>
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $this->fuckhtml
- ->getElementsByClassName(
- "title",
- "div"
- )[0]
- )
+ $this->fuckhtml
+ ->getTextContent(
+ $result["qa"]["question"]
),
- "description" => null,
- "date" => $date,
- "duration" => null,
- "views" => null,
- "thumb" => $this->getimagelinkfromstyle("img-bg"),
- "url" => $this->fuckhtml->getTextContent($video["attributes"]["href"])
+ "description" => $answer,
+ "url" => $result["url"],
+ "thumb" => null,
+ "table" => [],
+ "sublink" => []
];
- }
+ }*/
+
+ $out["web"][] = [
+ "title" =>
+ $this->titledots(
+ $result["title"]
+ ),
+ "description" =>
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $result["description"]
+ )
+ ),
+ "url" => $result["url"],
+ "date" => isset($result["age"]) ? strtotime($result["age"]) : null,
+ "type" => "web",
+ "thumb" => $thumb,
+ "sublink" => $sublink,
+ "table" => $table
+ ];
}
-
/*
- Get DEFINITION snippet
+ Get spelling autocorrect
*/
- $this->fuckhtml->load($html);
- $infobox = $this->fuckhtml->getElementById("rh-definitions", "div");
-
- if($infobox !== false){
+ if(
+ isset($data["query"]["bo_altered_diff"][0][0]) &&
+ $data["query"]["bo_altered_diff"][0][0] == "true"
+ ){
- $answer = [
- "title" => null,
- "description" => [],
- "url" => null,
- "thumb" => null,
- "table" => [],
- "sublink" => []
+ $out["spelling"] = [
+ "type" => "including",
+ "using" => $data["query"]["bo_altered_diff"][0][1],
+ "correction" => $get["s"]
];
-
- $this->fuckhtml->load($infobox);
-
- $answer["title"] =
- $this->fuckhtml
- ->getTextContent(
- $this->fuckhtml
- ->getElementsByClassName(
- "header",
- "h5"
- )[0]
- );
-
- $sections =
- $this->fuckhtml
- ->getElementsByTagName("section");
-
- $i = -1;
- foreach($sections as $section){
-
- $this->fuckhtml->load($section);
- $items =
- $this->fuckhtml
- ->getElementsByTagName("*");
-
- $li = 1;
- $pronounce = false;
- foreach($items as $item){
-
- switch($item["tagName"]){
-
- case "h6":
-
- if(
- isset($item["attributes"]["class"]) &&
- $item["attributes"]["class"] == "h6 pronunciation"
- ){
-
- if($pronounce){
-
- break;
- }
-
- $answer["description"][] = [
- "type" => "quote",
- "value" =>
- $this->fuckhtml
- ->getTextContent(
- $item
- )
- ];
-
- $answer["description"][] =
- [
- "type" => "audio",
- "url" => "https://search.brave.com/api/rhfetch?rhtype=definitions&word={$answer["title"]}&source=ahd-5"
- ];
-
- $pronounce = true;
- $i = $i + 2;
- break;
- }
-
- $answer["description"][] = [
- "type" => "title",
- "value" =>
- $this->fuckhtml
- ->getTextContent(
- $item
- )
- ];
- $i++;
- break;
-
- case "li":
-
- if(
- $i !== -1 &&
- $answer["description"][$i]["type"] == "text"
- ){
-
- $answer["description"][$i]["value"] .=
- "\n" . $li . ". " .
- $this->fuckhtml
- ->getTextContent(
- $item
- );
-
- }else{
- $answer["description"][] = [
- "type" => "text",
- "value" =>
- $li . ". " .
- $this->fuckhtml
- ->getTextContent(
- $item
- )
- ];
- $i++;
- }
- $li++;
- break;
-
- case "a":
- $answer["url"] =
- $this->fuckhtml
- ->getTextContent(
- $item["attributes"]["href"]
- );
- break;
- }
- }
- }
-
- $out["answer"][] = $answer;
}
-
/*
- Get instant answer
+ Get wikipedia heads
*/
- $this->fuckhtml->load($html);
- $infobox = $this->fuckhtml->getElementById("infobox", "div");
-
- if($infobox !== false){
-
- $answer = [
- "title" => null,
- "description" => [],
- "url" => null,
- "thumb" => null,
- "table" => [],
- "sublink" => []
- ];
+ if(isset($data["infobox"]["results"][0])){
- $this->fuckhtml->load($infobox);
- $div = $this->fuckhtml->getElementsByTagName("div");
-
- /*
- Get small description
- */
- $small_desc =
- $this->fuckhtml
- ->getElementsByClassName(
- "infobox-description",
- $div
- );
-
- if(count($small_desc) !== 0){
-
- $answer["description"][] = [
- "type" => "quote",
- "value" =>
- $this->fuckhtml
- ->getTextContent(
- $small_desc[0]
- )
- ];
- }
-
- /*
- Get title + url
- */
- $title =
- $this->fuckhtml
- ->getElementsByClassName("infobox-title", "a");
-
- if(count($title) !== 0){
+ foreach($data["infobox"]["results"] as $info){
- $answer["title"] =
- $this->fuckhtml
- ->getTextContent(
- $title[0]
- );
-
- $answer["url"] =
- $this->fuckhtml
- ->getTextContent(
- $title[0]["attributes"]["href"]
- );
- }
-
- /*
- Get thumbnail
- */
- $thumb = $this->getimagelinkfromstyle("thumb");
-
- if($thumb["url"] !== null){
-
- $answer["thumb"] = $thumb["url"];
- }
-
- /*
- Get table
- */
- $title =
- $this->fuckhtml
- ->getElementsByClassName(
- "infobox-attr-header",
- "div"
- );
-
- $rowhtml = $infobox;
-
- if(count($title) >= 2){
-
- $rowhtml =
- explode(
- $title[1]["outerHTML"],
- $infobox["innerHTML"],
- 2
- )[0];
- }
-
- $this->fuckhtml->load($rowhtml);
-
- $rows =
- $this->fuckhtml
- ->getElementsByClassName("infobox-attr", "div");
-
- foreach($rows as $row){
-
- if(!isset($row["innerHTML"])){
+ if($info["subtype"] == "code"){
- continue;
- }
-
- $this->fuckhtml->load($row);
- $span =
- $this->fuckhtml
- ->getElementsByTagName("span");
-
- if(count($span) === 2){
+ $description =
+ $this->stackoverflow_parse($info["data"]["answer"]["text"]);
- $answer["table"][
- $this->fuckhtml->getTextContent($span[0])
- ] = str_replace("\n", ", ", $this->fuckhtml->getTextContent($span[1], true));
- }
- }
-
- $this->fuckhtml->load($infobox);
-
- /*
- Parse stackoverflow answers
- */
- $code =
- $this->fuckhtml
- ->getElementById("codebox-answer", $div);
-
- if($code){
-
- // this might be standalone text with no paragraphs, check for that
- $author =
- $this->fuckhtml
- ->getElementById("author");
-
- $desc_tmp =
- str_replace(
- $author["outerHTML"],
- "",
- $code["innerHTML"]
- );
-
- $this->fuckhtml->load($desc_tmp);
- $code =
- $this->fuckhtml
- ->getElementsByTagName("*");
-
- if(count($code) === 0){
-
- $answer["description"][] = [
- "type" => "text",
- "value" =>
- $this->fuckhtml
- ->getTextContent(
- $desc_tmp
- )
- ];
-
- $answer["description"][] = [
- "type" => "quote",
- "value" =>
- $this->fuckhtml
- ->getTextContent(
- $author
- )
- ];
- }else{
-
- $i = 0;
-
- foreach($code as $snippet){
+ if(isset($info["data"]["answer"]["author"])){
- switch($snippet["tagName"]){
-
- case "p":
- $this->fuckhtml->load($snippet["innerHTML"]);
-
- $codetags =
- $this->fuckhtml
- ->getElementsByTagName("*");
-
- $tmphtml = $snippet["innerHTML"];
-
- foreach($codetags as $tag){
-
- if(!isset($tag["outerHTML"])){
-
- continue;
- }
-
- $tmphtml =
- explode(
- $tag["outerHTML"],
- $tmphtml,
- 2
- );
-
- $value = $this->fuckhtml->getTextContent($tmphtml[0], false, false);
- $this->appendtext($value, $answer["description"], $i);
-
- $type = null;
- switch($tag["tagName"]){
-
- case "code": $type = "inline_code"; break;
- case "em": $type = "italic"; break;
- case "blockquote": $type = "quote"; break;
- default: $type = "text";
- }
-
- if($type !== null){
- $value = $this->fuckhtml->getTextContent($tag, false, true);
-
- if(trim($value) != ""){
-
- if(
- $i !== 0 &&
- $type == "title"
- ){
-
- $answer["description"][$i - 1]["value"] = rtrim($answer["description"][$i - 1]["value"]);
- }
-
- $answer["description"][] = [
- "type" => $type,
- "value" => $value
- ];
- $i++;
- }
- }
-
- if(count($tmphtml) === 2){
-
- $tmphtml = $tmphtml[1];
- }else{
-
- break;
- }
- }
-
- if(is_array($tmphtml)){
-
- $tmphtml = $tmphtml[0];
- }
-
- if(strlen($tmphtml) !== 0){
-
- $value = $this->fuckhtml->getTextContent($tmphtml, false, false);
- $this->appendtext($value, $answer["description"], $i);
- }
- break;
-
- case "pre":
-
- switch($answer["description"][$i - 1]["type"]){
-
- case "text":
- case "italic":
- $answer["description"][$i - 1]["value"] = rtrim($answer["description"][$i - 1]["value"]);
- break;
- }
-
- $answer["description"][] =
- [
- "type" => "code",
- "value" =>
- rtrim(
- $this->fuckhtml
- ->getTextContent(
- $snippet,
- true,
- false
- )
- )
- ];
- $i++;
-
- break;
-
- case "ol":
- $o = 0;
-
- $this->fuckhtml->load($snippet);
- $li =
- $this->fuckhtml
- ->getElementsByTagName("li");
-
- foreach($li as $elem){
- $o++;
-
- $this->appendtext(
- $o . ". " .
- $this->fuckhtml
- ->getTextContent(
- $elem
- ),
- $answer["description"],
- $i
- );
- }
- break;
- }
+ $description[] = [
+ "type" => "quote",
+ "value" => "Answer from " . $info["data"]["answer"]["author"]
+ ];
}
+ }else{
+
+ $description = [];
if(
- $i !== 0 &&
- $answer["description"][$i - 1]["type"] == "text"
+ isset($info["description"]) &&
+ $info["description"] != ""
){
-
- $answer["description"][$i - 1]["value"] = rtrim($answer["description"][$i - 1]["value"]);
- }
-
- if($author){
-
- $answer["description"][] = [
+ $description[] = [
"type" => "quote",
- "value" => $this->fuckhtml->getTextContent($author)
+ "value" => $info["description"]
];
}
- }
- }else{
-
- /*
- Get normal description
- */
- $description =
- $this->fuckhtml
- ->getElementsByClassName(
- "mb-6",
- "div"
- );
-
- if(count($description) !== 0){
- $answer["description"][] =
- [
+ if(
+ isset($info["long_desc"]) &&
+ $info["long_desc"] != ""
+ ){
+ $description[] = [
"type" => "text",
- "value" =>
- $this->titledots(
- preg_replace(
- '/ Wikipedia$/',
- "",
- $this->fuckhtml
- ->getTextContent(
- $description[0]
- )
- )
- )
+ "value" => $this->titledots($info["long_desc"])
];
+ }
+ }
+
+ $table = [];
+ if(isset($info["attributes"])){
- $ratings =
- $this->fuckhtml
- ->getElementById("ratings");
-
- if($ratings){
-
- $this->fuckhtml->load($ratings);
-
- $ratings =
- $this->fuckhtml
- ->getElementsByClassName(
- "flex-hcenter mb-10",
- "div"
- );
+ foreach($info["attributes"] as $row){
- $answer["description"][] = [
- "type" => "title",
- "value" => "Ratings"
- ];
-
- foreach($ratings as $rating){
-
- $this->fuckhtml->load($rating);
-
- $num =
- $this->fuckhtml
- ->getTextContent(
- $this->fuckhtml
- ->getElementsByClassName(
- "r-num",
- "div"
- )[0]
- );
-
- $href =
- $this->fuckhtml
- ->getElementsByClassName(
- "mr-10",
- "a"
- )[0];
-
- $votes =
- $this->fuckhtml
- ->getTextContent(
- $this->fuckhtml
- ->getElementsByClassName(
- "text-sm",
- "span"
- )[0]
- );
-
- $c = count($answer["description"]) - 1;
+ if(
+ $row[1] == "null" &&
+ count($table) !== 0
+ ){
- if(
- $c !== -1 &&
- $answer["description"][$c]["type"] == "text"
- ){
-
- $answer["description"][$c]["value"] .= $num . " ";
- }else{
-
- $answer["description"][] = [
- "type" => "text",
- "value" => $num . " "
- ];
- }
-
- $answer["description"][] = [
- "type" => "link",
- "value" => $this->fuckhtml->getTextContent($href),
- "url" => $this->fuckhtml->getTextContent($href["attributes"]["href"])
- ];
+ break;
+ }
+
+ if($row[1] == "null"){
- $answer["description"][] = [
- "type" => "text",
- "value" => " (" . $votes . ")\n"
- ];
+ continue;
}
+
+ $table[
+ $this->fuckhtml->getTextContent($row[0])
+ ] =
+ $this->fuckhtml->getTextContent($row[1]);
}
}
- }
-
- /*
- Get sublinks
- */
- $this->fuckhtml->load($infobox);
-
- $profiles =
- $this->fuckhtml
- ->getElementById("profiles");
-
- if($profiles){
- $profiles =
- $this->fuckhtml
- ->getElementsByClassName(
- "chip",
- "a"
- );
- foreach($profiles as $profile){
-
- $name = $this->fuckhtml->getTextContent($profile["attributes"]["title"]);
+ $sublink = [];
+ if(isset($info["profiles"])){
- if(strtolower($name) == "steampowered"){
+ foreach($info["profiles"] as $row){
+
+ $name = $this->fuckhtml->getTextContent($row["name"]);
+
+ if(strtolower($name) == "steampowered"){
+
+ $name = "Steam";
+ }
- $name = "Steam";
+ $sublink[
+ $this->fuckhtml->getTextContent($name)
+ ] =
+ $this->fuckhtml->getTextContent($row["url"]);
}
-
- $answer["sublink"][$name] =
- $this->fuckhtml->getTextContent($profile["attributes"]["href"]);
}
- }
-
- $actors =
- $this->fuckhtml
- ->getElementById("panel-movie-cast");
-
- if($actors){
- $this->fuckhtml->load($actors);
+ $out["answer"][] = [
+ "title" => $this->fuckhtml->getTextContent($info["title"]),
+ "description" => $description,
+ "url" => $info["url"],
+ "thumb" => isset($info["images"][0]["original"]) ? $info["images"][0]["original"] : null,
+ "table" => $table,
+ "sublink" => $sublink
+ ];
- $actors =
- $this->fuckhtml
- ->getElementsByClassName("card");
+ break; // only iterate once, we get garbage most of the time
+ }
+ }
+
+ /*
+ Get videos
+ */
+ if(isset($data["videos"]["results"])){
+
+ foreach($data["videos"]["results"] as $video){
- $answer["description"][] = [
- "type" => "title",
- "value" => "Cast"
+ $out["video"][] = [
+ "title" => $this->titledots($video["title"]),
+ "description" => $this->titledots($video["description"]),
+ "date" => isset($video["age"]) ? strtotime($video["age"]) : null,
+ "duration" => isset($video["video"]["duration"]) ? $this->hms2int($video["video"]["duration"]) : null,
+ "views" => null,
+ "thumb" =>
+ isset($video["thumbnail"]["src"]) ?
+ [
+ "ratio" => "16:9",
+ "url" => $this->unshiturl($video["thumbnail"]["src"])
+ ] :
+ [
+ "ratio" => null,
+ "url" => null
+ ],
+ "url" => $video["url"]
];
-
- foreach($actors as $actor){
-
- $this->fuckhtml->load($actor);
-
- $answer["description"][] = [
- "type" => "text",
- "value" =>
- $this->fuckhtml
- ->getTextContent(
- $this->fuckhtml
- ->getElementsByClassName("card-body")
- [0]
- )
- ];
-
- $answer["description"][] = [
- "type" => "image",
- "url" => $this->getimagelinkfromstyle("person-thumb")["url"]
- ];
- }
}
+ }
+
+ /*
+ Get news
+ */
+ if(isset($data["news"]["results"])){
- $out["answer"][] = $answer;
+ foreach($data["news"]["results"] as $news){
+
+ $out["news"][] = [
+ "title" => $this->titledots($news["title"]),
+ "description" => $this->titledots($news["description"]),
+ "date" => isset($news["age"]) ? strtotime($news["age"]) : null,
+ "thumb" =>
+ isset($video["thumbnail"]["src"]) ?
+ [
+ "ratio" => "16:9",
+ "url" => $this->unshiturl($video["thumbnail"]["src"])
+ ] :
+ [
+ "ratio" => null,
+ "url" => null
+ ],
+ "url" => $news["url"]
+ ];
+ }
}
/*
- Get actor standalone thingy
+ Get discussions
*/
- $this->fuckhtml->load($resulthtml);
- $actors =
- $this->fuckhtml
- ->getElementById("predicate-entity");
+ $disc_out = [];
- if($actors){
-
- $this->fuckhtml->load($actors);
+ if(isset($data["discussions"]["results"])){
- $cards =
- $this->fuckhtml
- ->getElementsByClassName("card");
-
- $url =
- $this->fuckhtml
- ->getElementsByClassName(
- "disclaimer",
- "div"
- )[0];
-
- $this->fuckhtml->load($url);
-
- $url =
- $this->fuckhtml
- ->getTextContent(
- $this->fuckhtml
- ->getElementsByTagName("a")
- [0]
- ["attributes"]
- ["href"]
- );
-
- $this->fuckhtml->load($actors);
-
- $answer = [
- "title" =>
- $this->fuckhtml
- ->getTextContent(
- $this->fuckhtml
- ->getElementsByClassName(
- "entity",
- "span"
- )[0]
- ) . " (Cast)",
- "description" => [],
- "url" => $url,
- "sublink" => [],
- "thumb" => null,
- "table" => []
- ];
-
- foreach($cards as $card){
+ foreach($data["discussions"]["results"] as $disc){
- $this->fuckhtml->load($card);
+ $table = [];
- $answer["description"][] = [
- "type" => "title",
- "value" =>
- $this->fuckhtml
- ->getTextContent(
- $this->fuckhtml
- ->getElementsByClassName(
- "title"
- )[0]
- )
- ];
+ if(isset($disc["data"]["num_votes"])){
+
+ $table["Votes"] = (int)$disc["data"]["num_votes"];
+ }
- $answer["description"][] = [
- "type" => "text",
- "value" =>
- $this->fuckhtml
- ->getTextContent(
- $this->fuckhtml
- ->getElementsByClassName(
- "text-xs desc"
- )[0]
- )
- ];
+ if(isset($disc["data"]["num_answers"])){
+
+ $table["Comments"] = (int)$disc["data"]["num_answers"];
+ }
- $answer["description"][] = [
- "type" => "image",
- "url" => $this->getimagelinkfromstyle("img-bg")["url"]
+ $disc_out[] = [
+ "title" =>
+ $this->titledots(
+ $disc["title"]
+ ),
+ "description" =>
+ $this->limitstrlen(
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $disc["description"]
+ )
+ )
+ ),
+ "url" => $disc["url"],
+ "date" => isset($disc["age"]) ? strtotime($disc["age"]) : null,
+ "type" => "web",
+ "thumb" => [
+ "ratio" => null,
+ "url" => null
+ ],
+ "sublink" => [],
+ "table" => $table
];
}
-
- $out["answer"][] = $answer;
}
+ // append discussions at position 2
+ array_splice($out["web"], 1, 0, $disc_out);
+
return $out;
}
@@ -2124,6 +1191,172 @@ class brave{
return $out;
}
+	/**
+	 * Convert the HTML body of a StackOverflow-style answer into a flat
+	 * list of typed description nodes.
+	 *
+	 * Every element returned by getElementsByTagName("*") is visited, but
+	 * only <p>, <img>, <pre> and <ol> produce output here; any other tag
+	 * is ignored by the outer switch.
+	 *
+	 * @param mixed $html Answer markup, handed straight to fuckhtml->load()
+	 *                    (callers pass the "text" field of an answer)
+	 * @return array List of nodes: ["type" => ..., "value" => ...] for
+	 *               text-like nodes, ["type" => "image", "url" => ...]
+	 *               for images
+	 */
+	private function stackoverflow_parse($html){
+		
+		// $i counts emitted nodes; $answer[$i - 1] is treated as the last node.
+		// NOTE(review): appendtext() is defined elsewhere and receives both
+		// $answer and $i -- presumably by reference, keeping them in step.
+		$i = 0;
+		$answer = [];
+		
+		$this->fuckhtml->load($html);
+		
+		foreach(
+			$this->fuckhtml->getElementsByTagName("*")
+			as $snippet
+		){
+			
+			switch($snippet["tagName"]){
+				
+				case "p":
+					$this->fuckhtml->load($snippet["innerHTML"]);
+					
+					$codetags =
+						$this->fuckhtml
+						->getElementsByTagName("*");
+					
+					// remaining, not yet consumed markup of this paragraph
+					$tmphtml = $snippet["innerHTML"];
+					
+					foreach($codetags as $tag){
+						
+						if(!isset($tag["outerHTML"])){
+							
+							continue;
+						}
+						
+						// split into [text before the tag, markup after the tag]
+						$tmphtml =
+							explode(
+								$tag["outerHTML"],
+								$tmphtml,
+								2
+							);
+						
+						// plain text that precedes the tag
+						$value = $this->fuckhtml->getTextContent($tmphtml[0], false, false);
+						$this->appendtext($value, $answer, $i);
+						
+						switch($tag["tagName"]){
+							
+							case "code": $type = "inline_code"; break;
+							case "em": $type = "italic"; break;
+							case "blockquote": $type = "quote"; break;
+							default: $type = "text";
+						}
+						
+						// the tag itself becomes its own node, unless it is blank
+						$value = $this->fuckhtml->getTextContent($tag, false, true);
+						
+						if(trim($value) != ""){
+							
+							$answer[] = [
+								"type" => $type,
+								"value" => $value
+							];
+							$i++;
+						}
+						
+						if(count($tmphtml) === 2){
+							
+							$tmphtml = $tmphtml[1];
+						}else{
+							
+							// tag not found in the remaining markup: stop scanning
+							break;
+						}
+					}
+					
+					// after an early break $tmphtml is still the explode() result
+					if(is_array($tmphtml)){
+						
+						$tmphtml = $tmphtml[0];
+					}
+					
+					// trailing text after the last inline tag
+					if(strlen($tmphtml) !== 0){
+						
+						$value = $this->fuckhtml->getTextContent($tmphtml, false, false);
+						$this->appendtext($value, $answer, $i);
+					}
+					break;
+				
+				case "img":
+					// read the source from the element being visited; $tag is
+					// only a leftover from the <p> branch and must not be used
+					if(isset($snippet["attributes"]["src"])){
+						
+						$answer[] = [
+							"type" => "image",
+							"url" =>
+								$this->fuckhtml
+								->getTextContent(
+									$snippet["attributes"]["src"]
+								)
+						];
+						$i++;
+					}
+					break;
+				
+				case "pre":
+					
+					// strip trailing whitespace from the preceding text node;
+					// guarded because the answer may start with a code block
+					if(
+						$i !== 0 &&
+						isset($answer[$i - 1]["type"]) &&
+						(
+							$answer[$i - 1]["type"] == "text" ||
+							$answer[$i - 1]["type"] == "italic"
+						)
+					){
+						
+						$answer[$i - 1]["value"] = rtrim($answer[$i - 1]["value"]);
+					}
+					
+					$answer[] =
+						[
+							"type" => "code",
+							"value" =>
+								rtrim(
+									$this->fuckhtml
+									->getTextContent(
+										$snippet,
+										true,
+										false
+									)
+								)
+						];
+					$i++;
+					
+					break;
+				
+				case "ol":
+					// list position, rendered as "1. ", "2. ", ...
+					$o = 0;
+					
+					$this->fuckhtml->load($snippet);
+					$li =
+						$this->fuckhtml
+						->getElementsByTagName("li");
+					
+					foreach($li as $elem){
+						$o++;
+						
+						$this->appendtext(
+							$o . ". " .
+							$this->fuckhtml
+							->getTextContent(
+								$elem
+							),
+							$answer,
+							$i
+						);
+					}
+					break;
+			}
+		}
+		
+		// drop trailing whitespace from a final text node
+		if(
+			$i !== 0 &&
+			isset($answer[$i - 1]["type"]) &&
+			$answer[$i - 1]["type"] == "text"
+		){
+			
+			$answer[$i - 1]["value"] = rtrim($answer[$i - 1]["value"]);
+		}
+		
+		return $answer;
+	}
+
private function hms2int($time){
$parts = explode(":", $time, 3);