summary refs log tree commit diff
path: root/scraper
diff options
context:
space:
mode:
Diffstat (limited to 'scraper')
-rw-r--r-- scraper/brave.php 349
-rw-r--r-- scraper/google.php 486
-rw-r--r-- scraper/mojeek.php 32
3 files changed, 611 insertions, 256 deletions
diff --git a/scraper/brave.php b/scraper/brave.php
index 4d48c33..c598c80 100644
--- a/scraper/brave.php
+++ b/scraper/brave.php
@@ -86,6 +86,8 @@ class brave{
];
break;
+ case "images":
+ case "videos":
case "news":
return [
"country" => [
@@ -143,7 +145,7 @@ class brave{
}
}
- private function get($url, $get = [], $nsfw, $country/*, $is_post = false, $additional_cookies = null*/){
+ private function get($url, $get = [], $nsfw, $country){
switch($nsfw){
@@ -152,13 +154,6 @@ class brave{
case "no": $nsfw = "strict"; break;
}
- //$cookie = "safesearch={$nsfw}; country={$country}; useLocation=0";
- /*
- if($additional_cookies !== null){
-
- $cookie = $additional_cookies . "; " . $cookie;
- }*/
-
$headers = [
"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0",
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
@@ -171,8 +166,7 @@ class brave{
"Sec-Fetch-Dest: document",
"Sec-Fetch-Mode: navigate",
"Sec-Fetch-Site: none",
- "Sec-Fetch-User: ?1"//,
- //"Content-Type: application/json"
+ "Sec-Fetch-User: ?1"
];
if($country == "any"){
@@ -182,22 +176,10 @@ class brave{
$curlproc = curl_init();
- /*if($is_post){
-
- curl_setopt($curlproc, CURLOPT_POST, true);
- curl_setopt(
- $curlproc,
- CURLOPT_POSTFIELDS,
- json_encode($get)
- );
-
- }else{
- */
- if($get !== []){
- $get = http_build_query($get);
- $url .= "?" . $get;
- }
- //}
+ if($get !== []){
+ $get = http_build_query($get);
+ $url .= "?" . $get;
+ }
curl_setopt($curlproc, CURLOPT_URL, $url);
@@ -1950,18 +1932,24 @@ class brave{
return $out;
}
- /*
- public function bypasscaptcha($html, $nsfw, $country){
+ public function image($get){
- // @TODO figure out why I still cant go trough
- // the captcha wall even after breaking it
+ $search = $get["s"];
+ $country = $get["country"];
+ $nsfw = $get["nsfw"];
+
+ $out = [
+ "status" => "ok",
+ "npt" => null,
+ "image" => []
+ ];
try{
$html =
$this->get(
- "https://search.brave.com/goggles",
+ "https://search.brave.com/images",
[
- "q" => "site:dailymotion.com my bloody valentine"
+ "q" => $search
],
$nsfw,
$country
@@ -1969,177 +1957,192 @@ class brave{
}catch(Exception $error){
- throw new Exception("Could not fetch html");
+ throw new Exception("Could not fetch search page");
}
+ /*
+ $handle = fopen("scraper/brave-image.html", "r");
+ $html = fread($handle, filesize("scraper/brave-image.html"));
+ fclose($handle);*/
- // Bypass brave search captcha
- // this captcha only appears on the goggles page
preg_match(
- '/this\.img\.src = "(.*)"/',
+ '/const data = (\[{.*}\]);/',
$html,
- $image
+ $json
);
- $image =
- base64_decode(
- explode(
- "data:image/png;base64,",
- $image[1]
- )[1]
- );
-
- $im = new Imagick();
- $im->readImageBlob($image);
-
- $im->blurImage(20, 20);
- $im->posterizeImage(2, imagick::IMGTYPE_COLORSEPARATION);
+ if(!isset($json[1])){
+
+ throw new Exception("Failed to get data object");
+ }
- // if we encounter a white line thats longer than 45px
- // we found the circle position
- $iterator = $im->getPixelRegionIterator(0, 77, 310, 1);
+ $json =
+ $this->fuckhtml
+ ->parseJsObject(
+ $json[1]
+ );
- $found = null;
foreach(
- $iterator as $row
+ $json[1]
+ ["data"]
+ ["body"]
+ ["response"]
+ ["results"]
+ as $result
){
- $whitecount = 0;
- $count = 0;
-
- foreach($row as $pixel){
-
- if($pixel->getColor()["r"] === 255){
-
- $whitecount++;
- $pixel->setColor("rgba(255,0,0,0)");
-
- if($whitecount === 45){
-
- $found = $count - 45;
- break 2;
- }
- }else{
-
- $whitecount = 0;
- }
-
- $count++;
- $iterator->syncIterator();
- }
+ $out["image"][] = [
+ "title" => $result["title"],
+ "source" => [
+ [
+ "url" => $result["properties"]["url"],
+ "width" => null,
+ "height" => null
+ ],
+ [
+ "url" => $result["thumbnail"]["src"],
+ "width" => null,
+ "height" => null
+ ]
+ ],
+ "url" => $result["url"]
+ ];
}
- $found = $found + 10;
+ return $out;
+ }
+
+ public function video($get){
- //header("Content-Type: image/png");
- //echo $im;
- //die();
+ $search = $get["s"];
+ $country = $get["country"];
+ $nsfw = $get["nsfw"];
+
+ $out = [
+ "status" => "ok",
+ "npt" => null,
+ "video" => [],
+ "author" => [],
+ "livestream" => [],
+ "playlist" => [],
+ "reel" => []
+ ];
- if($found === null){
+ try{
+ $html =
+ $this->get(
+ "https://search.brave.com/videos",
+ [
+ "q" => $search
+ ],
+ $nsfw,
+ $country
+ );
+
+ }catch(Exception $error){
- throw new Exception("Could not bypass captcha");
+ throw new Exception("Could not fetch search page");
}
+ /*
+ $handle = fopen("scraper/brave-video.html", "r");
+ $html = fread($handle, filesize("scraper/brave-video.html"));
+ fclose($handle);*/
preg_match(
- '/data="{"captcha_id":"([0-9A-z-]+)"}"/',
+ '/const data = (\[{.*}\]);/',
$html,
- $key
+ $json
);
- $key = $key[1];
- // we bypassed captcha, send POST data
- $order =
- $this->get(
- "https://search.brave.com/api/captcha?brave=0&captcha_id={$key}",
- [
- "solution" => (string)$found
- ],
- $nsfw,
- $country,
- true
- );
-
- $order = json_decode($order, true)["orderId"];
+ if(!isset($json[1])){
+
+ throw new Exception("Failed to get data object");
+ }
- $orderpayload =
- $this->get(
- "https://search.brave.com/api/rewards/v1/orders/{$order}",
- [],
- $nsfw,
- $country
+ $json =
+ $this->fuckhtml
+ ->parseJsObject(
+ $json[1]
);
- $orderpayload = json_decode($orderpayload, true);
-
- $creds =
- $this->get(
- "https://search.brave.com/api/rewards/v1/orders/{$order}/credentials",
- [
- "itemId" => $orderpayload["items"][0]["id"],
- "blindedCreds" => [
- "fuYAVcB/m7BU66vf3wkNGxJCSaRhshB9o+8km3F1h2c=",
- "uswvcWJuPK/1qFlVdzBP3eQd0+V1EQgfAtnEoMIK+Uk=",
- "fJWKGLBxl3Gyn4n9FjTLq1PjupfABT7Ni8MeB+iGzUs=",
- "Aq9enJ/VZP9GxQIza3n65ZK7xQhY4VwDxv53BCb/Txg=",
- "FMJA9eSLHq71K+Pcwgm4gIQOmdR/6KMy5cMgXhpd5Ro=",
- "2NVhIAbvI317SP9/xXbVe/U57eWgvHyqVbHL/5+Gdmw=",
- "6mpjsjSCmYEzK2xlbL8DI2P4LuhWUOxjTLvsTAL9l24=",
- "kAn4wuHvIlKWhfuFfPTSfD4tZ5le9t7/61YbdEc/L3k=",
- "BjjUyG16aTfd1c0h4oBzgQQOekrH1f+a5CmcXqMPTR4=",
- "SBNgpCt4/V44yaQTfh+D027Yv1GJFHkjUEpPw6rAwRI=",
- "XDENAtdQ7PyYx+Qx1wQGQtDWgg8WpIMgWGmd4RDOVWE=",
- "tF7rB4sqamsiUk3K7fojdQSI0Q6iip72yKyhnvg/bC0=",
- "VsAqflirAd/u4VsLdfRS2UvnH24ZNkFh6YN3DctLjzQ=",
- "MntLbXkoI0LdcisCbNazmooiHXJyX91L1KERDAu1JRU=",
- "TH6Zs8JBvFDbTDWgKbfGE4M5/cSwCtHD8ms5Y/U8zHQ=",
- "jsZg0Z+qDPHymrbhdnesodhLNJ26QdunyMko1aVe4So=",
- "rpKsyj6/vdnuMgLI2BApeijtGq9g5USRDL0w6X2bnlQ=",
- "vCzliGT8A9vcLXj2sFf2kavOuYw69d70NpfgA22B4lI=",
- "7OWoxSCtYXWcaBSifF7AXNBif/sjcuO0IelzXG/3PFk=",
- "iiXtByNlT6nDMN9De5B58Jl8J0p6LCjnZ9aS3w2FEQU=",
- "zDhd7gsJ4h4JkDeGK0Y0mfFd8IBdkLhMOANzwO+4Dig=",
- "qANZ+AikwFReEA61JF009d/c3IHM/aSfIYwljckhJWE=",
- "nNC30pDLxtXvUr+WDwfDSrAInNBpfSZkPsV2JlpheWI=",
- "kGXE1pkt25P71kdJzmKIg4+yMR1VA5wNmbpBb/FhJQ8=",
- "aLqPsY1Qiz2UCa2Jx3YNNt8r4JINMphks/43EiyZfXU=",
- "bHGYZoQARZEM5LdFF6B74PkRqNd9EKxzuTvGYxjq+hk=",
- "JOsYQjfE/9Y1u29hR+GvEkNyxUI8blgLhX1iJI/aGRQ=",
- "yKjHjH5j600TJD/3WPsA1N3OmItDLifdjlysq4H6NV0=",
- "9lTnUbsPp7BJ7XVN5/T4yGfzD9DJdqWB7xk72s19MAA=",
- "5KHG8iY45em7zDhO/HlI0ydcZ0Ubn+XSyjifMmy7qXM="
- ]
- ],
- $nsfw,
- $country,
- true
- );
+ foreach(
+ $json
+ [1]
+ ["data"]
+ ["body"]
+ ["response"]
+ ["results"]
+ as $result
+ ){
+
+ if($result["video"]["author"] != "null"){
+
+ $author = [
+ "name" => $result["video"]["author"]["name"] == "null" ? null : $result["video"]["author"]["name"],
+ "url" => $result["video"]["author"]["url"] == "null" ? null : $result["video"]["author"]["url"],
+ "avatar" => $result["video"]["author"]["img"] == "null" ? null : $result["video"]["author"]["img"]
+ ];
+ }else{
+
+ $author = [
+ "name" => null,
+ "url" => null,
+ "avatar" => null
+ ];
+ }
+
+ if($result["thumbnail"] != "null"){
+
+ $thumb = [
+ "url" => $result["thumbnail"]["original"],
+ "ratio" => "16:9"
+ ];
+ }else{
+
+ $thumb = [
+ "url" => null,
+ "ratio" => null
+ ];
+ }
+
+ $out["video"][] = [
+ "title" => $result["title"],
+ "description" => $result["description"] == "null" ? null : $this->titledots($result["description"]),
+ "author" => $author,
+ "date" => $result["age"] == "null" ? null : strtotime($result["age"]),
+ "duration" => $result["video"]["duration"] == "null" ? null : $this->hms2int($result["video"]["duration"]),
+ "views" => $result["video"]["views"] == "null" ? null : (int)$result["video"]["views"],
+ "thumb" => $thumb,
+ "url" => $result["url"]
+ ];
+ }
+
+ return $out;
+ }
+
+ private function hms2int($time){
- var_dump($creds);
+ $parts = explode(":", $time, 3);
+ $time = 0;
- sleep(2);
- $test =
- $this->get(
- "https://search.brave.com/api/rewards/v1/orders/{$order}/credentials",
- [],
- $nsfw,
- $country
- );
+ if(count($parts) === 3){
+
+ // hours
+ $time = $time + ((int)$parts[0] * 3600);
+ array_shift($parts);
+ }
- var_dump($test);
+ if(count($parts) === 2){
+
+ // minutes
+ $time = $time + ((int)$parts[0] * 60);
+ array_shift($parts);
+ }
- $html =
- $this->get(
- "https://search.brave.com/goggles",
- [
- "q" => "site:dailymotion.com my bloody valentine"
- ],
- $nsfw,
- $country,
- false,
- "__Secure-sku#brave-search-captcha=eyJ0eXBlIjoic2luZ2xlLXVzZSIsInZlcnNpb24iOjEsInNrdSI6ImJyYXZlLXNlYXJjaC1jYXB0Y2hhIiwicHJlc2VudGF0aW9uIjoiZXlKcGMzTjFaWElpT2lKaWNtRjJaUzVqYjIwL2MydDFQV0p5WVhabExYTmxZWEpqYUMxallYQjBZMmhoSWl3aWMybG5ibUYwZFhKbElqb2lNRzl0VDBneWQxZ3dTazkzU0VFMVJ6QTJaR1V5WjFOQ1dDdGhSM3B2Y2xsTVQwVTJZVVJtTUc5a1IweG1Wa3RhZEd0cU4xbHdia3BPT0VOVGNGbE5lVWR2YmpGRlNTOUhhMlZYU1RWNGQxTjJPWGxJTTNjOVBTSXNJblFpT2lKWlJWWldaVzR5TTJwQ01tSnZkakJ2U1hGNGJtSndUMGxEUW5Kd1drRjBRbWQxVnpoRlNURTNVREY2UVRaQlpUTXJSVGRFYm5NeVFqUmhka0pGYTFWM2FGY3JWRVZJVjNWcE9TdFllRU1yYlVSTVkyMTBRVDA5SW4wPSJ9"
- );
+ // seconds
+ $time = $time + (int)$parts[0];
- var_dump($html);
- }*/
+ return $time;
+ }
private function appendtext($payload, &$text, &$index){
diff --git a/scraper/google.php b/scraper/google.php
index 28ede6d..af243ba 100644
--- a/scraper/google.php
+++ b/scraper/google.php
@@ -808,6 +808,7 @@ class google{
->getElementsByTagName("style");
$this->computedstyle = [];
+ $this->ask = [];
foreach($styles as $style){
@@ -860,6 +861,22 @@ class google{
$image_grep[1][0]
);
}
+
+ // even more javascript crap
+ // "People also ask" node is loaded trough javascript
+ preg_match_all(
+ '/window\.jsl\.dh\(\'([^\']+)\',\'(.+)\'\);/',
+ $script["innerHTML"],
+ $ask_grep
+ );
+
+ for($i=0; $i<count($ask_grep[0]); $i++){
+
+ $this->ask[trim($ask_grep[1][$i])] =
+ stripcslashes(
+ $ask_grep[2][$i]
+ );
+ }
}
// get nodes
@@ -926,22 +943,22 @@ class google{
"div"
);
- $carousel_title =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->findstyles(
- [
- "font-size" => "16px",
- "line-height" => "20px",
- "font-weight" => "400"
- ],
- self::is_class
- ),
- "div"
- );
-
if(count($carousel) !== 0){
+ $carousel_title =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "font-size" => "16px",
+ "line-height" => "20px",
+ "font-weight" => "400"
+ ],
+ self::is_class
+ ),
+ "div"
+ );
+
$sublink = []; // twitter carousel sublinks
foreach($carousel as $item){
@@ -1212,6 +1229,136 @@ class google{
continue;
}
+ $people_title =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "font-weight" => "bold",
+ "font-size" => "16px",
+ "color" => "#000",
+ "margin" => "0",
+ "padding" => "12px 16px 0 16px"
+ ],
+ self::is_class
+ ),
+ "div"
+ );
+
+ if(
+ count($people_title) !== 0 &&
+ strtolower(
+ $this->fuckhtml
+ ->getTextContent(
+ $people_title[0]
+ )
+ ) == "people also ask"
+ ){
+ /*
+ Parse "people also ask" node
+ */
+
+ $div =
+ $this->fuckhtml
+ ->getElementsByTagName("div");
+
+ // add suggestions
+ $suggestions =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "display" => "inline-block",
+ "padding-right" => "26px"
+ ],
+ self::is_class
+ ),
+ $div
+ );
+
+ foreach($suggestions as $suggestion){
+
+ $out["related"][] =
+ $this->fuckhtml
+ ->getTextContent($suggestion);
+ }
+
+ // parse websites
+ foreach($div as $d){
+
+ if(
+ isset($d["attributes"]["id"]) &&
+ strpos(
+ $d["attributes"]["id"],
+ "accdef_"
+ ) !== false
+ ){
+
+ $this->fuckhtml->load(
+ $this->ask[
+ $d["attributes"]["id"]
+ ]
+ );
+
+ $description =
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "white-space" => "pre-line",
+ "word-wrap" => "break-word"
+ ],
+ self::is_class
+ ),
+ "div"
+ )[0]
+ )
+ );
+
+ $a =
+ $this->fuckhtml
+ ->getElementsByTagName("a")
+ [0];
+
+ $this->fuckhtml->load($a);
+
+ $out["web"][] = [
+ "title" =>
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $this->fuckhtml
+ ->getElementsByTagName("span")[0]
+ )
+ ),
+ "description" => $description,
+ "url" =>
+ $this->decodeurl(
+ $this->fuckhtml
+ ->getTextContent(
+ $a
+ ["attributes"]
+ ["href"]
+ )
+ ),
+ "date" => null,
+ "type" => "web",
+ "thumb" => [
+ "url" => null,
+ "ratio" => null
+ ],
+ "sublink" => [],
+ "table" => []
+ ];
+ }
+ }
+
+ continue;
+ }
+
if(count($title) !== 0){
/*
@@ -1231,6 +1378,19 @@ class google{
"url" => $this->getimage($thumb[0]["attributes"]["id"]),
"ratio" => "1:1"
];
+
+ if(parse_url($thumb["url"], PHP_URL_HOST) == "i.ytimg.com"){
+
+ $thumb = [
+ "url" =>
+ str_replace(
+ "default.jpg",
+ "maxresdefault.jpg",
+ $thumb["url"]
+ ),
+ "ratio" => "16:9"
+ ];
+ }
}else{
$thumb = [
@@ -1287,18 +1447,33 @@ class google{
$cat = explode(":", $cat, 2);
- $table[
+ $name =
$this->fuckhtml
->getTextContent(
$cat[0]
- )
- ] =
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $cat[1]
- )
);
+
+ if(strtolower($name) != "posted"){
+
+ $table[$name] =
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $cat[1]
+ )
+ );
+ }else{
+
+ $date =
+ strtotime(
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $cat[1]
+ )
+ )
+ );
+ }
}
continue;
}
@@ -1307,6 +1482,7 @@ class google{
$this->fuckhtml
->getElementsByTagName("span");
+ $encounter_rating = false;
foreach($spans as $span){
// replace element with nothing
@@ -1319,10 +1495,53 @@ class google{
);
}
+ if($encounter_rating !== false){
+
+ switch($encounter_rating){
+
+ case 3:
+ $table["Votes"] =
+ number_format(
+ str_replace(
+ [
+ "(",
+ ")",
+ ","
+ ],
+ "",
+ $this->fuckhtml
+ ->getTextContent(
+ $span["innerHTML"]
+ )
+ )
+ );
+ break;
+
+ case 6:
+ $table["Price"] =
+ $this->fuckhtml
+ ->getTextContent(
+ $span["innerHTML"]
+ );
+ break;
+
+ case 8:
+ $table["Support"] =
+ $this->fuckhtml
+ ->getTextContent(
+ $span["innerHTML"]
+ );
+ break;
+ }
+
+ $encounter_rating++;
+ }
+
// get rating
if(isset($span["attributes"]["aria-hidden"])){
$table["Rating"] = $span["innerHTML"];
+ $encounter_rating = 0;
continue;
}
}
@@ -1565,16 +1784,7 @@ class google{
}
/*
- Detect if its a wikipedia thing
- */
- $h3 =
- $this->fuckhtml
- ->getElementsByTagName("h3");
-
-
-
- /*
- Fallback to parsing the word definitions
+ Parse instant answers with parts
*/
$parts =
$this->fuckhtml
@@ -1588,15 +1798,8 @@ class google{
"div"
);
- if(count($parts) === 0){
-
- continue;
- }
-
- $head = $parts[0];
+ if(count($parts) !== 0){
- if(count($h3) !== 0){
-
$table = [
"title" => null,
"description" => [],
@@ -1606,30 +1809,130 @@ class google{
"sublink" => []
];
- $h3 = $h3[0];
-
- $table["title"] =
+ // get thumb
+ $thumb =
$this->fuckhtml
- ->getTextContent(
- $h3
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "float" => "right",
+ "padding-left" => "16px"
+ ],
+ self::is_class
+ ),
+ "div"
);
+
+ if(count($thumb) !== 0){
+
+ $this->fuckhtml->load($thumb[0]);
+
+ $img =
+ $this->fuckhtml
+ ->getElementsByTagName("img");
+
+ if(count($img) !== 0){
+
+ $table["thumb"] =
+ $this->getimage(
+ $img[0]["attributes"]["id"]
+ );
+ }
+
+ $this->fuckhtml->load($container);
+ }
- $head["innerHTML"] =
- str_replace(
- $h3["outerHTML"],
- "",
- $head["innerHTML"]
+ $h =
+ $this->fuckhtml
+ ->getElementsByTagName("h3");
+
+ if(count($h) === 0){
+
+ $h =
+ $this->fuckhtml
+ ->getElementsByTagName("h2");
+ }
+
+ if(count($h) !== 0){
+ // set title + subtext for when a word definition
+ // appears
+ $h = $h[0];
+
+ $table["title"] =
+ $this->fuckhtml
+ ->getTextContent(
+ $h
+ );
+
+ $parts[0]["innerHTML"] =
+ str_replace(
+ $h["outerHTML"],
+ "",
+ $parts[0]["innerHTML"]
+ );
+
+ $table["description"][] =
+ [
+ "type" => "quote",
+ "value" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $parts[0]
+ )
+ ];
+ }else{
+
+ // parse it as a wikipedia header
+
+ }
+
+ // get table elements
+ $tables =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "display" => "table",
+ "width" => "100%",
+ "padding-right" => "16px",
+ "-webkit-box-sizing" => "border-box"
+ ],
+ self::is_class
+ ),
+ "div"
);
- $table["description"][] =
- [
- "type" => "quote",
- "value" =>
+ foreach($tables as $tbl){
+
+ $this->fuckhtml->load($tbl);
+
+ $images =
+ $this->fuckhtml
+ ->getElementsByTagName("img");
+
+ if(count($images) !== 0){
+
+ $image = $this->getimage($images[0]["attributes"]["id"]);
+
+ $text =
$this->fuckhtml
->getTextContent(
- $head
- )
- ];
+ $tbl
+ );
+
+ $table["description"][] = [
+ "type" => "link",
+ "value" => $text,
+ "url" => "?s=" . urlencode($text) . "&scraper=google"
+ ];
+
+ $table["description"][] = [
+ "type" => "image",
+ "url" => $image
+ ];
+ }
+
+ }
$audio =
$this->fuckhtml
@@ -1828,9 +2131,9 @@ class google{
}
}
}
+
+ $out["answer"][] = $table;
}
-
- $out["answer"][] = $table;
}
if($dmca_table){
@@ -2136,20 +2439,65 @@ class google{
$match
);
- if(count($match) !== 0){
+ if(count($match) === 0){
- if(!empty($match[1])){
-
- return urldecode($match[1]);
- }
+ return null;
+ }
+
+ $url = empty($match[1]) ? urldecode($match[2]) : urldecode($match[1]);
+
+ $domain = parse_url($url, PHP_URL_HOST);
+
+ if(
+ preg_match(
+ '/wikipedia.org$/',
+ $domain
+ )
+ ){
- if(!empty($match[2])){
-
- return urldecode($match[2]);
- }
+ // rewrite wikipedia mobile URLs to desktop
+ $url =
+ $this->replacedomain(
+ $url,
+ preg_replace(
+ '/([a-z0-9]+)(\.m\.)/',
+ '$1.',
+ $domain
+ )
+ );
}
- return null;
+ if(
+ preg_match(
+ '/imdb\.com$|youtube\.[^.]+$/',
+ $domain
+ )
+ ){
+
+ // rewrite imdb and youtube mobile URLs too
+ $url =
+ $this->replacedomain(
+ $url,
+ preg_replace(
+ '/^m\./',
+ "",
+ $domain
+ )
+ );
+
+ }
+
+ return $url;
+ }
+
+ private function replacedomain($url, $domain){
+
+ return
+ preg_replace(
+ '/(https?:\/\/)([^\/]+)/',
+ '$1' . $domain,
+ $url
+ );
}
private function titledots($title){
diff --git a/scraper/mojeek.php b/scraper/mojeek.php
index a0b5016..e7e8abc 100644
--- a/scraper/mojeek.php
+++ b/scraper/mojeek.php
@@ -909,6 +909,23 @@ class mojeek{
$a = $a[0];
+ $date =
+ explode(
+ " - ",
+ $this->fuckhtml
+ ->getTextContent(
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "span"
+ )[0]
+ )
+ );
+
+ $date =
+ strtotime(
+ $date[count($date) - 1]
+ );
+
$out["news"][] = [
"title" =>
html_entity_decode(
@@ -918,20 +935,7 @@ class mojeek{
)
),
"description" => null,
- "date" =>
- strtotime(
- explode(
- " - ",
- $this->fuckhtml
- ->getTextContent(
- $this->fuckhtml
- ->getElementsByTagName(
- "span"
- )[0]
- ),
- 2
- )[1]
- ),
+ "date" => $date,
"thumb" => [
"url" => null,
"ratio" => null