diff options
Diffstat (limited to 'scraper')
-rw-r--r-- | scraper/brave.php | 349 | ||||
-rw-r--r-- | scraper/google.php | 486 | ||||
-rw-r--r-- | scraper/mojeek.php | 32 |
3 files changed, 611 insertions, 256 deletions
diff --git a/scraper/brave.php b/scraper/brave.php index 4d48c33..c598c80 100644 --- a/scraper/brave.php +++ b/scraper/brave.php @@ -86,6 +86,8 @@ class brave{ ]; break; + case "images": + case "videos": case "news": return [ "country" => [ @@ -143,7 +145,7 @@ class brave{ } } - private function get($url, $get = [], $nsfw, $country/*, $is_post = false, $additional_cookies = null*/){ + private function get($url, $get = [], $nsfw, $country){ switch($nsfw){ @@ -152,13 +154,6 @@ class brave{ case "no": $nsfw = "strict"; break; } - //$cookie = "safesearch={$nsfw}; country={$country}; useLocation=0"; - /* - if($additional_cookies !== null){ - - $cookie = $additional_cookies . "; " . $cookie; - }*/ - $headers = [ "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", @@ -171,8 +166,7 @@ class brave{ "Sec-Fetch-Dest: document", "Sec-Fetch-Mode: navigate", "Sec-Fetch-Site: none", - "Sec-Fetch-User: ?1"//, - //"Content-Type: application/json" + "Sec-Fetch-User: ?1" ]; if($country == "any"){ @@ -182,22 +176,10 @@ class brave{ $curlproc = curl_init(); - /*if($is_post){ - - curl_setopt($curlproc, CURLOPT_POST, true); - curl_setopt( - $curlproc, - CURLOPT_POSTFIELDS, - json_encode($get) - ); - - }else{ - */ - if($get !== []){ - $get = http_build_query($get); - $url .= "?" . $get; - } - //} + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } curl_setopt($curlproc, CURLOPT_URL, $url); @@ -1950,18 +1932,24 @@ class brave{ return $out; } - /* - public function bypasscaptcha($html, $nsfw, $country){ + public function image($get){ - // @TODO figure out why I still cant go trough - // the captcha wall even after breaking it + $search = $get["s"]; + $country = $get["country"]; + $nsfw = $get["nsfw"]; + + $out = [ + "status" => "ok", + "npt" => null, + "image" => [] + ]; try{ $html = $this->get( - "https://search.brave.com/goggles", + "https://search.brave.com/images", [ - "q" => "site:dailymotion.com my bloody valentine" + "q" => $search ], $nsfw, $country @@ -1969,177 +1957,192 @@ class brave{ }catch(Exception $error){ - throw new Exception("Could not fetch html"); + throw new Exception("Could not fetch search page"); } + /* + $handle = fopen("scraper/brave-image.html", "r"); + $html = fread($handle, filesize("scraper/brave-image.html")); + fclose($handle);*/ - // Bypass brave search captcha - // this captcha only appears on the goggles page preg_match( - '/this\.img\.src = "(.*)"/', + '/const data = (\[{.*}\]);/', $html, - $image + $json ); - $image = - base64_decode( - explode( - "data:image/png;base64,", - $image[1] - )[1] - ); - - $im = new Imagick(); - $im->readImageBlob($image); - - $im->blurImage(20, 20); - $im->posterizeImage(2, imagick::IMGTYPE_COLORSEPARATION); + if(!isset($json[1])){ + + throw new Exception("Failed to get data object"); + } - // if we encounter a white line thats longer than 45px - // we found the circle position - $iterator = $im->getPixelRegionIterator(0, 77, 310, 1); + $json = + $this->fuckhtml + ->parseJsObject( + $json[1] + ); - $found = null; foreach( - $iterator as $row + $json[1] + ["data"] + ["body"] + ["response"] + ["results"] + as $result ){ - $whitecount = 0; - $count = 0; - - foreach($row as $pixel){ - - if($pixel->getColor()["r"] === 255){ - - $whitecount++; - $pixel->setColor("rgba(255,0,0,0)"); - - if($whitecount === 45){ - - $found = $count - 45; - break 2; - } - }else{ - - $whitecount = 0; - } - - $count++; - $iterator->syncIterator(); - } + $out["image"][] = [ + "title" => $result["title"], + "source" => [ + [ + "url" => $result["properties"]["url"], + "width" => null, + "height" => null + ], + [ + "url" => $result["thumbnail"]["src"], + "width" => null, + "height" => null + ] + ], + "url" => $result["url"] + ]; } - $found = $found + 10; + return $out; + } + + public function video($get){ - //header("Content-Type: image/png"); - //echo $im; - //die(); + $search = $get["s"]; + $country = $get["country"]; + $nsfw = $get["nsfw"]; + + $out = [ + "status" => "ok", + "npt" => null, + "video" => [], + "author" => [], + "livestream" => [], + "playlist" => [], + "reel" => [] + ]; - if($found === null){ + try{ + $html = + $this->get( + "https://search.brave.com/videos", + [ + "q" => $search + ], + $nsfw, + $country + ); + + }catch(Exception $error){ - throw new Exception("Could not bypass captcha"); + throw new Exception("Could not fetch search page"); } + /* + $handle = fopen("scraper/brave-video.html", "r"); + $html = fread($handle, filesize("scraper/brave-video.html")); + fclose($handle);*/ preg_match( - '/data="{"captcha_id":"([0-9A-z-]+)"}"/', + '/const data = (\[{.*}\]);/', $html, - $key + $json ); - $key = $key[1]; - // we bypassed captcha, send POST data - $order = - $this->get( - "https://search.brave.com/api/captcha?brave=0&captcha_id={$key}", - [ - "solution" => (string)$found - ], - $nsfw, - $country, - true - ); - - $order = json_decode($order, true)["orderId"]; + if(!isset($json[1])){ + + throw new Exception("Failed to get data object"); + } - $orderpayload = - $this->get( - "https://search.brave.com/api/rewards/v1/orders/{$order}", - [], - $nsfw, - $country + $json = + $this->fuckhtml + ->parseJsObject( + $json[1] ); - $orderpayload = json_decode($orderpayload, true); - - $creds = - $this->get( - "https://search.brave.com/api/rewards/v1/orders/{$order}/credentials", - [ - "itemId" => $orderpayload["items"][0]["id"], - "blindedCreds" => [ - "fuYAVcB/m7BU66vf3wkNGxJCSaRhshB9o+8km3F1h2c=", - "uswvcWJuPK/1qFlVdzBP3eQd0+V1EQgfAtnEoMIK+Uk=", - "fJWKGLBxl3Gyn4n9FjTLq1PjupfABT7Ni8MeB+iGzUs=", - "Aq9enJ/VZP9GxQIza3n65ZK7xQhY4VwDxv53BCb/Txg=", - "FMJA9eSLHq71K+Pcwgm4gIQOmdR/6KMy5cMgXhpd5Ro=", - "2NVhIAbvI317SP9/xXbVe/U57eWgvHyqVbHL/5+Gdmw=", - "6mpjsjSCmYEzK2xlbL8DI2P4LuhWUOxjTLvsTAL9l24=", - "kAn4wuHvIlKWhfuFfPTSfD4tZ5le9t7/61YbdEc/L3k=", - "BjjUyG16aTfd1c0h4oBzgQQOekrH1f+a5CmcXqMPTR4=", - "SBNgpCt4/V44yaQTfh+D027Yv1GJFHkjUEpPw6rAwRI=", - "XDENAtdQ7PyYx+Qx1wQGQtDWgg8WpIMgWGmd4RDOVWE=", - "tF7rB4sqamsiUk3K7fojdQSI0Q6iip72yKyhnvg/bC0=", - "VsAqflirAd/u4VsLdfRS2UvnH24ZNkFh6YN3DctLjzQ=", - "MntLbXkoI0LdcisCbNazmooiHXJyX91L1KERDAu1JRU=", - "TH6Zs8JBvFDbTDWgKbfGE4M5/cSwCtHD8ms5Y/U8zHQ=", - "jsZg0Z+qDPHymrbhdnesodhLNJ26QdunyMko1aVe4So=", - "rpKsyj6/vdnuMgLI2BApeijtGq9g5USRDL0w6X2bnlQ=", - "vCzliGT8A9vcLXj2sFf2kavOuYw69d70NpfgA22B4lI=", - "7OWoxSCtYXWcaBSifF7AXNBif/sjcuO0IelzXG/3PFk=", - "iiXtByNlT6nDMN9De5B58Jl8J0p6LCjnZ9aS3w2FEQU=", - "zDhd7gsJ4h4JkDeGK0Y0mfFd8IBdkLhMOANzwO+4Dig=", - "qANZ+AikwFReEA61JF009d/c3IHM/aSfIYwljckhJWE=", - "nNC30pDLxtXvUr+WDwfDSrAInNBpfSZkPsV2JlpheWI=", - "kGXE1pkt25P71kdJzmKIg4+yMR1VA5wNmbpBb/FhJQ8=", - "aLqPsY1Qiz2UCa2Jx3YNNt8r4JINMphks/43EiyZfXU=", - "bHGYZoQARZEM5LdFF6B74PkRqNd9EKxzuTvGYxjq+hk=", - "JOsYQjfE/9Y1u29hR+GvEkNyxUI8blgLhX1iJI/aGRQ=", - "yKjHjH5j600TJD/3WPsA1N3OmItDLifdjlysq4H6NV0=", - "9lTnUbsPp7BJ7XVN5/T4yGfzD9DJdqWB7xk72s19MAA=", - "5KHG8iY45em7zDhO/HlI0ydcZ0Ubn+XSyjifMmy7qXM=" - ] - ], - $nsfw, - $country, - true - ); + foreach( + $json + [1] + ["data"] + ["body"] + ["response"] + ["results"] + as $result + ){ + + if($result["video"]["author"] != "null"){ + + $author = [ + "name" => $result["video"]["author"]["name"] == "null" ? null : $result["video"]["author"]["name"], + "url" => $result["video"]["author"]["url"] == "null" ? null : $result["video"]["author"]["url"], + "avatar" => $result["video"]["author"]["img"] == "null" ? null : $result["video"]["author"]["img"] + ]; + }else{ + + $author = [ + "name" => null, + "url" => null, + "avatar" => null + ]; + } + + if($result["thumbnail"] != "null"){ + + $thumb = [ + "url" => $result["thumbnail"]["original"], + "ratio" => "16:9" + ]; + }else{ + + $thumb = [ + "url" => null, + "ratio" => null + ]; + } + + $out["video"][] = [ + "title" => $result["title"], + "description" => $result["description"] == "null" ? null : $this->titledots($result["description"]), + "author" => $author, + "date" => $result["age"] == "null" ? null : strtotime($result["age"]), + "duration" => $result["video"]["duration"] == "null" ? null : $this->hms2int($result["video"]["duration"]), + "views" => $result["video"]["views"] == "null" ? null : (int)$result["video"]["views"], + "thumb" => $thumb, + "url" => $result["url"] + ]; + } + + return $out; + } + + private function hms2int($time){ - var_dump($creds); + $parts = explode(":", $time, 3); + $time = 0; - sleep(2); - $test = - $this->get( - "https://search.brave.com/api/rewards/v1/orders/{$order}/credentials", - [], - $nsfw, - $country - ); + if(count($parts) === 3){ + + // hours + $time = $time + ((int)$parts[0] * 3600); + array_shift($parts); + } - var_dump($test); + if(count($parts) === 2){ + + // minutes + $time = $time + ((int)$parts[0] * 60); + array_shift($parts); + } - $html = - $this->get( - "https://search.brave.com/goggles", - [ - "q" => "site:dailymotion.com my bloody valentine" - ], - $nsfw, - $country, - false, - "__Secure-sku#brave-search-captcha=eyJ0eXBlIjoic2luZ2xlLXVzZSIsInZlcnNpb24iOjEsInNrdSI6ImJyYXZlLXNlYXJjaC1jYXB0Y2hhIiwicHJlc2VudGF0aW9uIjoiZXlKcGMzTjFaWElpT2lKaWNtRjJaUzVqYjIwL2MydDFQV0p5WVhabExYTmxZWEpqYUMxallYQjBZMmhoSWl3aWMybG5ibUYwZFhKbElqb2lNRzl0VDBneWQxZ3dTazkzU0VFMVJ6QTJaR1V5WjFOQ1dDdGhSM3B2Y2xsTVQwVTJZVVJtTUc5a1IweG1Wa3RhZEd0cU4xbHdia3BPT0VOVGNGbE5lVWR2YmpGRlNTOUhhMlZYU1RWNGQxTjJPWGxJTTNjOVBTSXNJblFpT2lKWlJWWldaVzR5TTJwQ01tSnZkakJ2U1hGNGJtSndUMGxEUW5Kd1drRjBRbWQxVnpoRlNURTNVREY2UVRaQlpUTXJSVGRFYm5NeVFqUmhka0pGYTFWM2FGY3JWRVZJVjNWcE9TdFllRU1yYlVSTVkyMTBRVDA5SW4wPSJ9" - ); + // seconds + $time = $time + (int)$parts[0]; - var_dump($html); - }*/ + return $time; + } private function appendtext($payload, &$text, &$index){ diff --git a/scraper/google.php b/scraper/google.php index 28ede6d..af243ba 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -808,6 +808,7 @@ class google{ ->getElementsByTagName("style"); $this->computedstyle = []; + $this->ask = []; foreach($styles as $style){ @@ -860,6 +861,22 @@ class google{ $image_grep[1][0] ); } + + // even more javascript crap + // "People also ask" node is loaded trough javascript + preg_match_all( + '/window\.jsl\.dh\(\'([^\']+)\',\'(.+)\'\);/', + $script["innerHTML"], + $ask_grep + ); + + for($i=0; $i<count($ask_grep[0]); $i++){ + + $this->ask[trim($ask_grep[1][$i])] = + stripcslashes( + $ask_grep[2][$i] + ); + } } // get nodes @@ -926,22 +943,22 @@ class google{ "div" ); - $carousel_title = - $this->fuckhtml - ->getElementsByClassName( - $this->findstyles( - [ - "font-size" => "16px", - "line-height" => "20px", - "font-weight" => "400" - ], - self::is_class - ), - "div" - ); - if(count($carousel) !== 0){ + $carousel_title = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "font-size" => "16px", + "line-height" => "20px", + "font-weight" => "400" + ], + self::is_class + ), + "div" + ); + $sublink = []; // twitter carousel sublinks foreach($carousel as $item){ @@ -1212,6 +1229,136 @@ class google{ continue; } + $people_title = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "font-weight" => "bold", + "font-size" => "16px", + "color" => "#000", + "margin" => "0", + "padding" => "12px 16px 0 16px" + ], + self::is_class + ), + "div" + ); + + if( + count($people_title) !== 0 && + strtolower( + $this->fuckhtml + ->getTextContent( + $people_title[0] + ) + ) == "people also ask" + ){ + /* + Parse "people also ask" node + */ + + $div = + $this->fuckhtml + ->getElementsByTagName("div"); + + // add suggestions + $suggestions = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "display" => "inline-block", + "padding-right" => "26px" + ], + self::is_class + ), + $div + ); + + foreach($suggestions as $suggestion){ + + $out["related"][] = + $this->fuckhtml + ->getTextContent($suggestion); + } + + // parse websites + foreach($div as $d){ + + if( + isset($d["attributes"]["id"]) && + strpos( + $d["attributes"]["id"], + "accdef_" + ) !== false + ){ + + $this->fuckhtml->load( + $this->ask[ + $d["attributes"]["id"] + ] + ); + + $description = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "white-space" => "pre-line", + "word-wrap" => "break-word" + ], + self::is_class + ), + "div" + )[0] + ) + ); + + $a = + $this->fuckhtml + ->getElementsByTagName("a") + [0]; + + $this->fuckhtml->load($a); + + $out["web"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByTagName("span")[0] + ) + ), + "description" => $description, + "url" => + $this->decodeurl( + $this->fuckhtml + ->getTextContent( + $a + ["attributes"] + ["href"] + ) + ), + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + } + } + + continue; + } + if(count($title) !== 0){ /* @@ -1231,6 +1378,19 @@ class google{ "url" => $this->getimage($thumb[0]["attributes"]["id"]), "ratio" => "1:1" ]; + + if(parse_url($thumb["url"], PHP_URL_HOST) == "i.ytimg.com"){ + + $thumb = [ + "url" => + str_replace( + "default.jpg", + "maxresdefault.jpg", + $thumb["url"] + ), + "ratio" => "16:9" + ]; + } }else{ $thumb = [ @@ -1287,18 +1447,33 @@ class google{ $cat = explode(":", $cat, 2); - $table[ + $name = $this->fuckhtml ->getTextContent( $cat[0] - ) - ] = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $cat[1] - ) ); + + if(strtolower($name) != "posted"){ + + $table[$name] = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $cat[1] + ) + ); + }else{ + + $date = + strtotime( + $this->titledots( + $this->fuckhtml + ->getTextContent( + $cat[1] + ) + ) + ); + } } continue; } @@ -1307,6 +1482,7 @@ class google{ $this->fuckhtml ->getElementsByTagName("span"); + $encounter_rating = false; foreach($spans as $span){ // replace element with nothing @@ -1319,10 +1495,53 @@ class google{ ); } + if($encounter_rating !== false){ + + switch($encounter_rating){ + + case 3: + $table["Votes"] = + number_format( + str_replace( + [ + "(", + ")", + "," + ], + "", + $this->fuckhtml + ->getTextContent( + $span["innerHTML"] + ) + ) + ); + break; + + case 6: + $table["Price"] = + $this->fuckhtml + ->getTextContent( + $span["innerHTML"] + ); + break; + + case 8: + $table["Support"] = + $this->fuckhtml + ->getTextContent( + $span["innerHTML"] + ); + break; + } + + $encounter_rating++; + } + // get rating if(isset($span["attributes"]["aria-hidden"])){ $table["Rating"] = $span["innerHTML"]; + $encounter_rating = 0; continue; } } @@ -1565,16 +1784,7 @@ class google{ } /* - Detect if its a wikipedia thing - */ - $h3 = - $this->fuckhtml - ->getElementsByTagName("h3"); - - - - /* - Fallback to parsing the word definitions + Parse instant answers with parts */ $parts = $this->fuckhtml @@ -1588,15 +1798,8 @@ class google{ "div" ); - if(count($parts) === 0){ - - continue; - } - - $head = $parts[0]; + if(count($parts) !== 0){ - if(count($h3) !== 0){ - $table = [ "title" => null, "description" => [], @@ -1606,30 +1809,130 @@ class google{ "sublink" => [] ]; - $h3 = $h3[0]; - - $table["title"] = + // get thumb + $thumb = $this->fuckhtml - ->getTextContent( - $h3 + ->getElementsByClassName( + $this->findstyles( + [ + "float" => "right", + "padding-left" => "16px" + ], + self::is_class + ), + "div" ); + + if(count($thumb) !== 0){ + + $this->fuckhtml->load($thumb[0]); + + $img = + $this->fuckhtml + ->getElementsByTagName("img"); + + if(count($img) !== 0){ + + $table["thumb"] = + $this->getimage( + $img[0]["attributes"]["id"] + ); + } + + $this->fuckhtml->load($container); + } - $head["innerHTML"] = - str_replace( - $h3["outerHTML"], - "", - $head["innerHTML"] + $h = + $this->fuckhtml + ->getElementsByTagName("h3"); + + if(count($h) === 0){ + + $h = + $this->fuckhtml + ->getElementsByTagName("h2"); + } + + if(count($h) !== 0){ + // set title + subtext for when a word definition + // appears + $h = $h[0]; + + $table["title"] = + $this->fuckhtml + ->getTextContent( + $h + ); + + $parts[0]["innerHTML"] = + str_replace( + $h["outerHTML"], + "", + $parts[0]["innerHTML"] + ); + + $table["description"][] = + [ + "type" => "quote", + "value" => + $this->fuckhtml + ->getTextContent( + $parts[0] + ) + ]; + }else{ + + // parse it as a wikipedia header + + } + + // get table elements + $tables = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "display" => "table", + "width" => "100%", + "padding-right" => "16px", + "-webkit-box-sizing" => "border-box" + ], + self::is_class + ), + "div" ); - $table["description"][] = - [ - "type" => "quote", - "value" => + foreach($tables as $tbl){ + + $this->fuckhtml->load($tbl); + + $images = + $this->fuckhtml + ->getElementsByTagName("img"); + + if(count($images) !== 0){ + + $image = $this->getimage($images[0]["attributes"]["id"]); + + $text = $this->fuckhtml ->getTextContent( - $head - ) - ]; + $tbl + ); + + $table["description"][] = [ + "type" => "link", + "value" => $text, + "url" => "?s=" . urlencode($text) . "&scraper=google" + ]; + + $table["description"][] = [ + "type" => "image", + "url" => $image + ]; + } + + } $audio = $this->fuckhtml @@ -1828,9 +2131,9 @@ class google{ } } } + + $out["answer"][] = $table; } - - $out["answer"][] = $table; } if($dmca_table){ @@ -2136,20 +2439,65 @@ class google{ $match ); - if(count($match) !== 0){ + if(count($match) === 0){ - if(!empty($match[1])){ - - return urldecode($match[1]); - } + return null; + } + + $url = empty($match[1]) ? urldecode($match[2]) : urldecode($match[1]); + + $domain = parse_url($url, PHP_URL_HOST); + + if( + preg_match( + '/wikipedia.org$/', + $domain + ) + ){ - if(!empty($match[2])){ - - return urldecode($match[2]); - } + // rewrite wikipedia mobile URLs to desktop + $url = + $this->replacedomain( + $url, + preg_replace( + '/([a-z0-9]+)(\.m\.)/', + '$1.', + $domain + ) + ); } - return null; + if( + preg_match( + '/imdb\.com$|youtube\.[^.]+$/', + $domain + ) + ){ + + // rewrite imdb and youtube mobile URLs too + $url = + $this->replacedomain( + $url, + preg_replace( + '/^m\./', + "", + $domain + ) + ); + + } + + return $url; + } + + private function replacedomain($url, $domain){ + + return + preg_replace( + '/(https?:\/\/)([^\/]+)/', + '$1' . $domain, + $url + ); } private function titledots($title){ diff --git a/scraper/mojeek.php b/scraper/mojeek.php index a0b5016..e7e8abc 100644 --- a/scraper/mojeek.php +++ b/scraper/mojeek.php @@ -909,6 +909,23 @@ class mojeek{ $a = $a[0]; + $date = + explode( + " - ", + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByTagName( + "span" + )[0] + ) + ); + + $date = + strtotime( + $date[count($date) - 1] + ); + $out["news"][] = [ "title" => html_entity_decode( @@ -918,20 +935,7 @@ class mojeek{ ) ), "description" => null, - "date" => - strtotime( - explode( - " - ", - $this->fuckhtml - ->getTextContent( - $this->fuckhtml - ->getElementsByTagName( - "span" - )[0] - ), - 2 - )[1] - ), + "date" => $date, "thumb" => [ "url" => null, "ratio" => null |