diff options
author | lolcat <will@lolcat.ca> | 2023-08-27 01:45:59 -0400 |
---|---|---|
committer | lolcat <will@lolcat.ca> | 2023-08-27 01:45:59 -0400 |
commit | 1fd4c2de6d2552f4619c6e67aac60bc635465cd1 (patch) | |
tree | 12c015d718606e5d245008ea7a42899aafae0dd5 | |
parent | 12a6278a5fb0a207d2d24ae97957c37b5bd9b4d7 (diff) |
added yandex web and video search, removed fb search
-rw-r--r-- | banner/cynic.png | bin | 0 -> 68909 bytes | |||
-rw-r--r-- | lib/frontend.php | 10 | ||||
-rw-r--r-- | scraper/brave.php | 121 | ||||
-rw-r--r-- | scraper/facebook.php | 8 | ||||
-rw-r--r-- | scraper/google.php | 505 | ||||
-rw-r--r-- | scraper/yandex.php | 596 | ||||
-rw-r--r-- | settings.php | 16 |
7 files changed, 1178 insertions, 78 deletions
diff --git a/banner/cynic.png b/banner/cynic.png Binary files differnew file mode 100644 index 0000000..05c728b --- /dev/null +++ b/banner/cynic.png diff --git a/lib/frontend.php b/lib/frontend.php index 0f2a1ff..9350230 100644 --- a/lib/frontend.php +++ b/lib/frontend.php @@ -878,6 +878,7 @@ class frontend{ "option" => [ "ddg" => "DuckDuckGo", "brave" => "Brave", + "yandex" => "Yandex", //"google" => "Google", "mojeek" => "Mojeek", "marginalia" => "Marginalia", @@ -903,9 +904,10 @@ class frontend{ "display" => "Scraper", "option" => [ "yt" => "YouTube", - "fb" => "Facebook videos", + //"fb" => "Facebook videos", "ddg" => "DuckDuckGo", - "brave" => "Brave"//, + "brave" => "Brave", + "yandex" => "Yandex" //"google" => "Google" ] ]; @@ -972,11 +974,11 @@ class frontend{ include "scraper/google.php"; $lib = new google(); break; - + /* case "fb": include "scraper/facebook.php"; $lib = new facebook(); - break; + break;*/ case "mojeek": include "scraper/mojeek.php"; diff --git a/scraper/brave.php b/scraper/brave.php index 50e7b49..0a73158 100644 --- a/scraper/brave.php +++ b/scraper/brave.php @@ -1183,6 +1183,28 @@ class brave{ $div = $this->fuckhtml->getElementsByTagName("div"); /* + Get small description + */ + $small_desc = + $this->fuckhtml + ->getElementsByClassName( + "infobox-description", + $div + ); + + if(count($small_desc) !== 0){ + + $answer["description"][] = [ + "type" => "quote", + "value" => + $this->fuckhtml + ->getTextContent( + $small_desc[0] + ) + ]; + } + + /* Get title + url */ $title = @@ -1292,28 +1314,25 @@ class brave{ if(count($code) === 0){ - $answer["description"] = - [ - [ - "type" => "text", - "value" => - $this->fuckhtml - ->getTextContent( - $desc_tmp - ) - ], - [ - "type" => "quote", - "value" => - $this->fuckhtml - ->getTextContent( - $author - ) - ] + $answer["description"][] = [ + "type" => "text", + "value" => + $this->fuckhtml + ->getTextContent( + $desc_tmp + ) + ]; + + $answer["description"][] = [ + "type" => "quote", + "value" => + $this->fuckhtml + ->getTextContent( + $author + ) ]; }else{ - $text = []; $i = 0; foreach($code as $snippet){ @@ -1344,7 +1363,7 @@ class brave{ ); $value = $this->fuckhtml->getTextContent($tmphtml[0], false, false); - $this->appendtext($value, $text, $i); + $this->appendtext($value, $answer["description"], $i); $type = null; switch($tag["tagName"]){ @@ -1365,10 +1384,10 @@ class brave{ $type == "title" ){ - $text[$i - 1]["value"] = rtrim($text[$i - 1]["value"]); + $answer["description"][$i - 1]["value"] = rtrim($answer["description"][$i - 1]["value"]); } - $text[] = [ + $answer["description"][] = [ "type" => $type, "value" => $value ]; @@ -1393,21 +1412,21 @@ class brave{ if(strlen($tmphtml) !== 0){ $value = $this->fuckhtml->getTextContent($tmphtml, false, false); - $this->appendtext($value, $text, $i); + $this->appendtext($value, $answer["description"], $i); } break; case "pre": - switch($text[$i - 1]["type"]){ + switch($answer["description"][$i - 1]["type"]){ case "text": case "italic": - $text[$i - 1]["value"] = rtrim($text[$i - 1]["value"]); + $answer["description"][$i - 1]["value"] = rtrim($answer["description"][$i - 1]["value"]); break; } - $text[] = + $answer["description"][] = [ "type" => "code", "value" => @@ -1441,7 +1460,7 @@ class brave{ ->getTextContent( $elem ), - $text, + $answer["description"], $i ); } @@ -1451,21 +1470,19 @@ class brave{ if( $i !== 0 && - $text[$i - 1]["type"] == "text" + $answer["description"][$i - 1]["type"] == "text" ){ - $text[$i - 1]["value"] = rtrim($text[$i - 1]["value"]); + $answer["description"][$i - 1]["value"] = rtrim($answer["description"][$i - 1]["value"]); } if($author){ - $text[] = [ + $answer["description"][] = [ "type" => "quote", "value" => $this->fuckhtml->getTextContent($author) ]; } - - $answer["description"] = $text; } }else{ @@ -1481,22 +1498,20 @@ class brave{ if(count($description) !== 0){ - $description = + $answer["description"][] = [ - [ - "type" => "text", - "value" => - $this->titledots( - preg_replace( - '/ Wikipedia$/', - "", - $this->fuckhtml - ->getTextContent( - $description[0] - ) + "type" => "text", + "value" => + $this->titledots( + preg_replace( + '/ Wikipedia$/', + "", + $this->fuckhtml + ->getTextContent( + $description[0] ) ) - ] + ) ]; $ratings = @@ -1514,7 +1529,7 @@ class brave{ "div" ); - $description[] = [ + $answer["description"][] = [ "type" => "title", "value" => "Ratings" ]; @@ -1550,36 +1565,34 @@ class brave{ )[0] ); - $c = count($description) - 1; + $c = count($answer["description"]) - 1; if( $c !== -1 && - $description[$c]["type"] == "text" + $answer["description"][$c]["type"] == "text" ){ - $description[$c]["value"] .= $num . " "; + $answer["description"][$c]["value"] .= $num . " "; }else{ - $description[] = [ + $answer["description"][] = [ "type" => "text", "value" => $num . " " ]; } - $description[] = [ + $answer["description"][] = [ "type" => "link", "value" => $this->fuckhtml->getTextContent($href), "url" => $this->fuckhtml->getTextContent($href["attributes"]["href"]) ]; - $description[] = [ + $answer["description"][] = [ "type" => "text", "value" => " (" . $votes . ")\n" ]; } } - - $answer["description"] = $description; } } diff --git a/scraper/facebook.php b/scraper/facebook.php index 46d58d6..7bd576b 100644 --- a/scraper/facebook.php +++ b/scraper/facebook.php @@ -228,16 +228,16 @@ class facebook{ ) ); } - + /* $html = $this->get( "https://www.facebook.com/watch/search/", $req - ); - /* + );*/ + $handle = fopen("scraper/facebook.html", "r"); $html = fread($handle, filesize("scraper/facebook.html")); - fclose($handle);*/ + fclose($handle); preg_match_all( '/({"__bbox":.*,"sequence_number":0}})\]\]/', diff --git a/scraper/google.php b/scraper/google.php index 7ed3577..d0e90ca 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -824,8 +824,6 @@ class google{ $html = fread($handle, filesize("scraper/google.html")); fclose($handle); - $this->fuckhtml->load($html); - $out = [ "status" => "ok", "spelling" => [ @@ -841,6 +839,507 @@ class google{ "news" => [], "related" => [] ]; + + $this->parsejavascript($html); + + $containers = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "background-color" => "#fff", + "margin-bottom" => "10px", + "-webkit-box-shadow" => "0 1px 6px rgba(32,33,36,0.28)", + "border-radius" => "8px" + ], + self::is_class + ), + "div" + ); + + foreach($containers as $container){ + + $this->fuckhtml->load($container); + + $title = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "color" => "#1967d2", + "font-size" => "20px", + "line-height" => "26px" + ], + self::is_class + ), + "div" + ); + + if(count($title) !== 0){ + + /* + Container is a web link + */ + $web = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0] + ) + ), + "description" => null, + "url" => + $this->decodeurl( + $this->fuckhtml + ->getElementsByTagName("a") + [0] + ["attributes"] + ["href"] + ), + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + + $container = $container["innerHTML"]; + + $description_container = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "padding" => "12px 16px 12px" + ], + self::is_class + ), + "div" + )[1]; + + $description = + $description_container["innerHTML"]; + + // get sublinks + $this->fuckhtml->load($description); + + $links = + $this->fuckhtml + ->getElementsByTagName("a"); + + $skip = true; + foreach($links as $link){ + + $description = + str_replace( + $link["outerHTML"], + "", + $description + ); + + if($skip){ + + $skip = false; + continue; + } + + $sublink = [ + "title" => null, + "description" => null, + "url" => null, + "date" => null + ]; + + $sublink["title"] = + $this->fuckhtml + ->getTextContent( + $link + ); + + $sublink["url"] = + $this->decodeurl( + $link + ["attributes"] + ["href"] + ); + + $web["sublink"][] = $sublink; + } + + // get thumbnail before we call loadhtml again + $img = + $this->fuckhtml + ->getElementsByTagName("img"); + + if(count($img) !== 0){ + + if( + isset($img[0]["attributes"]["alt"]) && + stripos($img[0]["attributes"]["alt"], "Video for") !== false + ){ + + // is a video thumbnail + $web["thumb"]["ratio"] = "16:9"; + }else{ + + // is a google thumbnail + $web["thumb"]["ratio"] = "1:1"; + } + + $web["thumb"]["url"] = + $this->getimage( + $img[0]["attributes"]["id"] + ); + } + + // get table elements + $this->fuckhtml->load($description); + + $levels = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "padding-bottom" => "8px" + ], + self::is_class + ), + "div" + ); + + $additional_info = []; + foreach($levels as $level){ + + $this->fuckhtml->load($level); + + $spans = + $this->fuckhtml + ->getElementsByTagName( + "span" + ); + + $is_rating = -2; + + foreach($spans as $span){ + + // clean up description + $description = + str_replace( + $span["outerHTML"], + "", + $description + ); + + $innertext = + $this->fuckhtml + ->getTextContent( + $span + ); + + if($innertext == ""){ continue; } + + if( + strtolower($innertext) + == "rating" + ){ + + $is_rating = -1; + continue; + } + + /* + Parse rating object + */ + + if($is_rating >= -1){ + + if($span["level"] !== 1){ continue; } + + $is_rating++; + + // 10/10 (123) + if($is_rating === 0){ + + $innertext = explode(" ", $innertext, 2); + + $web["table"]["Rating"] = $innertext[0]; + $web["table"]["Hits"] = + trim( + str_replace( + [ + "(", + ")" + ], + "", + $innertext[1] + ) + ); + continue; + } + + // US$4.99 + // MYR 50.00 + // $38.34 + // JP¥6,480 + if($is_rating === 2){ + + $web["table"]["Price"] = $innertext; + continue; + } + + // Android / In stock + if($is_rating === 4){ + + $web["table"]["Support"] = $innertext; + continue; + } + + // ignore the rest + continue; + } + + /* + Parse standalone text + */ + $additional_info[] = $innertext; + } + } + + for($i=0; $i<count($additional_info); $i++){ + + // @TODO + // generate better node names + $web["table"]["Info node #$i"] = $additional_info[$i]; + } + + $this->fuckhtml->load($description); + + // get date node + $span = + $this->fuckhtml + ->getElementsByTagName( + "span" + ); + + if(count($span) !== 0){ + + $description = + str_replace( + $span[0]["outerHTML"], + "", + $description + ); + + $span = + strtotime( + $this->fuckhtml + ->getTextContent( + $span[0] + ) + ); + + if($span){ + + $web["date"] = $span; + } + } + + $web["description"] = + trim( + $this->fuckhtml + ->getTextContent( + $description + ), + " ·." + ); + + $out["web"][] = $web; + + continue; + } + + // check for container title header + $container_title = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "font-weight" => "bold", + "font-size" => "16px", + "color" => "#000", + "margin" => "0", + "padding" => "12px 16px 0 16px" + ], + self::is_class + ), + "div" + ); + + if(count($container_title) !== 0){ + + $container_title = + strtolower( + $this->fuckhtml + ->getTextContent( + $container_title[0] + ) + ); + + if( + $container_title == "related searches" || + $container_title == "people also search for" + ){ + + /* + Parse related searches + */ + $as = + $this->fuckhtml + ->getElementsByTagName("a"); + + foreach($as as $a){ + + $out["related"][] = + $this->fuckhtml + ->getTextContent($a); + } + } + + continue; + } + + /* + Parse image carousel + */ + $title_container = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "padding" => "12px 16px 12px" + ], + self::is_class + ), + "div" + ); + + if(count($title_container) !== 0){ + + $title_container = + strtolower( + $this->fuckhtml + ->getTextContent( + $title_container[0] + ) + ); + + if($title_container == "imagesview all"){ + + /* + Image carousel + */ + $pcitem = + $this->fuckhtml + ->getElementsByClassName( + "pcitem", + "div" + ); + + foreach($pcitem as $item){ + + $this->fuckhtml->load($item); + + $link = + $this->fuckhtml + ->getElementsByTagName( + "a" + )[0]; + + parse_str( + parse_url( + $this->fuckhtml + ->getTextContent( + $link + ["attributes"] + ["href"] + ), + PHP_URL_QUERY + ), + $link + ); + + if(isset($link["tbm"])){ + + continue; + } + + $image = + $this->fuckhtml + ->getElementsByTagName("img")[0]; + + $title = + $this->fuckhtml + ->getTextContent( + $image + ["attributes"] + ["alt"] + ); + + $image = + $this->getimage( + $image + ["attributes"] + ["id"] + ); + + $out["image"][] = [ + "title" => $title, + "source" => [ + [ + "url" => $link["imgurl"], + "width" => (int)$link["w"], + "height" => (int)$link["h"] + ], + [ + "url" => $image, + "width" => (int)$link["tbnw"], + "height" => (int)$link["tbnh"] + ] + ], + "url" => $link["imgrefurl"] + ]; + } + } + } + + /* + Get next page + */ + $as = + $this->fuckhtml + ->getElementsByTagName("a"); + + foreach($as as $a){ + + if( + isset($a["attributes"]["aria-label"]) && + strtolower($a["attributes"]["aria-label"]) == "next page" + ){ + + $out["npt"] = + $this->fuckhtml + ->getTextContent( + $a["attributes"]["href"] + ); + } + } + } + + return $out; } @@ -1163,7 +1662,7 @@ class google{ return $time; } - private function loadjavascriptcrap($html){ + private function parsejavascript($html){ $this->fuckhtml->load($html); diff --git a/scraper/yandex.php b/scraper/yandex.php index 437c8aa..8cb733e 100644 --- a/scraper/yandex.php +++ b/scraper/yandex.php @@ -18,8 +18,6 @@ class yandex{ $curlproc = curl_init(); - $search = $get["text"]; - if($get !== []){ $get = http_build_query($get); $url .= "?" . $get; @@ -40,7 +38,7 @@ class yandex{ "Accept-Language: en-US,en;q=0.5", "DNT: 1", "Cookie: yp=1716337604.sp.family%3A{$nsfw}#1685406411.szm.1:1920x1080:1920x999", - "Referer: https://yandex.com/images/search?text={$search}", + "Referer: https://yandex.com/images/search", "Connection: keep-alive", "Upgrade-Insecure-Requests: 1", "Sec-Fetch-Dest: document", @@ -72,6 +70,35 @@ class yandex{ switch($pagetype){ + case "web": + return [ + "lang" => [ + "display" => "Language", + "option" => [ + "any" => "Any language", + "en" => "English", + "ru" => "Russian", + "be" => "Belorussian", + "fr" => "French", + "de" => "German", + "id" => "Indonesian", + "kk" => "Kazakh", + "tt" => "Tatar", + "tr" => "Turkish", + "uk" => "Ukrainian" + ] + ], + "newer" => [ + "display" => "Newer than", + "option" => "_DATE" + ], + "older" => [ + "display" => "Older than", + "option" => "_DATE" + ] + ]; + break; + case "images": return [ @@ -149,12 +176,214 @@ class yandex{ ]; break; - default: - return []; + case "videos": + return [ + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", + "maybe" => "Maybe", + "no" => "No" + ] + ], + "time" => [ + "display" => "Time posted", + "option" => [ + "any" => "Any time", + "9" => "Recently" + ] + ], + "duration" => [ + "display" => "Duration", + "option" => [ + "any" => "Any duration", + "short" => "Short" + ] + ] + ]; break; } } - + + public function web($get){ + + // has captcha + // https://yandex.com/search/touch/?text=lol&app_platform=android&appsearch_header=1&ui=webmobileapp.yandex&app_version=23070603&app_id=ru.yandex.searchplugin&search_source=yandexcom_touch_native&clid=2218567 + + // https://yandex.com/search/site/?text=minecraft&web=1&frame=1&v=2.0&searchid=3131712 + // &within=777&from_day=26&from_month=8&from_year=2023&to_day=26&to_month=8&to_year=2023 + + if($get["npt"]){ + + $npt = $this->nextpage->get($get["npt"], "web"); + + $html = + $this->get( + "https://yandex.com" . $npt, + [], + "yes" + ); + }else{ + + $search = $get["s"]; + $lang = $get["lang"]; + $older = $get["older"]; + $newer = $get["newer"]; + + $params = [ + "text" => $search, + "web" => "1", + "frame" => "1", + "searchid" => "3131712" + ]; + + if($lang != "any"){ + + $params["lang"] = $lang; + } + + if( + $newer === false && + $older !== false + ){ + + $newer = 0; + } + + if($newer !== false){ + + $params["from_day"] = date("j", $newer); + $params["from_month"] = date("n", $newer); + $params["from_year"] = date("Y", $newer); + + if($older === false){ + + $older = time(); + } + + $params["to_day"] = date("j", $older); + $params["to_month"] = date("n", $older); + $params["to_year"] = date("Y", $older); + } + + try{ + $html = + $this->get( + "https://yandex.com/search/site/", + $params, + "yes" + ); + }catch(Exception $error){ + + throw new Exception("Could not get search page"); + } + + /* + $handle = fopen("scraper/yandex.html", "r"); + $html = fread($handle, filesize("scraper/yandex.html")); + fclose($handle);*/ + } + + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + $this->fuckhtml->load($html); + + // get nextpage + $npt = + $this->fuckhtml + ->getElementsByClassName( + "b-pager__next", + "a" + ); + + if(count($npt) !== 0){ + + $out["npt"] = + $this->nextpage->store( + $this->fuckhtml + ->getTextContent( + $npt + [0] + ["attributes"] + ["href"] + ), + "web" + ); + } + + // get items + $items = + $this->fuckhtml + ->getElementsByClassName( + "b-serp-item", + "li" + ); + + foreach($items as $item){ + + $this->fuckhtml->load($item); + + $link = + $this->fuckhtml + ->getElementsByClassName( + "b-serp-item__title-link", + "a" + )[0]; + + $out["web"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $link + ) + ), + "description" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "b-serp-item__text", + "div" + )[0] + ) + ), + "url" => + $this->fuckhtml + ->getTextContent( + $link + ["attributes"] + ["href"] + ), + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + } + + return $out; + } + public function image($get){ if($get["npt"]){ @@ -402,7 +631,7 @@ class yandex{ $json["type"] == "captcha" ){ - throw new Exception("Yandex blocked this 4get instance. Yandex blocks don't last very long, but the block timer gets reset everytime you make another unsuccessful request. Please try again in ~7 minutes."); + throw new Exception("Yandex blocked this 4get instance. Please try again in ~7 minutes."); } if($json === null){ @@ -513,6 +742,359 @@ class yandex{ return $out; } + public function video($get){ + + if($get["npt"]){ + + $params = + json_decode( + $this->nextpage->get( + $get["npt"], + "web" + ), + true + ); + + $nsfw = $params["nsfw"]; + unset($params["nsfw"]); + }else{ + $search = $get["s"]; + $nsfw = $get["nsfw"]; + $time = $get["time"]; + $duration = $get["duration"]; + + // https://yandex.com/video/search + // ?tmpl_version=releases/frontend/video/v1.1168.0#8d942de0f4ebc4eb6b8f3c24ffbd1f8dbc5bbe63 + // &format=json + // &request= + // { + // "blocks":[ + // {"block":"extra-content","params":{},"version":2}, + // {"block":"i-global__params:ajax","params":{},"version":2}, + // {"block":"search2:ajax","params":{},"version":2}, + // {"block":"vital-incut","params":{},"version":2}, + // {"block":"content_type_search","params":{},"version":2}, + // {"block":"serp-controller","params":{},"version":2}, + // {"block":"cookies_ajax","params":{},"version":2} + // ], + // "metadata":{ + // "bundles":{"lb":"^G]!q<X120"}, + // "assets":{"las":"react-with-dom=1;185.0=1;73.0=1;145.0=1;5a502a.0=1;32c342.0=1;b84ac8.0=1"}, + // "extraContent":{"names":["i-react-ajax-adapter"]} + // } + // } + // &yu=4861394161661655015 + // &from=tabbar + // &reqid=1693106278500184-6825210746979814879-balancer-l7leveler-kubr-yp-sas-7-BAL-4237 + // &suggest_reqid=486139416166165501562797413447032 + // &text=minecraft + + $params = [ + "tmpl_version" => "releases/frontend/video/v1.1168.0#8d942de0f4ebc4eb6b8f3c24ffbd1f8dbc5bbe63", + "format" => "json", + "request" => json_encode([ + "blocks" => [ + (object)[ + "block" => "extra-content", + "params" => (object)[], + "version" => 2 + ], + (object)[ + "block" => "i-global__params:ajax", + "params" => (object)[], + "version" => 2 + ], + (object)[ + "block" => "search2:ajax", + "params" => (object)[], + "version" => 2 + ], + (object)[ + "block" => "vital-incut", + "params" => (object)[], + "version" => 2 + ], + (object)[ + "block" => "content_type_search", + "params" => (object)[], + "version" => 2 + ], + (object)[ + "block" => "serp-controller", + "params" => (object)[], + "version" => 2 + ], + (object)[ + "block" => "cookies_ajax", + "params" => (object)[], + "version" => 2 + ] + ], + "metadata" => (object)[ + "bundles" => (object)[ + "lb" => "^G]!q<X120" + ], + "assets" => (object)[ + "las" => "react-with-dom=1;185.0=1;73.0=1;145.0=1;5a502a.0=1;32c342.0=1;b84ac8.0=1" + ], + "extraContent" => (object)[ + "names" => [ + "i-react-ajax-adapter" + ] + ] + ] + ]), + "text" => $search + ]; + + if($duration != "any"){ + + $params["duration"] = $duration; + } + + if($time != "any"){ + + $params["within"] = $time; + } + } + /* + $handle = fopen("scraper/yandex-video.json", "r"); + $json = fread($handle, filesize("scraper/yandex-video.json")); + fclose($handle); + */ + try{ + $json = + $this->get( + "https://yandex.com/video/search", + $params, + $nsfw + ); + }catch(Exception $error){ + + throw new Exception("Could not fetch JSON"); + } + + $json = json_decode($json, true); + + if($json === null){ + + throw new Exception("Could not parse JSON"); + } + + if(!isset($json["blocks"])){ + + throw new Exception("Yandex blocked this 4get instance. Please try again in 7~ minutes."); + } + + $out = [ + "status" => "ok", + "npt" => null, + "video" => [], + "author" => [], + "livestream" => [], + "playlist" => [], + "reel" => [] + ]; + + $html = null; + foreach($json["blocks"] as $block){ + + if(isset($block["html"])){ + + $html .= $block["html"]; + } + } + + $this->fuckhtml->load($html); + + $div = + $this->fuckhtml + ->getElementsByTagName("div"); + + /* + Get nextpage + */ + $npt = + $this->fuckhtml + ->getElementsByClassName( + "more more_direction_next i-bem", + $div + ); + + if(count($npt) !== 0){ + + $params["p"] = "1"; + $params["nsfw"] = $nsfw; + $out["npt"] = + $this->nextpage->store( + json_encode($params), + "web" + ); + } + + $items = + $this->fuckhtml + ->getElementsByClassName( + "serp-item", + $div + ); + + foreach($items as $item){ + + $data = + json_decode( + $this->fuckhtml + ->getTextContent( + $item["attributes"]["data-video"] + ), + true + ); + + $this->fuckhtml->load($item); + + $thumb = + $this->fuckhtml + ->getElementsByClassName( + "thumb-image__image", + "img" + ); + + if(count($thumb) === 0){ + + $thumb = [ + "url" => null, + "ratio" => null + ]; + }else{ + + $c = 1; + $thumb = [ + "url" => + str_replace( + "//", + "https://", + $this->fuckhtml + ->getTextContent( + $thumb + [0] + ["attributes"] + ["src"] + ), + $c + ), + "ratio" => "16:9" + ]; + } + + $smallinfos = + $this->fuckhtml + ->getElementsByClassName( + "serp-item__sitelinks-item", + "div" + ); + + $date = null; + $views = null; + $first = true; + + foreach($smallinfos as $info){ + + if($first){ + + $first = false; + continue; + } + + $info = + $this->fuckhtml + ->getTextContent( + $info + ); + + if($temp_date = strtotime($info)){ + + $date = $temp_date; + }else{ + + $views = $this->parseviews($info); + } + } + + $description = + $this->fuckhtml + ->getElementsByClassName( + "serp-item__text serp-item__text_visibleText_always", + "div" + ); + + if(count($description) === 0){ + + $description = null; + }else{ + + $description = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $description[0] + ) + ); + } + + $out["video"][] = [ + "title" => + $this->fuckhtml + ->getTextContent( + $this->titledots( + $data["title"] + ) + ), + "description" => $description, + "author" => [ + "name" => null, + "url" => null, + "avatar" => null + ], + "date" => $date, + "duration" => + (int)$data + ["counters"] + ["toHostingLoaded"] + ["stredParams"] + ["duration"], + "views" => $views, + "thumb" => $thumb, + "url" => + $this->fuckhtml + ->getTextContent( + $data["counters"] + ["toHostingLoaded"] + ["postfix"] + ["href"] + ) + ]; + } + + return $out; + } + + private function parseviews($text){ + + $text = explode(" ", $text); + + $num = (float)$text[0]; + $mod = $text[1]; + + switch($mod){ + + case "bln.": $num = $num * 1000000000; break; + case "mln.": $num = $num * 1000000; break; + case "thsd.": $num = $num * 1000; break; + } + + return $num; + } + private function titledots($title){ $substr = substr($title, -3); diff --git a/settings.php b/settings.php index c53599f..c968e57 100644 --- a/settings.php +++ b/settings.php @@ -70,6 +70,10 @@ $settings = [ "value" => "brave", "text" => "Brave" ], + [ + "value" => "yandex", + "text" => "Yandex" + ], /*[ "value" => "google", "text" => "Google" @@ -119,16 +123,16 @@ $settings = [ "text" => "YouTube" ], [ - "value" => "fb", - "text" => "Facebook videos" - ], - [ "value" => "ddg", "text" => "DuckDuckGo" ], [ "value" => "brave", "text" => "Brave" + ], + [ + "value" => "yandex", + "text" => "Yandex" ]/*, [ "value" => "google", @@ -147,8 +151,8 @@ $settings = [ [ "value" => "brave", "text" => "Brave" - ],/* - [ + ], + /*[ "value" => "google", "text" => "Google" ],*/ |