diff options
author | lolcat <will@lolcat.ca> | 2024-04-21 19:31:56 -0400 |
---|---|---|
committer | lolcat <will@lolcat.ca> | 2024-04-21 19:31:56 -0400 |
commit | 130358a9e0504a55cf3f86b2d7035feb7f4e84de (patch) | |
tree | 81f59790f7ead0b393a0e0b25caa082216245fcd /scraper | |
parent | 9e18327df69542e07fad2ef471a3ebdbe9b08ae8 (diff) |
v8
Diffstat (limited to 'scraper')
-rw-r--r-- | scraper/marginalia.php | 344 | ||||
-rw-r--r-- | scraper/pinterest.php | 7 | ||||
-rw-r--r-- | scraper/qwant.php | 893 | ||||
-rw-r--r-- | scraper/sc.php | 75 | ||||
-rw-r--r-- | scraper/wiby.php | 2 | ||||
-rw-r--r-- | scraper/yandex.php | 10 |
6 files changed, 1198 insertions, 133 deletions
diff --git a/scraper/marginalia.php b/scraper/marginalia.php index b790a97..2a2c1e6 100644 --- a/scraper/marginalia.php +++ b/scraper/marginalia.php @@ -3,78 +3,103 @@ class marginalia{ public function __construct(){ + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); + include "lib/backend.php"; $this->backend = new backend("marginalia"); } public function getfilters($page){ - switch($page){ + if(config::MARGINALIA_API_KEY === null){ - case "web": - return [ - "profile" => [ - "display" => "Profile", - "option" => [ - "any" => "Default", - "modern" => "Modern" - ] - ], - "format" => [ - "display" => "Format", - "option" => [ - "any" => "Any", - "html5" => "html5", - "xhtml" => "xhtml", - "html123" => "html123" - ] - ], - "file" => [ - "display" => "File", - "option" => [ - "any" => "Any", - "nomedia" => "Deny media", - "media" => "Contains media", - "audio" => "Contains audio", - "video" => "Contains video", - "archive" => "Contains archive", - "document" => "Contains document" - ] - ], - "javascript" => [ - "display" => "Javascript", - "option" => [ - "any" => "Allow JS", - "deny" => "Deny JS", - "require" => "Require JS" - ] - ], - "trackers" => [ - "display" => "Trackers", - "option" => [ - "any" => "Allow trackers", - "deny" => "Deny trackers", - "require" => "Require trackers" - ] - ], - "cookies" => [ - "display" => "Cookies", - "option" => [ - "any" => "Allow cookies", - "deny" => "Deny cookies", - "require" => "Require cookies" - ] - ], - "affiliate" => [ - "display" => "Affiliate links in body", - "option" => [ - "any" => "Allow affiliate links", - "deny" => "Deny affiliate links", - "require" => "Require affiliate links" - ] + $base = [ + "adtech" => [ + "display" => "Reduce adtech", + "option" => [ + "no" => "No", + "yes" => "Yes" ] - ]; + ], + "recent" => [ + "display" => "Recent results", + "option" => [ + "no" => "No", + "yes" => "Yes" + ] + ], + "intitle" => [ + "display" => "Search in title", + "option" => [ + "no" => "No", + "yes" => "Yes" + ] + ] + ]; + }else{ + + $base = []; } + + return array_merge( + $base, + [ + "format" => [ + "display" => "Format", + "option" => [ + "any" => "Any format", + "html5" => "html5", + "xhtml" => "xhtml", + "html123" => "html123" + ] + ], + "file" => [ + "display" => "Filetype", + "option" => [ + "any" => "Any filetype", + "nomedia" => "Deny media", + "media" => "Contains media", + "audio" => "Contains audio", + "video" => "Contains video", + "archive" => "Contains archive", + "document" => "Contains document" + ] + ], + "javascript" => [ + "display" => "Javascript", + "option" => [ + "any" => "Allow JS", + "deny" => "Deny JS", + "require" => "Require JS" + ] + ], + "trackers" => [ + "display" => "Trackers", + "option" => [ + "any" => "Allow trackers", + "deny" => "Deny trackers", + "require" => "Require trackers" + ] + ], + "cookies" => [ + "display" => "Cookies", + "option" => [ + "any" => "Allow cookies", + "deny" => "Deny cookies", + "require" => "Require cookies" + ] + ], + "affiliate" => [ + "display" => "Affiliate links in body", + "option" => [ + "any" => "Allow affiliate links", + "deny" => "Deny affiliate links", + "require" => "Require affiliate links" + ] + ] + ] + ); } private function get($proxy, $url, $get = []){ @@ -132,7 +157,6 @@ class marginalia{ throw new Exception("Search term is empty!"); } - $profile = $get["profile"]; $format = $get["format"]; $file = $get["file"]; @@ -180,38 +204,6 @@ class marginalia{ $search = implode(" ", $search); - $params = [ - "count" => 20 - ]; - - if($profile == "modern"){ - - $params["index"] = 1; - } - - try{ - $json = - $this->get( - $this->backend->get_ip(), // no nextpage - "https://api.marginalia.nu/" . config::MARGINALIA_API_KEY . "/search/" . urlencode($search), - $params - ); - }catch(Exception $error){ - - throw new Exception("Failed to get JSON"); - } - - if($json == "Slow down"){ - - throw new Exception("The API key used is rate limited. Please try again in a few minutes."); - } - - $json = json_decode($json, true); - /* - $handle = fopen("scraper/marginalia.json", "r"); - $json = json_decode(fread($handle, filesize("scraper/marginalia.json")), true); - fclose($handle);*/ - $out = [ "status" => "ok", "spelling" => [ @@ -228,19 +220,169 @@ class marginalia{ "related" => [] ]; - foreach($json["results"] as $result){ + if(config::MARGINALIA_API_KEY !== null){ + + try{ + $json = + $this->get( + $this->backend->get_ip(), // no nextpage + "https://api.marginalia.nu/" . config::MARGINALIA_API_KEY . "/search/" . urlencode($search), + [ + "count" => 20 + ] + ); + }catch(Exception $error){ + + throw new Exception("Failed to get JSON"); + } + + if($json == "Slow down"){ + + throw new Exception("The API key used is rate limited. Please try again in a few minutes."); + } + + $json = json_decode($json, true); + + foreach($json["results"] as $result){ + + $out["web"][] = [ + "title" => $result["title"], + "description" => str_replace("\n", " ", $result["description"]), + "url" => $result["url"], + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + } + + return $out; + } + + // no more cloudflare!! Parse html by default + $params = [ + "query" => $search + ]; + + foreach(["adtech", "recent", "intitle"] as $v){ + + if($get[$v] == "yes"){ + + switch($v){ + + case "adtech": $params["adtech"] = "reduce"; break; + case "recent": $params["recent"] = "recent"; break; + case "adtech": $params["searchTitle"] = "title"; break; + } + } + } + + try{ + $html = + $this->get( + $this->backend->get_ip(), + "https://search.marginalia.nu/search", + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to get HTML"); + } + + $this->fuckhtml->load($html); + + $sections = + $this->fuckhtml + ->getElementsByClassName( + "card search-result", + "section" + ); + + foreach($sections as $section){ + + $this->fuckhtml->load($section); + + $title = + $this->fuckhtml + ->getElementsByClassName( + "title", + "a" + )[0]; + + $description = + $this->fuckhtml + ->getElementsByClassName( + "description", + "p" + ); + + if(count($description) !== 0){ + + $description = + $this->fuckhtml + ->getTextContent( + $description[0] + ); + }else{ + + $description = null; + } + + $sublinks = []; + $sublink_html = + $this->fuckhtml + ->getElementsByClassName("additional-results"); + + if(count($sublink_html) !== 0){ + + $this->fuckhtml->load($sublink_html[0]); + + $links = + $this->fuckhtml + ->getElementsByTagName("a"); + + foreach($links as $link){ + + $sublinks[] = [ + "title" => + $this->fuckhtml + ->getTextContent( + $link + ), + "date" => null, + "description" => null, + "url" => + $this->fuckhtml + ->getTextContent( + $link["attributes"]["href"] + ) + ]; + } + } $out["web"][] = [ - "title" => $result["title"], - "description" => str_replace("\n", " ", $result["description"]), - "url" => $result["url"], + "title" => + $this->fuckhtml + ->getTextContent( + $title + ), + "description" => $description, + "url" => + $this->fuckhtml + ->getTextContent( + $title["attributes"]["href"] + ), "date" => null, "type" => "web", "thumb" => [ "url" => null, "ratio" => null ], - "sublink" => [], + "sublink" => $sublinks, "table" => [] ]; } diff --git a/scraper/pinterest.php b/scraper/pinterest.php index 37473a1..f3c4439 100644 --- a/scraper/pinterest.php +++ b/scraper/pinterest.php @@ -4,11 +4,8 @@ class pinterest{ public function __construct(){ - include "lib/nextpage.php"; - $this->nextpage = new nextpage("pinterest"); - - include "lib/proxy_pool.php"; - $this->proxy = new proxy_pool("pinterest"); + include "lib/backend.php"; + $this->backend = new backend("pinterest"); } public function getfilters($page){ diff --git a/scraper/qwant.php b/scraper/qwant.php new file mode 100644 index 0000000..9cc9b9e --- /dev/null +++ b/scraper/qwant.php @@ -0,0 +1,893 @@ +<?php + +class qwant{ + + public function __construct(){ + + include "lib/backend.php"; + $this->backend = new backend("qwant"); + } + + public function getfilters($page){ + + $base = [ + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", + "maybe" => "Maybe", + "no" => "No" + ] + ], + "country" => [ + "display" => "Country", + "option" => [ + "en_US" => "United States", + "fr_FR" => "France", + "en_GB" => "Great Britain", + "de_DE" => "Germany", + "it_IT" => "Italy", + "es_AR" => "Argentina", + "en_AU" => "Australia", + "es_ES" => "Spain (es)", + "ca_ES" => "Spain (ca)", + "cs_CZ" => "Czech Republic", + "ro_RO" => "Romania", + "el_GR" => "Greece", + "zh_CN" => "China", + "zh_HK" => "Hong Kong", + "en_NZ" => "New Zealand", + "fr_FR" => "France", + "th_TH" => "Thailand", + "ko_KR" => "South Korea", + "sv_SE" => "Sweden", + "nb_NO" => "Norway", + "da_DK" => "Denmark", + "hu_HU" => "Hungary", + "et_EE" => "Estonia", + "es_MX" => "Mexico", + "es_CL" => "Chile", + "en_CA" => "Canada (en)", + "fr_CA" => "Canada (fr)", + "en_MY" => "Malaysia", + "bg_BG" => "Bulgaria", + "fi_FI" => "Finland", + "pl_PL" => "Poland", + "nl_NL" => "Netherlands", + "pt_PT" => "Portugal", + "de_CH" => "Switzerland (de)", + "fr_CH" => "Switzerland (fr)", + "it_CH" => "Switzerland (it)", + "de_AT" => "Austria", + "fr_BE" => "Belgium (fr)", + "nl_BE" => "Belgium (nl)", + "en_IE" => "Ireland", + "he_IL" => "Israel" + ] + ] + ]; + + switch($page){ + + case "web": + $base = array_merge( + $base, + [ + "time" => [ + "display" => "Time posted", + "option" => [ + "any" => "Any time", + "day" => "Past 24 hours", + "week" => "Past week", + "month" => "Past month" + ] + ], + "extendedsearch" => [ + // no display, wont show in interface + "option" => [ + "yes" => "Yes", + "no" => "No" + ] + ] + ] + ); + break; + + case "images": + $base = array_merge( + $base, + [ + "time" => [ + "display" => "Time posted", + "option" => [ + "any" => "Any time", + "day" => "Past 24 hours", + "week" => "Past week", + "month" => "Past month" + ] + ], + "size" => [ + "display" => "Size", + "option" => [ + "any" => "Any size", + "large" => "Large", + "medium" => "Medium", + "small" => "Small" + ] + ], + "color" => [ + "display" => "Color", + "option" => [ + "any" => "Any color", + "coloronly" => "Color only", + "monochrome" => "Monochrome", + "black" => "Black", + "brown" => "Brown", + "gray" => "Gray", + "white" => "White", + "yellow" => "Yellow", + "orange" => "Orange", + "red" => "Red", + "pink" => "Pink", + "purple" => "Purple", + "blue" => "Blue", + "teal" => "Teal", + "green" => "Green" + ] + ], + "imagetype" => [ + "display" => "Type", + "option" => [ + "any" => "Any type", + "animatedgif" => "Animated GIF", + "photo" => "Photograph", + "transparent" => "Transparent" + ] + ], + "license" => [ + "display" => "License", + "option" => [ + "any" => "Any license", + "share" => "Non-commercial reproduction and sharing", + "sharecommercially" => "Reproduction and sharing", + "modify" => "Non-commercial reproduction, sharing and modification", + "modifycommercially" => "Reproduction, sharing and modification", + "public" => "Public domain" + ] + ] + ] + ); + break; + + case "videos": + $base = array_merge( + $base, + [ + "order" => [ + "display" => "Order by", + "option" => [ + "relevance" => "Relevance", + "views" => "Views", + "date" => "Most recent", + ] + ], + "source" => [ + "display" => "Source", + "option" => [ + "any" => "Any source", + "youtube" => "YouTube", + "dailymotion" => "Dailymotion", + ] + ] + ] + ); + break; + + case "news": + $base = array_merge( + $base, + [ + "time" => [ + "display" => "Time posted", + "option" => [ + "any" => "Any time", + "hour" => "Less than 1 hour ago", + "day" => "Past 24 hours", + "week" => "Past week", + "month" => "Past month" + ] + ], + "order" => [ + "display" => "Order by", + "option" => [ + "relevance" => "Relevance", + "date" => "Most recent" + ] + ] + ] + ); + break; + } + + return $base; + } + + private function get($proxy, $url, $get = []){ + + $headers = [ + "User-Agent: " . config::USER_AGENT, + "Accept: application/json, text/plain, */*", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + "Connection: keep-alive", + "Origin: https://www.qwant.com", + "Referer: https://www.qwant.com/", + "Sec-Fetch-Dest: empty", + "Sec-Fetch-Mode: cors", + "Sec-Fetch-Site: same-site", + "TE: trailers" + ]; + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); + + // Bypass HTTP/2 check + curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + return $data; + } + + public function web($get){ + + if($get["npt"]){ + + // get next page data + [$params, $proxy] = $this->backend->get($get["npt"], "web"); + + $params = json_decode($params, true); + + }else{ + + // get _GET data instead + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + if(strlen($search) > 2048){ + + throw new Exception("Search term is too long!"); + } + + $proxy = $this->backend->get_ip(); + + $params = [ + "q" => $search, + "freshness" => $get["time"], + "count" => 10, + "locale" => $get["country"], + "offset" => 0, + "device" => "desktop", + "tgp" => 3, + "safesearch" => 0, + "displayed" => "true" + ]; + + switch($get["nsfw"]){ + + case "yes": $params["safesearch"] = 0; break; + case "maybe": $params["safesearch"] = 1; break; + case "no": $params["safesearch"] = 2; break; + } + } + /* + $handle = fopen("scraper/qwant_web.json", "r"); + $json = fread($handle, filesize("scraper/qwant_web.json")); + fclose($handle);*/ + + try{ + $json = + $this->get( + $proxy, + "https://fdn.qwant.com/v3/search/web", + $params + ); + + }catch(Exception $error){ + + throw new Exception("Could not fetch JSON"); + } + + $json = json_decode($json, true); + + if($json === NULL){ + + throw new Exception("Failed to decode JSON"); + } + + if(isset($json["data"]["message"][0])){ + + throw new Exception("Server returned an error:\n" . $json["data"]["message"][0]); + } + + if($json["status"] != "success"){ + + if($json["data"]["error_code"] === 5){ + + return $out; + } + + throw new Exception("Server returned an error code: " . $json["data"]["error_code"]); + } + + if(!isset($json["data"]["result"]["items"]["mainline"])){ + + throw new Exception("Server did not return a result object"); + } + + // data is OK, parse + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + // get instant answer + if( + $get["extendedsearch"] == "yes" && + isset($json["data"]["result"]["items"]["sidebar"][0]["endpoint"]) + ){ + + try{ + $answer = + $this->get( + $proxy, + "https://api.qwant.com/v3" . + $json["data"]["result"]["items"]["sidebar"][0]["endpoint"], + [] + ); + + $answer = json_decode($answer, true); + + if( + $answer === null || + $answer["status"] != "success" || + $answer["data"]["result"] === null + ){ + + throw new Exception(); + } + + // parse answer + $out["answer"][] = [ + "title" => $answer["data"]["result"]["title"], + "description" => [ + [ + "type" => "text", + "value" => $this->trimdots($answer["data"]["result"]["description"]) + ] + ], + "url" => $answer["data"]["result"]["url"], + "thumb" => + $answer["data"]["result"]["thumbnail"]["landscape"] == null ? + null : + $this->unshitimage( + $answer["data"]["result"]["thumbnail"]["landscape"], + false + ), + "table" => [], + "sublink" => [] + ]; + + }catch(Exception $error){ + + // do nothing in case of failure + } + + } + + // get word correction + if(isset($json["data"]["query"]["queryContext"]["alteredQuery"])){ + + $out["spelling"] = [ + "type" => "including", + "using" => $json["data"]["query"]["queryContext"]["alteredQuery"], + "correction" => $json["data"]["query"]["queryContext"]["alterationOverrideQuery"] + ]; + } + + // check for next page + if($json["data"]["result"]["lastPage"] === false){ + + $params["offset"] = $params["offset"] + 10; + + $out["npt"] = + $this->backend->store( + json_encode($params), + "web", + $proxy + ); + } + + // parse results + foreach($json["data"]["result"]["items"]["mainline"] as $item){ + + switch($item["type"]){ // ignores ads + + case "web": + foreach($item["items"] as $result){ + + if(isset($result["thumbnailUrl"])){ + + $thumb = [ + "url" => $this->unshitimage($result["thumbnailUrl"]), + "ratio" => "16:9" + ]; + }else{ + + $thumb = [ + "url" => null, + "ratio" => null + ]; + } + + $sublinks = []; + if(isset($result["links"])){ + + foreach($result["links"] as $link){ + + $sublinks[] = [ + "title" => $this->trimdots($link["title"]), + "date" => null, + "description" => isset($link["desc"]) ? $this->trimdots($link["desc"]) : null, + "url" => $link["url"] + ]; + } + } + + $out["web"][] = [ + "title" => $this->trimdots($result["title"]), + "description" => $this->trimdots($result["desc"]), + "url" => $result["url"], + "date" => null, + "type" => "web", + "thumb" => $thumb, + "sublink" => $sublinks, + "table" => [] + ]; + } + break; + + case "images": + foreach($item["items"] as $image){ + + $out["image"][] = [ + "title" => $image["title"], + "source" => [ + [ + "url" => $image["media"], + "width" => (int)$image["width"], + "height" => (int)$image["height"] + ], + [ + "url" => $this->unshitimage($image["thumbnail"]), + "width" => $image["thumb_width"], + "height" => $image["thumb_height"] + ] + ], + "url" => $image["url"] + ]; + } + break; + + case "videos": + foreach($item["items"] as $video){ + + $out["video"][] = [ + "title" => $video["title"], + "description" => null, + "date" => (int)$video["date"], + "duration" => $video["duration"] === null ? null : $video["duration"] / 1000, + "views" => null, + "thumb" => + $video["thumbnail"] === null ? + [ + "url" => null, + "ratio" => null, + ] : + [ + "url" => $this->unshitimage($video["thumbnail"]), + "ratio" => "16:9", + ], + "url" => $video["url"] + ]; + } + break; + + case "related_searches": + foreach($item["items"] as $related){ + + $out["related"][] = $related["text"]; + } + break; + } + } + + return $out; + } + + + public function image($get){ + + if($get["npt"]){ + + [$params, $proxy] = + $this->backend->get( + $get["npt"], + "images" + ); + + $params = json_decode($params, true); + }else{ + + $search = $get["s"]; + + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); + + $params = [ + "t" => "images", + "q" => $search, + "count" => 125, + "locale" => $get["country"], + "offset" => 0, // increment by 125 + "device" => "desktop", + "tgp" => 3 + ]; + + if($get["time"] != "any"){ + + $params["freshness"] = $get["time"]; + } + + foreach(["size", "color", "imagetype", "license"] as $p){ + + if($get[$p] != "any"){ + + $params[$p] = $get[$p]; + } + } + + switch($get["nsfw"]){ + + case "yes": $params["safesearch"] = 0; break; + case "maybe": $params["safesearch"] = 1; break; + case "no": $params["safesearch"] = 2; break; + } + } + + try{ + $json = $this->get( + $proxy, + "https://api.qwant.com/v3/search/images", + $params, + ); + }catch(Exception $err){ + + throw new Exception("Failed to get JSON"); + } + + /* + $handle = fopen("scraper/yandex.json", "r"); + $json = fread($handle, filesize("scraper/yandex.json")); + fclose($handle);*/ + + $json = json_decode($json, true); + + if($json === null){ + + throw new Exception("Failed to decode JSON"); + } + + if($json["status"] != "success"){ + + throw new Exception("Qwant returned an API error"); + } + + $out = [ + "status" => "ok", + "npt" => null, + "image" => [] + ]; + + if($json["data"]["result"]["lastPage"] === false){ + + $params["offset"] = $params["offset"] + 125; + + $out["npt"] = $this->backend->store( + json_encode($params), + "images", + $proxy + ); + } + + foreach($json["data"]["result"]["items"] as $image){ + + $out["image"][] = [ + "title" => $this->trimdots($image["title"]), + "source" => [ + [ + "url" => $image["media"], + "width" => $image["width"], + "height" => $image["height"] + ], + [ + "url" => $this->unshitimage($image["thumbnail"]), + "width" => $image["thumb_width"], + "height" => $image["thumb_height"] + ] + ], + "url" => $image["url"] + ]; + } + + return $out; + } + + public function video($get){ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $params = [ + "t" => "videos", + "q" => $search, + "count" => 50, + "locale" => $get["country"], + "offset" => 0, // dont implement pagination + "device" => "desktop", + "tgp" => 3 + ]; + + switch($get["nsfw"]){ + + case "yes": $params["safesearch"] = 0; break; + case "maybe": $params["safesearch"] = 1; break; + case "no": $params["safesearch"] = 2; break; + } + + try{ + $json = + $this->get( + $this->backend->get_ip(), + "https://api.qwant.com/v3/search/videos", + $params + ); + }catch(Exception $error){ + + throw new Exception("Could not fetch JSON"); + } + + /* + $handle = fopen("scraper/yandex-video.json", "r"); + $json = fread($handle, filesize("scraper/yandex-video.json")); + fclose($handle); + */ + + $json = json_decode($json, true); + + if($json === null){ + + throw new Exception("Could not parse JSON"); + } + + if($json["status"] != "success"){ + + throw new Exception("Qwant returned an API error"); + } + + $out = [ + "status" => "ok", + "npt" => null, + "video" => [], + "author" => [], + "livestream" => [], + "playlist" => [], + "reel" => [] + ]; + + foreach($json["data"]["result"]["items"] as $video){ + + if(empty($video["thumbnail"])){ + + $thumb = [ + "url" => null, + "ratio" => null + ]; + }else{ + + $thumb = [ + "url" => $this->unshitimage($video["thumbnail"], false), + "ratio" => "16:9" + ]; + } + + $duration = (int)$video["duration"]; + + $out["video"][] = [ + "title" => $video["title"], + "description" => $this->limitstrlen($video["desc"]), + "author" => [ + "name" => $video["channel"], + "url" => null, + "avatar" => null + ], + "date" => (int)$video["date"], + "duration" => $duration === 0 ? null : $duration, + "views" => null, + "thumb" => $thumb, + "url" => preg_replace("/\?syndication=.+/", "", $video["url"]) + ]; + } + + return $out; + } + + public function news($get){ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $params = [ + "t" => "news", + "q" => $search, + "count" => 50, + "locale" => $get["country"], + "offset" => 0, // dont implement pagination + "device" => "desktop", + "tgp" => 3 + ]; + + switch($get["nsfw"]){ + + case "yes": $params["safesearch"] = 0; break; + case "maybe": $params["safesearch"] = 1; break; + case "no": $params["safesearch"] = 2; break; + } + + try{ + $json = + $this->get( + $this->backend->get_ip(), + "https://api.qwant.com/v3/search/news", + $params + ); + }catch(Exception $error){ + + throw new Exception("Could not fetch JSON"); + } + + /* + $handle = fopen("scraper/yandex-video.json", "r"); + $json = fread($handle, filesize("scraper/yandex-video.json")); + fclose($handle); + */ + + $json = json_decode($json, true); + + if($json === null){ + + throw new Exception("Could not parse JSON"); + } + + if($json["status"] != "success"){ + + throw new Exception("Qwant returned an API error"); + } + + $out = [ + "status" => "ok", + "npt" => null, + "news" => [] + ]; + + foreach($json["data"]["result"]["items"] as $news){ + + if(empty($news["media"][0]["pict_big"]["url"])){ + + $thumb = [ + "url" => null, + "ratio" => null + ]; + }else{ + + $thumb = [ + "url" => $this->unshitimage($news["media"][0]["pict_big"]["url"], false), + "ratio" => "16:9" + ]; + } + + $out["news"][] = [ + "title" => $news["title"], + "author" => $news["press_name"], + "description" => $this->trimdots($news["desc"]), + "date" => (int)$news["date"], + "thumb" => $thumb, + "url" => $news["url"] + ]; + } + + return $out; + } + + private function limitstrlen($text){ + + return explode("\n", wordwrap($text, 300, "\n"))[0]; + } + + private function trimdots($text){ + + return trim($text, ". "); + } + + private function unshitimage($url, $is_bing = true){ + + // https://s1.qwant.com/thumbr/0x0/8/d/f6de4deb2c2b12f55d8bdcaae576f9f62fd58a05ec0feeac117b354d1bf5c2/th.jpg?u=https%3A%2F%2Fwww.bing.com%2Fth%3Fid%3DOIP.vvDWsagzxjoKKP_rOqhwrQAAAA%26w%3D160%26h%3D160%26c%3D7%26pid%3D5.1&q=0&b=1&p=0&a=0 + parse_str(parse_url($url)["query"], $parts); + + if($is_bing){ + $parse = parse_url($parts["u"]); + parse_str($parse["query"], $parts); + + return "https://" . $parse["host"] . "/th?id=" . urlencode($parts["id"]); + } + + return $parts["u"]; + } +} diff --git a/scraper/sc.php b/scraper/sc.php index 23742f1..e2e7385 100644 --- a/scraper/sc.php +++ b/scraper/sc.php @@ -70,7 +70,7 @@ class sc{ return $data; } - public function music($get){ + public function music($get, $last_attempt = false){ if($get["npt"]){ @@ -108,6 +108,7 @@ class sc{ $type = $get["type"]; $proxy = $this->backend->get_ip(); + $token = $this->get_token($proxy); switch($type){ @@ -117,12 +118,11 @@ class sc{ "q" => $search, "variant_ids" => "", "facet" => "model", - "user_id" => config::SC_USER_ID, - "client_id" => config::SC_CLIENT_TOKEN, + "client_id" => $token, "limit" => 20, "offset" => 0, "linked_partitioning" => 1, - "app_version" => 1696577813, + "app_version" => 1713542117, "app_locale" => "en" ]; break; @@ -133,12 +133,11 @@ class sc{ "q" => $search, "variant_ids" => "", "facet_genre" => "", - "user_id" => config::SC_USER_ID, - "client_id" => config::SC_CLIENT_TOKEN, + "client_id" => $token, "limit" => 20, "offset" => 0, "linked_partitioning" => 1, - "app_version" => 1696577813, + "app_version" => 1713542117, "app_locale" => "en" ]; break; @@ -149,12 +148,11 @@ class sc{ "q" => $search, "variant_ids" => "", "facet" => "place", - "user_id" => config::SC_USER_ID, - "client_id" => config::SC_CLIENT_TOKEN, + "client_id" => $token, "limit" => 20, "offset" => 0, "linked_partitioning" => 1, - "app_version" => 1696577813, + "app_version" => 1713542117, "app_locale" => "en" ]; break; @@ -165,12 +163,11 @@ class sc{ "q" => $search, "variant_ids" => "", "facet" => "genre", - "user_id" => config::SC_USER_ID, - "client_id" => config::SC_CLIENT_TOKEN, + "client_id" => $token, "limit" => 20, "offset" => 0, "linked_partitioning" => 1, - "app_version" => 1696577813, + "app_version" => 1713542117, "app_locale" => "en" ]; break; @@ -181,12 +178,11 @@ class sc{ "q" => $search, "variant_ids" => "", "facet" => "genre", - "user_id" => config::SC_USER_ID, - "client_id" => config::SC_CLIENT_TOKEN, + "client_id" => $token, "limit" => 20, "offset" => 0, "linked_partitioning" => 1, - "app_version" => 1696577813, + "app_version" => 1713542117, "app_locale" => "en" ]; break; @@ -198,12 +194,11 @@ class sc{ "variant_ids" => "", "filter.content_tier" => "SUB_HIGH_TIER", "facet" => "genre", - "user_id" => config::SC_USER_ID, - "client_id" => config::SC_CLIENT_TOKEN, + "client_id" => $token, "limit" => 20, "offset" => 0, "linked_partitioning" => 1, - "app_version" => 1696577813, + "app_version" => 1713542117, "app_locale" => "en" ]; break; @@ -229,7 +224,14 @@ class sc{ if($json === null){ - throw new Exception("Failed to decode JSON. Did the keys set in data/config.php expire?"); + if($last_attempt === true){ + + throw new Exception("Fetched an invalid token (please report!!)"); + } + + // token might've expired, get a new one and re-try search + get_token($proxy); + return $this->music($get, true); } $out = [ @@ -352,7 +354,7 @@ class sc{ "endpoint" => "sc", "url" => $item["media"]["transcodings"][0]["url"] . - "?client_id=" . config::SC_CLIENT_TOKEN . + "?client_id=" . $token . "&track_authorization=" . $item["track_authorization"] ]; @@ -390,6 +392,37 @@ class sc{ return $out; } + public function get_token($proxy){ + + $token = apcu_fetch("sc_token"); + + if($token === false){ + + $js = + $this->get( + $proxy, + "https://a-v2.sndcdn.com/assets/1-c3e4038d.js", + [] + ); + + preg_match( + '/client_id=([^"]+)/', + $js, + $token + ); + + if(!isset($token[1])){ + + throw new Exception("Failed to get search token"); + } + + apcu_store("sc_token", $token[1]); + return $token[1]; + } + + return $token; + } + private function limitstrlen($text){ return diff --git a/scraper/wiby.php b/scraper/wiby.php index 2d79c56..59f723c 100644 --- a/scraper/wiby.php +++ b/scraper/wiby.php @@ -209,7 +209,7 @@ class wiby{ $out["web"][] = [ "title" => $this->unescapehtml(trim($links[2][$i])), - "description" => $this->unescapehtml(trim(strip_tags($links[3][$i]))), + "description" => $this->unescapehtml(trim(strip_tags($links[3][$i]), ".\n\r ")), "url" => trim($links[1][$i]), "date" => null, "type" => "web", diff --git a/scraper/yandex.php b/scraper/yandex.php index 9b73428..2e81cee 100644 --- a/scraper/yandex.php +++ b/scraper/yandex.php @@ -644,6 +644,11 @@ class yandex{ $json = json_decode($json, true); + if($json === null){ + + throw new Exception("Failed to decode JSON"); + } + if( isset($json["type"]) && $json["type"] == "captcha" @@ -652,11 +657,6 @@ class yandex{ throw new Exception("Yandex blocked this 4get instance. Please try again in ~7 minutes."); } - if($json === null){ - - throw new Exception("Failed to decode JSON"); - } - $out = [ "status" => "ok", "npt" => null, |