path: root/scraper/qwant.php
diff options
Diffstat (limited to 'scraper/qwant.php')
1 files changed, 893 insertions, 0 deletions
diff --git a/scraper/qwant.php b/scraper/qwant.php
new file mode 100644
index 0000000..9cc9b9e
--- /dev/null
+++ b/scraper/qwant.php
@@ -0,0 +1,893 @@
+class qwant{
+ /**
+  * Backend helper for the "qwant" scraper. The methods of this class use it
+  * for get_ip(), get(), store() and assign_proxy().
+  *
+  * Declared explicitly (and public, matching the visibility the former
+  * dynamic property had) because creating undeclared properties is
+  * deprecated as of PHP 8.2.
+  */
+ public $backend;
+ /**
+  * Load the backend library and create the backend helper for this scraper.
+  */
+ public function __construct(){
+     // include_once: a plain include would redeclare the backend class (fatal
+     // error) if the library was already loaded earlier in the same request
+     include_once "lib/backend.php";
+     $this->backend = new backend("qwant");
+ }
+ /**
+  * Describe the search filters available for a given result page.
+  *
+  * Every page gets the "nsfw" and "country" filters. The "web", "images",
+  * "videos" and "news" pages append their own filters after those two; any
+  * other $page value returns only the two common filters.
+  *
+  * @param string $page Page name: "web", "images", "videos" or "news".
+  * @return array Map of filter name => ["display" => label, "option" => [value => label]].
+  *               The web-only "extendedsearch" filter has no "display" key.
+  */
+ public function getfilters($page){
+     // identical "time" filter used by both the web and images pages
+     $time = [
+         "display" => "Time posted",
+         "option" => [
+             "any" => "Any time",
+             "day" => "Past 24 hours",
+             "week" => "Past week",
+             "month" => "Past month"
+         ]
+     ];
+     // filters common to every page
+     $base = [
+         "nsfw" => [
+             "display" => "NSFW",
+             "option" => [
+                 "yes" => "Yes",
+                 "maybe" => "Maybe",
+                 "no" => "No"
+             ]
+         ],
+         "country" => [
+             "display" => "Country",
+             "option" => [
+                 "en_US" => "United States",
+                 "fr_FR" => "France",
+                 "en_GB" => "Great Britain",
+                 "de_DE" => "Germany",
+                 "it_IT" => "Italy",
+                 "es_AR" => "Argentina",
+                 "en_AU" => "Australia",
+                 "es_ES" => "Spain (es)",
+                 "ca_ES" => "Spain (ca)",
+                 "cs_CZ" => "Czech Republic",
+                 "ro_RO" => "Romania",
+                 "el_GR" => "Greece",
+                 "zh_CN" => "China",
+                 "zh_HK" => "Hong Kong",
+                 "en_NZ" => "New Zealand",
+                 // "fr_FR" used to be listed a second time here; the duplicate
+                 // key was dead (same value, position of the first occurrence)
+                 "th_TH" => "Thailand",
+                 "ko_KR" => "South Korea",
+                 "sv_SE" => "Sweden",
+                 "nb_NO" => "Norway",
+                 "da_DK" => "Denmark",
+                 "hu_HU" => "Hungary",
+                 "et_EE" => "Estonia",
+                 "es_MX" => "Mexico",
+                 "es_CL" => "Chile",
+                 "en_CA" => "Canada (en)",
+                 "fr_CA" => "Canada (fr)",
+                 "en_MY" => "Malaysia",
+                 "bg_BG" => "Bulgaria",
+                 "fi_FI" => "Finland",
+                 "pl_PL" => "Poland",
+                 "nl_NL" => "Netherlands",
+                 "pt_PT" => "Portugal",
+                 "de_CH" => "Switzerland (de)",
+                 "fr_CH" => "Switzerland (fr)",
+                 "it_CH" => "Switzerland (it)",
+                 "de_AT" => "Austria",
+                 "fr_BE" => "Belgium (fr)",
+                 "nl_BE" => "Belgium (nl)",
+                 "en_IE" => "Ireland",
+                 "he_IL" => "Israel"
+             ]
+         ]
+     ];
+     // page-specific filters are appended after the common ones, in order
+     switch($page){
+         case "web":
+             $base["time"] = $time;
+             $base["extendedsearch"] = [
+                 // no "display" key; the original author notes this keeps
+                 // the filter out of the interface
+                 "option" => [
+                     "yes" => "Yes",
+                     "no" => "No"
+                 ]
+             ];
+             break;
+         case "images":
+             $base["time"] = $time;
+             $base["size"] = [
+                 "display" => "Size",
+                 "option" => [
+                     "any" => "Any size",
+                     "large" => "Large",
+                     "medium" => "Medium",
+                     "small" => "Small"
+                 ]
+             ];
+             $base["color"] = [
+                 "display" => "Color",
+                 "option" => [
+                     "any" => "Any color",
+                     "coloronly" => "Color only",
+                     "monochrome" => "Monochrome",
+                     "black" => "Black",
+                     "brown" => "Brown",
+                     "gray" => "Gray",
+                     "white" => "White",
+                     "yellow" => "Yellow",
+                     "orange" => "Orange",
+                     "red" => "Red",
+                     "pink" => "Pink",
+                     "purple" => "Purple",
+                     "blue" => "Blue",
+                     "teal" => "Teal",
+                     "green" => "Green"
+                 ]
+             ];
+             $base["imagetype"] = [
+                 "display" => "Type",
+                 "option" => [
+                     "any" => "Any type",
+                     "animatedgif" => "Animated GIF",
+                     "photo" => "Photograph",
+                     "transparent" => "Transparent"
+                 ]
+             ];
+             $base["license"] = [
+                 "display" => "License",
+                 "option" => [
+                     "any" => "Any license",
+                     "share" => "Non-commercial reproduction and sharing",
+                     "sharecommercially" => "Reproduction and sharing",
+                     "modify" => "Non-commercial reproduction, sharing and modification",
+                     "modifycommercially" => "Reproduction, sharing and modification",
+                     "public" => "Public domain"
+                 ]
+             ];
+             break;
+         case "videos":
+             $base["order"] = [
+                 "display" => "Order by",
+                 "option" => [
+                     "relevance" => "Relevance",
+                     "views" => "Views",
+                     "date" => "Most recent"
+                 ]
+             ];
+             $base["source"] = [
+                 "display" => "Source",
+                 "option" => [
+                     "any" => "Any source",
+                     "youtube" => "YouTube",
+                     "dailymotion" => "Dailymotion"
+                 ]
+             ];
+             break;
+         case "news":
+             // news has its own time filter: it adds the "hour" option
+             $base["time"] = [
+                 "display" => "Time posted",
+                 "option" => [
+                     "any" => "Any time",
+                     "hour" => "Less than 1 hour ago",
+                     "day" => "Past 24 hours",
+                     "week" => "Past week",
+                     "month" => "Past month"
+                 ]
+             ];
+             $base["order"] = [
+                 "display" => "Order by",
+                 "option" => [
+                     "relevance" => "Relevance",
+                     "date" => "Most recent"
+                 ]
+             ];
+             break;
+     }
+     return $base;
+ }
+ /**
+  * Perform an HTTP GET request through the given proxy and return the body.
+  *
+  * @param mixed $proxy Proxy descriptor, handed unchanged to the backend's assign_proxy().
+  * @param string $url Request URL without a query string.
+  * @param array $get Query parameters; appended with http_build_query() when non-empty.
+  * @return string|bool The value returned by curl_exec() (the response body, as
+  *                     CURLOPT_RETURNTRANSFER is enabled).
+  * @throws Exception Carrying the curl error message when curl reports an error.
+  */
+ private function get($proxy, $url, $get = []){
+     $headers = [
+         "User-Agent: " . config::USER_AGENT,
+         "Accept: application/json, text/plain, */*",
+         "Accept-Language: en-US,en;q=0.5",
+         "Accept-Encoding: gzip",
+         "DNT: 1",
+         "Connection: keep-alive",
+         // a header with nothing after the colon is not sent by libcurl
+         "Origin:",
+         "Referer:",
+         "Sec-Fetch-Dest: empty",
+         "Sec-Fetch-Mode: cors",
+         "Sec-Fetch-Site: same-site",
+         "TE: trailers"
+     ];
+     if($get !== []){
+         $get = http_build_query($get);
+         $url .= "?" . $get;
+     }
+     $curlproc = curl_init();
+     // try/finally: the handle used to leak whenever an exception was thrown,
+     // because curl_close() was only reached on the success path
+     try{
+         curl_setopt($curlproc, CURLOPT_URL, $url);
+         curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
+         curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
+         // Bypass HTTP/2 check
+         curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+         curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
+         curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
+         curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
+         curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
+         curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+         $this->backend->assign_proxy($curlproc, $proxy);
+         $data = curl_exec($curlproc);
+         if(curl_errno($curlproc)){
+             // the message is read before the finally block closes the handle
+             throw new Exception(curl_error($curlproc));
+         }
+         return $data;
+     }finally{
+         curl_close($curlproc);
+     }
+ }
+ /**
+  * Run a web search (or fetch the next page of one) and normalise the result.
+  *
+  * @param array $get Request parameters. Reads "npt" (next-page token; when truthy
+  *                   the stored parameters and proxy are reused), otherwise "s"
+  *                   (search term), "time", "country" and "nsfw". "extendedsearch"
+  *                   is read in both cases to decide whether to fetch the sidebar answer.
+  * @return array Result with keys "status", "spelling", "npt", "answer", "web",
+  *               "image", "video", "news" and "related".
+  * @throws Exception On an empty or over-long search term, a failed request,
+  *                   undecodable JSON or an error reported by the server.
+  */
+ public function web($get){
+     if($get["npt"]){
+         // get next page data
+         [$params, $proxy] = $this->backend->get($get["npt"], "web");
+         $params = json_decode($params, true);
+     }else{
+         // get _GET data instead
+         $search = $get["s"];
+         if(strlen($search) === 0){
+             throw new Exception("Search term is empty!");
+         }
+         if(strlen($search) > 2048){
+             throw new Exception("Search term is too long!");
+         }
+         $proxy = $this->backend->get_ip();
+         $params = [
+             "q" => $search,
+             "freshness" => $get["time"],
+             "count" => 10,
+             "locale" => $get["country"],
+             "offset" => 0,
+             "device" => "desktop",
+             "tgp" => 3,
+             "safesearch" => 0,
+             "displayed" => "true"
+         ];
+         switch($get["nsfw"]){
+             case "yes": $params["safesearch"] = 0; break;
+             case "maybe": $params["safesearch"] = 1; break;
+             case "no": $params["safesearch"] = 2; break;
+         }
+     }
+     try{
+         $json =
+             $this->get(
+                 $proxy,
+                 "",
+                 $params
+             );
+     }catch(Exception $error){
+         throw new Exception("Could not fetch JSON");
+     }
+     $json = json_decode($json, true);
+     if($json === NULL){
+         throw new Exception("Failed to decode JSON");
+     }
+     if(isset($json["data"]["message"][0])){
+         throw new Exception("Server returned an error:\n" . $json["data"]["message"][0]);
+     }
+     // Result skeleton. Built before the status check: the error_code 5 branch
+     // below returns it, and it previously returned an undefined variable.
+     $out = [
+         "status" => "ok",
+         "spelling" => [
+             "type" => "no_correction",
+             "using" => null,
+             "correction" => null
+         ],
+         "npt" => null,
+         "answer" => [],
+         "web" => [],
+         "image" => [],
+         "video" => [],
+         "news" => [],
+         "related" => []
+     ];
+     if($json["status"] != "success"){
+         // error_code 5 yields an empty result set instead of an exception
+         // (presumably the API's "no results" code -- TODO confirm)
+         if($json["data"]["error_code"] === 5){
+             return $out;
+         }
+         throw new Exception("Server returned an error code: " . $json["data"]["error_code"]);
+     }
+     if(!isset($json["data"]["result"]["items"]["mainline"])){
+         throw new Exception("Server did not return a result object");
+     }
+     // data is OK, parse
+     // get instant answer
+     if(
+         $get["extendedsearch"] == "yes" &&
+         isset($json["data"]["result"]["items"]["sidebar"][0]["endpoint"])
+     ){
+         try{
+             $answer =
+                 $this->get(
+                     $proxy,
+                     "" .
+                     $json["data"]["result"]["items"]["sidebar"][0]["endpoint"],
+                     []
+                 );
+             $answer = json_decode($answer, true);
+             if(
+                 $answer === null ||
+                 $answer["status"] != "success" ||
+                 $answer["data"]["result"] === null
+             ){
+                 throw new Exception();
+             }
+             // parse answer
+             $out["answer"][] = [
+                 "title" => $answer["data"]["result"]["title"],
+                 "description" => [
+                     [
+                         "type" => "text",
+                         "value" => $this->trimdots($answer["data"]["result"]["description"])
+                     ]
+                 ],
+                 "url" => $answer["data"]["result"]["url"],
+                 "thumb" =>
+                     $answer["data"]["result"]["thumbnail"]["landscape"] == null ?
+                     null :
+                     $this->unshitimage(
+                         $answer["data"]["result"]["thumbnail"]["landscape"],
+                         false
+                     ),
+                 "table" => [],
+                 "sublink" => []
+             ];
+         }catch(Exception $error){
+             // the instant answer is optional: do nothing in case of failure
+         }
+     }
+     // get word correction
+     if(isset($json["data"]["query"]["queryContext"]["alteredQuery"])){
+         $out["spelling"] = [
+             "type" => "including",
+             "using" => $json["data"]["query"]["queryContext"]["alteredQuery"],
+             "correction" => $json["data"]["query"]["queryContext"]["alterationOverrideQuery"]
+         ];
+     }
+     // check for next page: store the parameters for the following offset
+     if($json["data"]["result"]["lastPage"] === false){
+         $params["offset"] = $params["offset"] + 10;
+         $out["npt"] =
+             $this->backend->store(
+                 json_encode($params),
+                 "web",
+                 $proxy
+             );
+     }
+     // parse results
+     foreach($json["data"]["result"]["items"]["mainline"] as $item){
+         switch($item["type"]){ // ignores ads
+             case "web":
+                 foreach($item["items"] as $result){
+                     if(isset($result["thumbnailUrl"])){
+                         $thumb = [
+                             "url" => $this->unshitimage($result["thumbnailUrl"]),
+                             "ratio" => "16:9"
+                         ];
+                     }else{
+                         $thumb = [
+                             "url" => null,
+                             "ratio" => null
+                         ];
+                     }
+                     $sublinks = [];
+                     if(isset($result["links"])){
+                         foreach($result["links"] as $link){
+                             $sublinks[] = [
+                                 "title" => $this->trimdots($link["title"]),
+                                 "date" => null,
+                                 "description" => isset($link["desc"]) ? $this->trimdots($link["desc"]) : null,
+                                 "url" => $link["url"]
+                             ];
+                         }
+                     }
+                     $out["web"][] = [
+                         "title" => $this->trimdots($result["title"]),
+                         "description" => $this->trimdots($result["desc"]),
+                         "url" => $result["url"],
+                         "date" => null,
+                         "type" => "web",
+                         "thumb" => $thumb,
+                         "sublink" => $sublinks,
+                         "table" => []
+                     ];
+                 }
+                 break;
+             case "images":
+                 foreach($item["items"] as $image){
+                     $out["image"][] = [
+                         "title" => $image["title"],
+                         "source" => [
+                             [
+                                 "url" => $image["media"],
+                                 "width" => (int)$image["width"],
+                                 "height" => (int)$image["height"]
+                             ],
+                             [
+                                 "url" => $this->unshitimage($image["thumbnail"]),
+                                 "width" => $image["thumb_width"],
+                                 "height" => $image["thumb_height"]
+                             ]
+                         ],
+                         "url" => $image["url"]
+                     ];
+                 }
+                 break;
+             case "videos":
+                 foreach($item["items"] as $video){
+                     $out["video"][] = [
+                         "title" => $video["title"],
+                         "description" => null,
+                         "date" => (int)$video["date"],
+                         "duration" => $video["duration"] === null ? null : $video["duration"] / 1000,
+                         "views" => null,
+                         "thumb" =>
+                             $video["thumbnail"] === null ?
+                             [
+                                 "url" => null,
+                                 "ratio" => null,
+                             ] :
+                             [
+                                 "url" => $this->unshitimage($video["thumbnail"]),
+                                 "ratio" => "16:9",
+                             ],
+                         "url" => $video["url"]
+                     ];
+                 }
+                 break;
+             case "related_searches":
+                 foreach($item["items"] as $related){
+                     $out["related"][] = $related["text"];
+                 }
+                 break;
+         }
+     }
+     return $out;
+ }
+ /**
+  * Run an image search (or fetch the next page of one) and normalise the result.
+  *
+  * @param array $get Request parameters. Reads "npt" (next-page token); when it is
+  *                   falsy, reads "s", "country", "time", "size", "color",
+  *                   "imagetype", "license" and "nsfw" instead.
+  * @return array Result with keys "status", "npt" and "image".
+  * @throws Exception On an empty search term, a failed request, undecodable JSON
+  *                   or a non-"success" API status.
+  */
+ public function image($get){
+     if(!$get["npt"]){
+         // fresh search: build the request from the submitted filters
+         $query = $get["s"];
+         if(strlen($query) === 0){
+             throw new Exception("Search term is empty!");
+         }
+         $proxy = $this->backend->get_ip();
+         $params = [
+             "t" => "images",
+             "q" => $query,
+             "count" => 125,
+             "locale" => $get["country"],
+             "offset" => 0, // grows by 125 per page
+             "device" => "desktop",
+             "tgp" => 3
+         ];
+         // filters left on "any" are simply not sent
+         if($get["time"] != "any"){
+             $params["freshness"] = $get["time"];
+         }
+         foreach(["size", "color", "imagetype", "license"] as $filter){
+             if($get[$filter] != "any"){
+                 $params[$filter] = $get[$filter];
+             }
+         }
+         $nsfw = $get["nsfw"];
+         if($nsfw == "yes"){
+             $params["safesearch"] = 0;
+         }elseif($nsfw == "maybe"){
+             $params["safesearch"] = 1;
+         }elseif($nsfw == "no"){
+             $params["safesearch"] = 2;
+         }
+     }else{
+         // continuation: reuse the stored parameters and proxy
+         [$stored, $proxy] = $this->backend->get($get["npt"], "images");
+         $params = json_decode($stored, true);
+     }
+     try{
+         $body = $this->get($proxy, "", $params);
+     }catch(Exception $error){
+         throw new Exception("Failed to get JSON");
+     }
+     $json = json_decode($body, true);
+     if($json === null){
+         throw new Exception("Failed to decode JSON");
+     }
+     if($json["status"] != "success"){
+         throw new Exception("Qwant returned an API error");
+     }
+     $out = [
+         "status" => "ok",
+         "npt" => null,
+         "image" => []
+     ];
+     // more pages available: remember the parameters for the next offset
+     if($json["data"]["result"]["lastPage"] === false){
+         $params["offset"] += 125;
+         $out["npt"] = $this->backend->store(json_encode($params), "images", $proxy);
+     }
+     foreach($json["data"]["result"]["items"] as $entry){
+         $title = $this->trimdots($entry["title"]);
+         // first source is the full-size image, second its thumbnail
+         $original = [
+             "url" => $entry["media"],
+             "width" => $entry["width"],
+             "height" => $entry["height"]
+         ];
+         $preview = [
+             "url" => $this->unshitimage($entry["thumbnail"]),
+             "width" => $entry["thumb_width"],
+             "height" => $entry["thumb_height"]
+         ];
+         $out["image"][] = [
+             "title" => $title,
+             "source" => [$original, $preview],
+             "url" => $entry["url"]
+         ];
+     }
+     return $out;
+ }
+ /**
+  * Run a video search and normalise the result. Only the first page
+  * (up to 50 items) is requested; no next-page token is produced.
+  *
+  * @param array $get Request parameters; reads "s", "country" and "nsfw".
+  * @return array Result with keys "status", "npt" (always null), "video",
+  *               "author", "livestream", "playlist" and "reel".
+  * @throws Exception On an empty search term, a failed request, undecodable JSON
+  *                   or a non-"success" API status.
+  */
+ public function video($get){
+     $query = $get["s"];
+     if(strlen($query) === 0){
+         throw new Exception("Search term is empty!");
+     }
+     $params = [
+         "t" => "videos",
+         "q" => $query,
+         "count" => 50,
+         "locale" => $get["country"],
+         "offset" => 0, // pagination is intentionally not implemented
+         "device" => "desktop",
+         "tgp" => 3
+     ];
+     $nsfw = $get["nsfw"];
+     if($nsfw == "yes"){
+         $params["safesearch"] = 0;
+     }elseif($nsfw == "maybe"){
+         $params["safesearch"] = 1;
+     }elseif($nsfw == "no"){
+         $params["safesearch"] = 2;
+     }
+     try{
+         // the proxy is picked inside the try so a failure there is reported
+         // the same way as a failed request
+         $proxy = $this->backend->get_ip();
+         $body = $this->get($proxy, "", $params);
+     }catch(Exception $error){
+         throw new Exception("Could not fetch JSON");
+     }
+     $json = json_decode($body, true);
+     if($json === null){
+         throw new Exception("Could not parse JSON");
+     }
+     if($json["status"] != "success"){
+         throw new Exception("Qwant returned an API error");
+     }
+     $out = [
+         "status" => "ok",
+         "npt" => null,
+         "video" => [],
+         "author" => [],
+         "livestream" => [],
+         "playlist" => [],
+         "reel" => []
+     ];
+     foreach($json["data"]["result"]["items"] as $entry){
+         $thumb =
+             empty($entry["thumbnail"]) ?
+             ["url" => null, "ratio" => null] :
+             ["url" => $this->unshitimage($entry["thumbnail"], false), "ratio" => "16:9"];
+         // a duration of 0 is reported as "unknown" (null)
+         $seconds = (int)$entry["duration"];
+         if($seconds === 0){
+             $seconds = null;
+         }
+         $out["video"][] = [
+             "title" => $entry["title"],
+             "description" => $this->limitstrlen($entry["desc"]),
+             "author" => [
+                 "name" => $entry["channel"],
+                 "url" => null,
+                 "avatar" => null
+             ],
+             "date" => (int)$entry["date"],
+             "duration" => $seconds,
+             "views" => null,
+             "thumb" => $thumb,
+             // drop the "?syndication=..." suffix from the video URL
+             "url" => preg_replace("/\?syndication=.+/", "", $entry["url"])
+         ];
+     }
+     return $out;
+ }
+ /**
+  * Run a news search and normalise the result. Only the first page
+  * (up to 50 items) is requested; no next-page token is produced.
+  *
+  * @param array $get Request parameters; reads "s", "country" and "nsfw".
+  * @return array Result with keys "status", "npt" (always null) and "news".
+  * @throws Exception On an empty search term, a failed request, undecodable JSON
+  *                   or a non-"success" API status.
+  */
+ public function news($get){
+     $query = $get["s"];
+     if(strlen($query) === 0){
+         throw new Exception("Search term is empty!");
+     }
+     $params = [
+         "t" => "news",
+         "q" => $query,
+         "count" => 50,
+         "locale" => $get["country"],
+         "offset" => 0, // pagination is intentionally not implemented
+         "device" => "desktop",
+         "tgp" => 3
+     ];
+     $nsfw = $get["nsfw"];
+     if($nsfw == "yes"){
+         $params["safesearch"] = 0;
+     }elseif($nsfw == "maybe"){
+         $params["safesearch"] = 1;
+     }elseif($nsfw == "no"){
+         $params["safesearch"] = 2;
+     }
+     try{
+         // the proxy is picked inside the try so a failure there is reported
+         // the same way as a failed request
+         $proxy = $this->backend->get_ip();
+         $body = $this->get($proxy, "", $params);
+     }catch(Exception $error){
+         throw new Exception("Could not fetch JSON");
+     }
+     $json = json_decode($body, true);
+     if($json === null){
+         throw new Exception("Could not parse JSON");
+     }
+     if($json["status"] != "success"){
+         throw new Exception("Qwant returned an API error");
+     }
+     $out = [
+         "status" => "ok",
+         "npt" => null,
+         "news" => []
+     ];
+     foreach($json["data"]["result"]["items"] as $article){
+         // the thumbnail comes from the first media entry's "pict_big" URL
+         $thumb =
+             empty($article["media"][0]["pict_big"]["url"]) ?
+             ["url" => null, "ratio" => null] :
+             [
+                 "url" => $this->unshitimage($article["media"][0]["pict_big"]["url"], false),
+                 "ratio" => "16:9"
+             ];
+         $out["news"][] = [
+             "title" => $article["title"],
+             "author" => $article["press_name"],
+             "description" => $this->trimdots($article["desc"]),
+             "date" => (int)$article["date"],
+             "thumb" => $thumb,
+             "url" => $article["url"]
+         ];
+     }
+     return $out;
+ }
+ /**
+  * Shorten text to its first line after word-wrapping at 300 characters.
+  *
+  * Text that already contains a newline is cut at that newline, since only
+  * the first line of the wrapped text is kept.
+  *
+  * @param string $text Text to shorten.
+  * @return string Everything before the first newline of the wrapped text.
+  */
+ private function limitstrlen($text){
+     $wrapped = wordwrap($text, 300, "\n");
+     $cut = strpos($wrapped, "\n");
+     if($cut === false){
+         // a single line: nothing to cut off
+         return $wrapped;
+     }
+     return substr($wrapped, 0, $cut);
+ }
+ /**
+  * Strip leading and trailing dots and spaces from a string.
+  *
+  * @param string $text Text to clean up.
+  * @return string The text without "." or " " characters at either end.
+  */
+ private function trimdots($text){
+     $strip = ". ";
+     $text = rtrim($text, $strip);
+     return ltrim($text, $strip);
+ }
+ /**
+  * Extract the upstream image URL from a proxied thumbnail URL.
+  *
+  * The upstream URL is read from the "u" query parameter of $url. When
+  * $is_bing is true, that upstream URL is further reduced to
+  * "https://<host>/th?id=<id>" using its own "id" query parameter.
+  *
+  * Inputs that cannot be unwrapped are no longer turned into a broken
+  * "https:///th?id=" URL: without a usable "u" parameter $url is returned
+  * unchanged, and when the upstream URL lacks a host or an "id" parameter
+  * the upstream URL itself is returned.
+  *
+  * @param string $url Proxied thumbnail URL.
+  * @param bool $is_bing Whether to rewrite the upstream URL to the short "th?id=" form.
+  * @return string|null The unwrapped URL, or $url as given when nothing could be unwrapped.
+  */
+ private function unshitimage($url, $is_bing = true){
+     $query = parse_url((string)$url, PHP_URL_QUERY);
+     if(!is_string($query)){
+         // no query string (or unparseable URL): nothing to unwrap
+         return $url;
+     }
+     parse_str($query, $parts);
+     if(!isset($parts["u"]) || !is_string($parts["u"])){
+         return $url;
+     }
+     $upstream = $parts["u"];
+     if(!$is_bing){
+         return $upstream;
+     }
+     $parse = parse_url($upstream);
+     if($parse === false || !isset($parse["host"], $parse["query"])){
+         return $upstream;
+     }
+     parse_str($parse["query"], $inner);
+     if(!isset($inner["id"]) || !is_string($inner["id"])){
+         return $upstream;
+     }
+     return "https://" . $parse["host"] . "/th?id=" . urlencode($inner["id"]);
+ }