summaryrefslogtreecommitdiff
path: root/scraper
diff options
context:
space:
mode:
author: lolcat <will@lolcat.ca>, 2024-04-21 19:31:56 -0400
committer: lolcat <will@lolcat.ca>, 2024-04-21 19:31:56 -0400
commit: 130358a9e0504a55cf3f86b2d7035feb7f4e84de (patch)
tree: 81f59790f7ead0b393a0e0b25caa082216245fcd /scraper
parent: 9e18327df69542e07fad2ef471a3ebdbe9b08ae8 (diff)
v8
Diffstat (limited to 'scraper')
-rw-r--r-- scraper/marginalia.php 344
-rw-r--r-- scraper/pinterest.php 7
-rw-r--r-- scraper/qwant.php 893
-rw-r--r-- scraper/sc.php 75
-rw-r--r-- scraper/wiby.php 2
-rw-r--r-- scraper/yandex.php 10
6 files changed, 1198 insertions, 133 deletions
diff --git a/scraper/marginalia.php b/scraper/marginalia.php
index b790a97..2a2c1e6 100644
--- a/scraper/marginalia.php
+++ b/scraper/marginalia.php
@@ -3,78 +3,103 @@
class marginalia{
public function __construct(){
+ include "lib/fuckhtml.php";
+ $this->fuckhtml = new fuckhtml();
+
include "lib/backend.php";
$this->backend = new backend("marginalia");
}
public function getfilters($page){
- switch($page){
+ if(config::MARGINALIA_API_KEY === null){
- case "web":
- return [
- "profile" => [
- "display" => "Profile",
- "option" => [
- "any" => "Default",
- "modern" => "Modern"
- ]
- ],
- "format" => [
- "display" => "Format",
- "option" => [
- "any" => "Any",
- "html5" => "html5",
- "xhtml" => "xhtml",
- "html123" => "html123"
- ]
- ],
- "file" => [
- "display" => "File",
- "option" => [
- "any" => "Any",
- "nomedia" => "Deny media",
- "media" => "Contains media",
- "audio" => "Contains audio",
- "video" => "Contains video",
- "archive" => "Contains archive",
- "document" => "Contains document"
- ]
- ],
- "javascript" => [
- "display" => "Javascript",
- "option" => [
- "any" => "Allow JS",
- "deny" => "Deny JS",
- "require" => "Require JS"
- ]
- ],
- "trackers" => [
- "display" => "Trackers",
- "option" => [
- "any" => "Allow trackers",
- "deny" => "Deny trackers",
- "require" => "Require trackers"
- ]
- ],
- "cookies" => [
- "display" => "Cookies",
- "option" => [
- "any" => "Allow cookies",
- "deny" => "Deny cookies",
- "require" => "Require cookies"
- ]
- ],
- "affiliate" => [
- "display" => "Affiliate links in body",
- "option" => [
- "any" => "Allow affiliate links",
- "deny" => "Deny affiliate links",
- "require" => "Require affiliate links"
- ]
+ $base = [
+ "adtech" => [
+ "display" => "Reduce adtech",
+ "option" => [
+ "no" => "No",
+ "yes" => "Yes"
]
- ];
+ ],
+ "recent" => [
+ "display" => "Recent results",
+ "option" => [
+ "no" => "No",
+ "yes" => "Yes"
+ ]
+ ],
+ "intitle" => [
+ "display" => "Search in title",
+ "option" => [
+ "no" => "No",
+ "yes" => "Yes"
+ ]
+ ]
+ ];
+ }else{
+
+ $base = [];
}
+
+ return array_merge(
+ $base,
+ [
+ "format" => [
+ "display" => "Format",
+ "option" => [
+ "any" => "Any format",
+ "html5" => "html5",
+ "xhtml" => "xhtml",
+ "html123" => "html123"
+ ]
+ ],
+ "file" => [
+ "display" => "Filetype",
+ "option" => [
+ "any" => "Any filetype",
+ "nomedia" => "Deny media",
+ "media" => "Contains media",
+ "audio" => "Contains audio",
+ "video" => "Contains video",
+ "archive" => "Contains archive",
+ "document" => "Contains document"
+ ]
+ ],
+ "javascript" => [
+ "display" => "Javascript",
+ "option" => [
+ "any" => "Allow JS",
+ "deny" => "Deny JS",
+ "require" => "Require JS"
+ ]
+ ],
+ "trackers" => [
+ "display" => "Trackers",
+ "option" => [
+ "any" => "Allow trackers",
+ "deny" => "Deny trackers",
+ "require" => "Require trackers"
+ ]
+ ],
+ "cookies" => [
+ "display" => "Cookies",
+ "option" => [
+ "any" => "Allow cookies",
+ "deny" => "Deny cookies",
+ "require" => "Require cookies"
+ ]
+ ],
+ "affiliate" => [
+ "display" => "Affiliate links in body",
+ "option" => [
+ "any" => "Allow affiliate links",
+ "deny" => "Deny affiliate links",
+ "require" => "Require affiliate links"
+ ]
+ ]
+ ]
+ );
}
private function get($proxy, $url, $get = []){
@@ -132,7 +157,6 @@ class marginalia{
throw new Exception("Search term is empty!");
}
- $profile = $get["profile"];
$format = $get["format"];
$file = $get["file"];
@@ -180,38 +204,6 @@ class marginalia{
$search = implode(" ", $search);
- $params = [
- "count" => 20
- ];
-
- if($profile == "modern"){
-
- $params["index"] = 1;
- }
-
- try{
- $json =
- $this->get(
- $this->backend->get_ip(), // no nextpage
- "https://api.marginalia.nu/" . config::MARGINALIA_API_KEY . "/search/" . urlencode($search),
- $params
- );
- }catch(Exception $error){
-
- throw new Exception("Failed to get JSON");
- }
-
- if($json == "Slow down"){
-
- throw new Exception("The API key used is rate limited. Please try again in a few minutes.");
- }
-
- $json = json_decode($json, true);
- /*
- $handle = fopen("scraper/marginalia.json", "r");
- $json = json_decode(fread($handle, filesize("scraper/marginalia.json")), true);
- fclose($handle);*/
-
$out = [
"status" => "ok",
"spelling" => [
@@ -228,19 +220,169 @@ class marginalia{
"related" => []
];
- foreach($json["results"] as $result){
+ if(config::MARGINALIA_API_KEY !== null){
+
+ try{
+ $json =
+ $this->get(
+ $this->backend->get_ip(), // no nextpage
+ "https://api.marginalia.nu/" . config::MARGINALIA_API_KEY . "/search/" . urlencode($search),
+ [
+ "count" => 20
+ ]
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to get JSON");
+ }
+
+ if($json == "Slow down"){
+
+ throw new Exception("The API key used is rate limited. Please try again in a few minutes.");
+ }
+
+ $json = json_decode($json, true);
+
+ foreach($json["results"] as $result){
+
+ $out["web"][] = [
+ "title" => $result["title"],
+ "description" => str_replace("\n", " ", $result["description"]),
+ "url" => $result["url"],
+ "date" => null,
+ "type" => "web",
+ "thumb" => [
+ "url" => null,
+ "ratio" => null
+ ],
+ "sublink" => [],
+ "table" => []
+ ];
+ }
+
+ return $out;
+ }
+
+ // no more cloudflare!! Parse html by default
+ $params = [
+ "query" => $search
+ ];
+
+		foreach(["adtech", "recent", "intitle"] as $v){ // translate each enabled HTML-only filter into its query parameter
+
+			if($get[$v] == "yes"){
+
+				switch($v){
+
+					case "adtech": $params["adtech"] = "reduce"; break;
+					case "recent": $params["recent"] = "recent"; break;
+					case "intitle": $params["searchTitle"] = "title"; break; // label was a duplicate "adtech", so this branch could never run
+				}
+			}
+		}
+
+ try{
+ $html =
+ $this->get(
+ $this->backend->get_ip(),
+ "https://search.marginalia.nu/search",
+ $params
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to get HTML");
+ }
+
+ $this->fuckhtml->load($html);
+
+ $sections =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "card search-result",
+ "section"
+ );
+
+ foreach($sections as $section){
+
+ $this->fuckhtml->load($section);
+
+ $title =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "title",
+ "a"
+ )[0];
+
+ $description =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "description",
+ "p"
+ );
+
+ if(count($description) !== 0){
+
+ $description =
+ $this->fuckhtml
+ ->getTextContent(
+ $description[0]
+ );
+ }else{
+
+ $description = null;
+ }
+
+ $sublinks = [];
+ $sublink_html =
+ $this->fuckhtml
+ ->getElementsByClassName("additional-results");
+
+ if(count($sublink_html) !== 0){
+
+ $this->fuckhtml->load($sublink_html[0]);
+
+ $links =
+ $this->fuckhtml
+ ->getElementsByTagName("a");
+
+ foreach($links as $link){
+
+ $sublinks[] = [
+ "title" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $link
+ ),
+ "date" => null,
+ "description" => null,
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $link["attributes"]["href"]
+ )
+ ];
+ }
+ }
$out["web"][] = [
- "title" => $result["title"],
- "description" => str_replace("\n", " ", $result["description"]),
- "url" => $result["url"],
+ "title" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $title
+ ),
+ "description" => $description,
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $title["attributes"]["href"]
+ ),
"date" => null,
"type" => "web",
"thumb" => [
"url" => null,
"ratio" => null
],
- "sublink" => [],
+ "sublink" => $sublinks,
"table" => []
];
}
diff --git a/scraper/pinterest.php b/scraper/pinterest.php
index 37473a1..f3c4439 100644
--- a/scraper/pinterest.php
+++ b/scraper/pinterest.php
@@ -4,11 +4,8 @@ class pinterest{
public function __construct(){
- include "lib/nextpage.php";
- $this->nextpage = new nextpage("pinterest");
-
- include "lib/proxy_pool.php";
- $this->proxy = new proxy_pool("pinterest");
+ include "lib/backend.php";
+ $this->backend = new backend("pinterest");
}
public function getfilters($page){
diff --git a/scraper/qwant.php b/scraper/qwant.php
new file mode 100644
index 0000000..9cc9b9e
--- /dev/null
+++ b/scraper/qwant.php
@@ -0,0 +1,893 @@
+<?php
+
+class qwant{
+
+ public function __construct(){
+
+ include "lib/backend.php";
+ $this->backend = new backend("qwant");
+ }
+
+ public function getfilters($page){
+
+ $base = [
+ "nsfw" => [
+ "display" => "NSFW",
+ "option" => [
+ "yes" => "Yes",
+ "maybe" => "Maybe",
+ "no" => "No"
+ ]
+ ],
+ "country" => [
+ "display" => "Country",
+ "option" => [
+ "en_US" => "United States",
+ "fr_FR" => "France",
+ "en_GB" => "Great Britain",
+ "de_DE" => "Germany",
+ "it_IT" => "Italy",
+ "es_AR" => "Argentina",
+ "en_AU" => "Australia",
+ "es_ES" => "Spain (es)",
+ "ca_ES" => "Spain (ca)",
+ "cs_CZ" => "Czech Republic",
+ "ro_RO" => "Romania",
+ "el_GR" => "Greece",
+ "zh_CN" => "China",
+ "zh_HK" => "Hong Kong",
+ "en_NZ" => "New Zealand",
+ "fr_FR" => "France",
+ "th_TH" => "Thailand",
+ "ko_KR" => "South Korea",
+ "sv_SE" => "Sweden",
+ "nb_NO" => "Norway",
+ "da_DK" => "Denmark",
+ "hu_HU" => "Hungary",
+ "et_EE" => "Estonia",
+ "es_MX" => "Mexico",
+ "es_CL" => "Chile",
+ "en_CA" => "Canada (en)",
+ "fr_CA" => "Canada (fr)",
+ "en_MY" => "Malaysia",
+ "bg_BG" => "Bulgaria",
+ "fi_FI" => "Finland",
+ "pl_PL" => "Poland",
+ "nl_NL" => "Netherlands",
+ "pt_PT" => "Portugal",
+ "de_CH" => "Switzerland (de)",
+ "fr_CH" => "Switzerland (fr)",
+ "it_CH" => "Switzerland (it)",
+ "de_AT" => "Austria",
+ "fr_BE" => "Belgium (fr)",
+ "nl_BE" => "Belgium (nl)",
+ "en_IE" => "Ireland",
+ "he_IL" => "Israel"
+ ]
+ ]
+ ];
+
+ switch($page){
+
+ case "web":
+ $base = array_merge(
+ $base,
+ [
+ "time" => [
+ "display" => "Time posted",
+ "option" => [
+ "any" => "Any time",
+ "day" => "Past 24 hours",
+ "week" => "Past week",
+ "month" => "Past month"
+ ]
+ ],
+ "extendedsearch" => [
+ // no display, wont show in interface
+ "option" => [
+ "yes" => "Yes",
+ "no" => "No"
+ ]
+ ]
+ ]
+ );
+ break;
+
+ case "images":
+ $base = array_merge(
+ $base,
+ [
+ "time" => [
+ "display" => "Time posted",
+ "option" => [
+ "any" => "Any time",
+ "day" => "Past 24 hours",
+ "week" => "Past week",
+ "month" => "Past month"
+ ]
+ ],
+ "size" => [
+ "display" => "Size",
+ "option" => [
+ "any" => "Any size",
+ "large" => "Large",
+ "medium" => "Medium",
+ "small" => "Small"
+ ]
+ ],
+ "color" => [
+ "display" => "Color",
+ "option" => [
+ "any" => "Any color",
+ "coloronly" => "Color only",
+ "monochrome" => "Monochrome",
+ "black" => "Black",
+ "brown" => "Brown",
+ "gray" => "Gray",
+ "white" => "White",
+ "yellow" => "Yellow",
+ "orange" => "Orange",
+ "red" => "Red",
+ "pink" => "Pink",
+ "purple" => "Purple",
+ "blue" => "Blue",
+ "teal" => "Teal",
+ "green" => "Green"
+ ]
+ ],
+ "imagetype" => [
+ "display" => "Type",
+ "option" => [
+ "any" => "Any type",
+ "animatedgif" => "Animated GIF",
+ "photo" => "Photograph",
+ "transparent" => "Transparent"
+ ]
+ ],
+ "license" => [
+ "display" => "License",
+ "option" => [
+ "any" => "Any license",
+ "share" => "Non-commercial reproduction and sharing",
+ "sharecommercially" => "Reproduction and sharing",
+ "modify" => "Non-commercial reproduction, sharing and modification",
+ "modifycommercially" => "Reproduction, sharing and modification",
+ "public" => "Public domain"
+ ]
+ ]
+ ]
+ );
+ break;
+
+ case "videos":
+ $base = array_merge(
+ $base,
+ [
+ "order" => [
+ "display" => "Order by",
+ "option" => [
+ "relevance" => "Relevance",
+ "views" => "Views",
+ "date" => "Most recent",
+ ]
+ ],
+ "source" => [
+ "display" => "Source",
+ "option" => [
+ "any" => "Any source",
+ "youtube" => "YouTube",
+ "dailymotion" => "Dailymotion",
+ ]
+ ]
+ ]
+ );
+ break;
+
+ case "news":
+ $base = array_merge(
+ $base,
+ [
+ "time" => [
+ "display" => "Time posted",
+ "option" => [
+ "any" => "Any time",
+ "hour" => "Less than 1 hour ago",
+ "day" => "Past 24 hours",
+ "week" => "Past week",
+ "month" => "Past month"
+ ]
+ ],
+ "order" => [
+ "display" => "Order by",
+ "option" => [
+ "relevance" => "Relevance",
+ "date" => "Most recent"
+ ]
+ ]
+ ]
+ );
+ break;
+ }
+
+ return $base;
+ }
+
+ private function get($proxy, $url, $get = []){
+
+ $headers = [
+ "User-Agent: " . config::USER_AGENT,
+ "Accept: application/json, text/plain, */*",
+ "Accept-Language: en-US,en;q=0.5",
+ "Accept-Encoding: gzip",
+ "DNT: 1",
+ "Connection: keep-alive",
+ "Origin: https://www.qwant.com",
+ "Referer: https://www.qwant.com/",
+ "Sec-Fetch-Dest: empty",
+ "Sec-Fetch-Mode: cors",
+ "Sec-Fetch-Site: same-site",
+ "TE: trailers"
+ ];
+
+ $curlproc = curl_init();
+
+ if($get !== []){
+ $get = http_build_query($get);
+ $url .= "?" . $get;
+ }
+
+ curl_setopt($curlproc, CURLOPT_URL, $url);
+
+ curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
+ curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
+
+ // Bypass HTTP/2 check
+ curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+
+ curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
+ curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
+ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
+ curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
+ curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+
+ $this->backend->assign_proxy($curlproc, $proxy);
+
+ $data = curl_exec($curlproc);
+
+ if(curl_errno($curlproc)){
+ throw new Exception(curl_error($curlproc));
+ }
+
+ curl_close($curlproc);
+ return $data;
+ }
+
+ public function web($get){
+
+ if($get["npt"]){
+
+ // get next page data
+ [$params, $proxy] = $this->backend->get($get["npt"], "web");
+
+ $params = json_decode($params, true);
+
+ }else{
+
+ // get _GET data instead
+ $search = $get["s"];
+ if(strlen($search) === 0){
+
+ throw new Exception("Search term is empty!");
+ }
+
+ if(strlen($search) > 2048){
+
+ throw new Exception("Search term is too long!");
+ }
+
+ $proxy = $this->backend->get_ip();
+
+ $params = [
+ "q" => $search,
+ "freshness" => $get["time"],
+ "count" => 10,
+ "locale" => $get["country"],
+ "offset" => 0,
+ "device" => "desktop",
+ "tgp" => 3,
+ "safesearch" => 0,
+ "displayed" => "true"
+ ];
+
+ switch($get["nsfw"]){
+
+ case "yes": $params["safesearch"] = 0; break;
+ case "maybe": $params["safesearch"] = 1; break;
+ case "no": $params["safesearch"] = 2; break;
+ }
+ }
+ /*
+ $handle = fopen("scraper/qwant_web.json", "r");
+ $json = fread($handle, filesize("scraper/qwant_web.json"));
+ fclose($handle);*/
+
+ try{
+ $json =
+ $this->get(
+ $proxy,
+ "https://fdn.qwant.com/v3/search/web",
+ $params
+ );
+
+ }catch(Exception $error){
+
+ throw new Exception("Could not fetch JSON");
+ }
+
+ $json = json_decode($json, true);
+
+ if($json === NULL){
+
+ throw new Exception("Failed to decode JSON");
+ }
+
+		// build the empty response skeleton BEFORE validating: the error_code 5 branch below returns it untouched
+		$out = [
+			"status" => "ok",
+			"spelling" => [
+				"type" => "no_correction",
+				"using" => null,
+				"correction" => null
+			],
+			"npt" => null,
+			"answer" => [],
+			"web" => [],
+			"image" => [],
+			"video" => [],
+			"news" => [],
+			"related" => []
+		];
+
+		if(isset($json["data"]["message"][0])){
+
+			throw new Exception("Server returned an error:\n" . $json["data"]["message"][0]);
+		}
+
+		if($json["status"] != "success"){
+
+			if($json["data"]["error_code"] === 5){ // treated as non-fatal: hand back the empty skeleton
+
+				return $out;
+			}
+
+			throw new Exception("Server returned an error code: " . $json["data"]["error_code"]);
+		}
+
+		if(!isset($json["data"]["result"]["items"]["mainline"])){
+
+			throw new Exception("Server did not return a result object");
+		}
+
+ // get instant answer
+ if(
+ $get["extendedsearch"] == "yes" &&
+ isset($json["data"]["result"]["items"]["sidebar"][0]["endpoint"])
+ ){
+
+ try{
+ $answer =
+ $this->get(
+ $proxy,
+ "https://api.qwant.com/v3" .
+ $json["data"]["result"]["items"]["sidebar"][0]["endpoint"],
+ []
+ );
+
+ $answer = json_decode($answer, true);
+
+ if(
+ $answer === null ||
+ $answer["status"] != "success" ||
+ $answer["data"]["result"] === null
+ ){
+
+ throw new Exception();
+ }
+
+ // parse answer
+ $out["answer"][] = [
+ "title" => $answer["data"]["result"]["title"],
+ "description" => [
+ [
+ "type" => "text",
+ "value" => $this->trimdots($answer["data"]["result"]["description"])
+ ]
+ ],
+ "url" => $answer["data"]["result"]["url"],
+ "thumb" =>
+ $answer["data"]["result"]["thumbnail"]["landscape"] == null ?
+ null :
+ $this->unshitimage(
+ $answer["data"]["result"]["thumbnail"]["landscape"],
+ false
+ ),
+ "table" => [],
+ "sublink" => []
+ ];
+
+ }catch(Exception $error){
+
+ // do nothing in case of failure
+ }
+
+ }
+
+ // get word correction
+ if(isset($json["data"]["query"]["queryContext"]["alteredQuery"])){
+
+ $out["spelling"] = [
+ "type" => "including",
+ "using" => $json["data"]["query"]["queryContext"]["alteredQuery"],
+ "correction" => $json["data"]["query"]["queryContext"]["alterationOverrideQuery"]
+ ];
+ }
+
+ // check for next page
+ if($json["data"]["result"]["lastPage"] === false){
+
+ $params["offset"] = $params["offset"] + 10;
+
+ $out["npt"] =
+ $this->backend->store(
+ json_encode($params),
+ "web",
+ $proxy
+ );
+ }
+
+ // parse results
+ foreach($json["data"]["result"]["items"]["mainline"] as $item){
+
+ switch($item["type"]){ // ignores ads
+
+ case "web":
+ foreach($item["items"] as $result){
+
+ if(isset($result["thumbnailUrl"])){
+
+ $thumb = [
+ "url" => $this->unshitimage($result["thumbnailUrl"]),
+ "ratio" => "16:9"
+ ];
+ }else{
+
+ $thumb = [
+ "url" => null,
+ "ratio" => null
+ ];
+ }
+
+ $sublinks = [];
+ if(isset($result["links"])){
+
+ foreach($result["links"] as $link){
+
+ $sublinks[] = [
+ "title" => $this->trimdots($link["title"]),
+ "date" => null,
+ "description" => isset($link["desc"]) ? $this->trimdots($link["desc"]) : null,
+ "url" => $link["url"]
+ ];
+ }
+ }
+
+ $out["web"][] = [
+ "title" => $this->trimdots($result["title"]),
+ "description" => $this->trimdots($result["desc"]),
+ "url" => $result["url"],
+ "date" => null,
+ "type" => "web",
+ "thumb" => $thumb,
+ "sublink" => $sublinks,
+ "table" => []
+ ];
+ }
+ break;
+
+ case "images":
+ foreach($item["items"] as $image){
+
+ $out["image"][] = [
+ "title" => $image["title"],
+ "source" => [
+ [
+ "url" => $image["media"],
+ "width" => (int)$image["width"],
+ "height" => (int)$image["height"]
+ ],
+ [
+ "url" => $this->unshitimage($image["thumbnail"]),
+ "width" => $image["thumb_width"],
+ "height" => $image["thumb_height"]
+ ]
+ ],
+ "url" => $image["url"]
+ ];
+ }
+ break;
+
+ case "videos":
+ foreach($item["items"] as $video){
+
+ $out["video"][] = [
+ "title" => $video["title"],
+ "description" => null,
+ "date" => (int)$video["date"],
+ "duration" => $video["duration"] === null ? null : $video["duration"] / 1000,
+ "views" => null,
+ "thumb" =>
+ $video["thumbnail"] === null ?
+ [
+ "url" => null,
+ "ratio" => null,
+ ] :
+ [
+ "url" => $this->unshitimage($video["thumbnail"]),
+ "ratio" => "16:9",
+ ],
+ "url" => $video["url"]
+ ];
+ }
+ break;
+
+ case "related_searches":
+ foreach($item["items"] as $related){
+
+ $out["related"][] = $related["text"];
+ }
+ break;
+ }
+ }
+
+ return $out;
+ }
+
+
+ public function image($get){
+
+ if($get["npt"]){
+
+ [$params, $proxy] =
+ $this->backend->get(
+ $get["npt"],
+ "images"
+ );
+
+ $params = json_decode($params, true);
+ }else{
+
+ $search = $get["s"];
+
+ if(strlen($search) === 0){
+
+ throw new Exception("Search term is empty!");
+ }
+
+ $proxy = $this->backend->get_ip();
+
+ $params = [
+ "t" => "images",
+ "q" => $search,
+ "count" => 125,
+ "locale" => $get["country"],
+ "offset" => 0, // increment by 125
+ "device" => "desktop",
+ "tgp" => 3
+ ];
+
+ if($get["time"] != "any"){
+
+ $params["freshness"] = $get["time"];
+ }
+
+ foreach(["size", "color", "imagetype", "license"] as $p){
+
+ if($get[$p] != "any"){
+
+ $params[$p] = $get[$p];
+ }
+ }
+
+ switch($get["nsfw"]){
+
+ case "yes": $params["safesearch"] = 0; break;
+ case "maybe": $params["safesearch"] = 1; break;
+ case "no": $params["safesearch"] = 2; break;
+ }
+ }
+
+ try{
+ $json = $this->get(
+ $proxy,
+ "https://api.qwant.com/v3/search/images",
+ $params,
+ );
+ }catch(Exception $err){
+
+ throw new Exception("Failed to get JSON");
+ }
+
+ /*
+ $handle = fopen("scraper/yandex.json", "r");
+ $json = fread($handle, filesize("scraper/yandex.json"));
+ fclose($handle);*/
+
+ $json = json_decode($json, true);
+
+ if($json === null){
+
+ throw new Exception("Failed to decode JSON");
+ }
+
+ if($json["status"] != "success"){
+
+ throw new Exception("Qwant returned an API error");
+ }
+
+ $out = [
+ "status" => "ok",
+ "npt" => null,
+ "image" => []
+ ];
+
+ if($json["data"]["result"]["lastPage"] === false){
+
+ $params["offset"] = $params["offset"] + 125;
+
+ $out["npt"] = $this->backend->store(
+ json_encode($params),
+ "images",
+ $proxy
+ );
+ }
+
+ foreach($json["data"]["result"]["items"] as $image){
+
+ $out["image"][] = [
+ "title" => $this->trimdots($image["title"]),
+ "source" => [
+ [
+ "url" => $image["media"],
+ "width" => $image["width"],
+ "height" => $image["height"]
+ ],
+ [
+ "url" => $this->unshitimage($image["thumbnail"]),
+ "width" => $image["thumb_width"],
+ "height" => $image["thumb_height"]
+ ]
+ ],
+ "url" => $image["url"]
+ ];
+ }
+
+ return $out;
+ }
+
+ public function video($get){
+
+ $search = $get["s"];
+ if(strlen($search) === 0){
+
+ throw new Exception("Search term is empty!");
+ }
+
+ $params = [
+ "t" => "videos",
+ "q" => $search,
+ "count" => 50,
+ "locale" => $get["country"],
+ "offset" => 0, // dont implement pagination
+ "device" => "desktop",
+ "tgp" => 3
+ ];
+
+ switch($get["nsfw"]){
+
+ case "yes": $params["safesearch"] = 0; break;
+ case "maybe": $params["safesearch"] = 1; break;
+ case "no": $params["safesearch"] = 2; break;
+ }
+
+ try{
+ $json =
+ $this->get(
+ $this->backend->get_ip(),
+ "https://api.qwant.com/v3/search/videos",
+ $params
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Could not fetch JSON");
+ }
+
+ /*
+ $handle = fopen("scraper/yandex-video.json", "r");
+ $json = fread($handle, filesize("scraper/yandex-video.json"));
+ fclose($handle);
+ */
+
+ $json = json_decode($json, true);
+
+ if($json === null){
+
+ throw new Exception("Could not parse JSON");
+ }
+
+ if($json["status"] != "success"){
+
+ throw new Exception("Qwant returned an API error");
+ }
+
+ $out = [
+ "status" => "ok",
+ "npt" => null,
+ "video" => [],
+ "author" => [],
+ "livestream" => [],
+ "playlist" => [],
+ "reel" => []
+ ];
+
+ foreach($json["data"]["result"]["items"] as $video){
+
+ if(empty($video["thumbnail"])){
+
+ $thumb = [
+ "url" => null,
+ "ratio" => null
+ ];
+ }else{
+
+ $thumb = [
+ "url" => $this->unshitimage($video["thumbnail"], false),
+ "ratio" => "16:9"
+ ];
+ }
+
+ $duration = (int)$video["duration"];
+
+ $out["video"][] = [
+ "title" => $video["title"],
+ "description" => $this->limitstrlen($video["desc"]),
+ "author" => [
+ "name" => $video["channel"],
+ "url" => null,
+ "avatar" => null
+ ],
+ "date" => (int)$video["date"],
+ "duration" => $duration === 0 ? null : $duration,
+ "views" => null,
+ "thumb" => $thumb,
+ "url" => preg_replace("/\?syndication=.+/", "", $video["url"])
+ ];
+ }
+
+ return $out;
+ }
+
+ public function news($get){
+
+ $search = $get["s"];
+ if(strlen($search) === 0){
+
+ throw new Exception("Search term is empty!");
+ }
+
+ $params = [
+ "t" => "news",
+ "q" => $search,
+ "count" => 50,
+ "locale" => $get["country"],
+ "offset" => 0, // dont implement pagination
+ "device" => "desktop",
+ "tgp" => 3
+ ];
+
+ switch($get["nsfw"]){
+
+ case "yes": $params["safesearch"] = 0; break;
+ case "maybe": $params["safesearch"] = 1; break;
+ case "no": $params["safesearch"] = 2; break;
+ }
+
+ try{
+ $json =
+ $this->get(
+ $this->backend->get_ip(),
+ "https://api.qwant.com/v3/search/news",
+ $params
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Could not fetch JSON");
+ }
+
+ /*
+ $handle = fopen("scraper/yandex-video.json", "r");
+ $json = fread($handle, filesize("scraper/yandex-video.json"));
+ fclose($handle);
+ */
+
+ $json = json_decode($json, true);
+
+ if($json === null){
+
+ throw new Exception("Could not parse JSON");
+ }
+
+ if($json["status"] != "success"){
+
+ throw new Exception("Qwant returned an API error");
+ }
+
+ $out = [
+ "status" => "ok",
+ "npt" => null,
+ "news" => []
+ ];
+
+ foreach($json["data"]["result"]["items"] as $news){
+
+ if(empty($news["media"][0]["pict_big"]["url"])){
+
+ $thumb = [
+ "url" => null,
+ "ratio" => null
+ ];
+ }else{
+
+ $thumb = [
+ "url" => $this->unshitimage($news["media"][0]["pict_big"]["url"], false),
+ "ratio" => "16:9"
+ ];
+ }
+
+ $out["news"][] = [
+ "title" => $news["title"],
+ "author" => $news["press_name"],
+ "description" => $this->trimdots($news["desc"]),
+ "date" => (int)$news["date"],
+ "thumb" => $thumb,
+ "url" => $news["url"]
+ ];
+ }
+
+ return $out;
+ }
+
+ private function limitstrlen($text){
+
+ return explode("\n", wordwrap($text, 300, "\n"))[0];
+ }
+
+ private function trimdots($text){
+
+ return trim($text, ". ");
+ }
+
+ private function unshitimage($url, $is_bing = true){
+
+ // https://s1.qwant.com/thumbr/0x0/8/d/f6de4deb2c2b12f55d8bdcaae576f9f62fd58a05ec0feeac117b354d1bf5c2/th.jpg?u=https%3A%2F%2Fwww.bing.com%2Fth%3Fid%3DOIP.vvDWsagzxjoKKP_rOqhwrQAAAA%26w%3D160%26h%3D160%26c%3D7%26pid%3D5.1&q=0&b=1&p=0&a=0
+ parse_str(parse_url($url)["query"], $parts);
+
+ if($is_bing){
+ $parse = parse_url($parts["u"]);
+ parse_str($parse["query"], $parts);
+
+ return "https://" . $parse["host"] . "/th?id=" . urlencode($parts["id"]);
+ }
+
+ return $parts["u"];
+ }
+}
diff --git a/scraper/sc.php b/scraper/sc.php
index 23742f1..e2e7385 100644
--- a/scraper/sc.php
+++ b/scraper/sc.php
@@ -70,7 +70,7 @@ class sc{
return $data;
}
- public function music($get){
+ public function music($get, $last_attempt = false){
if($get["npt"]){
@@ -108,6 +108,7 @@ class sc{
$type = $get["type"];
$proxy = $this->backend->get_ip();
+ $token = $this->get_token($proxy);
switch($type){
@@ -117,12 +118,11 @@ class sc{
"q" => $search,
"variant_ids" => "",
"facet" => "model",
- "user_id" => config::SC_USER_ID,
- "client_id" => config::SC_CLIENT_TOKEN,
+ "client_id" => $token,
"limit" => 20,
"offset" => 0,
"linked_partitioning" => 1,
- "app_version" => 1696577813,
+ "app_version" => 1713542117,
"app_locale" => "en"
];
break;
@@ -133,12 +133,11 @@ class sc{
"q" => $search,
"variant_ids" => "",
"facet_genre" => "",
- "user_id" => config::SC_USER_ID,
- "client_id" => config::SC_CLIENT_TOKEN,
+ "client_id" => $token,
"limit" => 20,
"offset" => 0,
"linked_partitioning" => 1,
- "app_version" => 1696577813,
+ "app_version" => 1713542117,
"app_locale" => "en"
];
break;
@@ -149,12 +148,11 @@ class sc{
"q" => $search,
"variant_ids" => "",
"facet" => "place",
- "user_id" => config::SC_USER_ID,
- "client_id" => config::SC_CLIENT_TOKEN,
+ "client_id" => $token,
"limit" => 20,
"offset" => 0,
"linked_partitioning" => 1,
- "app_version" => 1696577813,
+ "app_version" => 1713542117,
"app_locale" => "en"
];
break;
@@ -165,12 +163,11 @@ class sc{
"q" => $search,
"variant_ids" => "",
"facet" => "genre",
- "user_id" => config::SC_USER_ID,
- "client_id" => config::SC_CLIENT_TOKEN,
+ "client_id" => $token,
"limit" => 20,
"offset" => 0,
"linked_partitioning" => 1,
- "app_version" => 1696577813,
+ "app_version" => 1713542117,
"app_locale" => "en"
];
break;
@@ -181,12 +178,11 @@ class sc{
"q" => $search,
"variant_ids" => "",
"facet" => "genre",
- "user_id" => config::SC_USER_ID,
- "client_id" => config::SC_CLIENT_TOKEN,
+ "client_id" => $token,
"limit" => 20,
"offset" => 0,
"linked_partitioning" => 1,
- "app_version" => 1696577813,
+ "app_version" => 1713542117,
"app_locale" => "en"
];
break;
@@ -198,12 +194,11 @@ class sc{
"variant_ids" => "",
"filter.content_tier" => "SUB_HIGH_TIER",
"facet" => "genre",
- "user_id" => config::SC_USER_ID,
- "client_id" => config::SC_CLIENT_TOKEN,
+ "client_id" => $token,
"limit" => 20,
"offset" => 0,
"linked_partitioning" => 1,
- "app_version" => 1696577813,
+ "app_version" => 1713542117,
"app_locale" => "en"
];
break;
@@ -229,7 +224,14 @@ class sc{
if($json === null){
- throw new Exception("Failed to decode JSON. Did the keys set in data/config.php expire?");
+			if($last_attempt === true){
+
+				throw new Exception("Fetched an invalid token (please report!!)");
+			}
+
+			// token might've expired: drop the cached "sc_token" so the retry's get_token() call fetches a fresh one
+			apcu_delete("sc_token");
+			return $this->music($get, true);
}
$out = [
@@ -352,7 +354,7 @@ class sc{
"endpoint" => "sc",
"url" =>
$item["media"]["transcodings"][0]["url"] .
- "?client_id=" . config::SC_CLIENT_TOKEN .
+ "?client_id=" . $token .
"&track_authorization=" .
$item["track_authorization"]
];
@@ -390,6 +392,37 @@ class sc{
return $out;
}
+ public function get_token($proxy){
+
+ $token = apcu_fetch("sc_token");
+
+ if($token === false){
+
+ $js =
+ $this->get(
+ $proxy,
+ "https://a-v2.sndcdn.com/assets/1-c3e4038d.js",
+ []
+ );
+
+ preg_match(
+ '/client_id=([^"]+)/',
+ $js,
+ $token
+ );
+
+ if(!isset($token[1])){
+
+ throw new Exception("Failed to get search token");
+ }
+
+ apcu_store("sc_token", $token[1]);
+ return $token[1];
+ }
+
+ return $token;
+ }
+
private function limitstrlen($text){
return
diff --git a/scraper/wiby.php b/scraper/wiby.php
index 2d79c56..59f723c 100644
--- a/scraper/wiby.php
+++ b/scraper/wiby.php
@@ -209,7 +209,7 @@ class wiby{
$out["web"][] = [
"title" => $this->unescapehtml(trim($links[2][$i])),
- "description" => $this->unescapehtml(trim(strip_tags($links[3][$i]))),
+ "description" => $this->unescapehtml(trim(strip_tags($links[3][$i]), ".\n\r ")),
"url" => trim($links[1][$i]),
"date" => null,
"type" => "web",
diff --git a/scraper/yandex.php b/scraper/yandex.php
index 9b73428..2e81cee 100644
--- a/scraper/yandex.php
+++ b/scraper/yandex.php
@@ -644,6 +644,11 @@ class yandex{
$json = json_decode($json, true);
+ if($json === null){
+
+ throw new Exception("Failed to decode JSON");
+ }
+
if(
isset($json["type"]) &&
$json["type"] == "captcha"
@@ -652,11 +657,6 @@ class yandex{
throw new Exception("Yandex blocked this 4get instance. Please try again in ~7 minutes.");
}
- if($json === null){
-
- throw new Exception("Failed to decode JSON");
- }
-
$out = [
"status" => "ok",
"npt" => null,