summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: lolcat <will@lolcat.ca> 2023-08-13 23:35:08 -0400
committer: lolcat <will@lolcat.ca> 2023-08-13 23:35:08 -0400
commit: b57b2d71180d4f833d34d11b8de251db9a17e7d6 (patch)
tree: c6e219fa470dfbc94abe560efb5a44fd676ca631
parent: 6f896221f0068fa9c4718a7bbc1b596ba73b9550 (diff)
ill need to rewrite the google scraper, but hey atleast we got imagesearch working woooo
-rw-r--r-- README.md 6
-rw-r--r-- about.php 8
-rw-r--r-- lib/frontend.php 8
-rw-r--r-- proxy.php 10
-rw-r--r-- scraper/brave.php 2
-rw-r--r-- scraper/google.php 1755
-rw-r--r-- settings.php 8
7 files changed, 357 insertions, 1440 deletions
diff --git a/README.md b/README.md
index 008a1c5..6a88308 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@ https://4get.ca
1. Web
- DuckDuckGo
- Brave
+ - Google
- Mojeek
- Marginalia
- wiby
@@ -18,6 +19,7 @@ https://4get.ca
2. Images
- DuckDuckGo
- Yandex
+ - Google
- Brave
3. Videos
@@ -25,13 +27,15 @@ https://4get.ca
- Facebook videos
- DuckDuckgo
- Brave
+ - Google
4. News
- DuckDuckGo
- Brave
+ - Google
- Mojeek
-More scrapers are coming soon. I currently want to add Google, Hackernews, Qwant and find a way to scrape Yandex web without those fucking captchas. A shopping, music and files tab is also in my todo list.
+More scrapers are coming soon. I currently want to add Hackernews, Qwant and find a way to scrape Yandex web without those fucking captchas. A shopping, music and files tab is also in my todo list.
# Setup
This section is still to-do. You will need to figure shit out for some of the apache2 stuff. Everything else should be OK.
diff --git a/about.php b/about.php
index 0051846..2b3d316 100644
--- a/about.php
+++ b/about.php
@@ -88,8 +88,12 @@ $left =
<td>Address</td>
</tr>
<tr>
- <td>4get</td>
- <td><a href="https://4get.ca">4get.ca</a><a href="http://4getwebfrq5zr4sxugk6htxvawqehxtdgjrbcn2oslllcol2vepa23yd.onion/">(tor)</a></td>
+ <td>lolcat\'s instance (master)</td>
+ <td><a href="https://4get.ca">4get.ca</a><a href="http://4getwebfrq5zr4sxugk6htxvawqehxtdgjrbcn2oslllcol2vepa23yd.onion">(tor)</a></td>
+ </tr>
+ <tr>
+ <td>zzls\'s instance</td>
+ <td><a href="https://4get.zzls.xyz/">4get.zzls.xyz</a><a href="http://4get.zzlsghu6mvvwyy75mvga6gaf4znbp3erk5xwfzedb4gg6qqh2j6rlvid.onion">(tor)</a></td>
</tr>
</table>
diff --git a/lib/frontend.php b/lib/frontend.php
index a127989..74c65d6 100644
--- a/lib/frontend.php
+++ b/lib/frontend.php
@@ -877,8 +877,8 @@ class frontend{
"display" => "Scraper",
"option" => [
"ddg" => "DuckDuckGo",
- "brave" => "Brave",
- //"google" => "Google",
+ //"brave" => "Brave",
+ "google" => "Google",
"mojeek" => "Mojeek",
"marginalia" => "Marginalia",
"wiby" => "wiby"
@@ -892,8 +892,8 @@ class frontend{
"option" => [
"ddg" => "DuckDuckGo",
"yandex" => "Yandex",
- "brave" => "Brave"//,
- //"google" => "Google"
+ "brave" => "Brave",
+ "google" => "Google"
]
];
break;
diff --git a/proxy.php b/proxy.php
index bcf552e..b49fafd 100644
--- a/proxy.php
+++ b/proxy.php
@@ -32,11 +32,11 @@ try{
switch($_GET["s"]){
- case "portrait": $req = "&w=50&h=90&p=0&qlt=99"; break;
- case "landscape": $req = "&w=160&h=90&p=0&qlt=99"; break;
- case "square": $req = "&w=90&h=90&p=0&qlt=99"; break;
- case "thumb": $req = "&w=236&h=180&p=0&qlt=99"; break;
- case "cover": $req = "&w=207&h=270&p=0&qlt=99"; break;
+ case "portrait": $req = "&w=50&h=90&p=0&qlt=90"; break;
+ case "landscape": $req = "&w=160&h=90&p=0&qlt=90"; break;
+ case "square": $req = "&w=90&h=90&p=0&qlt=90"; break;
+ case "thumb": $req = "&w=236&h=180&p=0&qlt=90"; break;
+ case "cover": $req = "&w=207&h=270&p=0&qlt=90"; break;
}
$proxy->stream_linear_image($_GET["i"] . $req, "https://bing.net");
diff --git a/scraper/brave.php b/scraper/brave.php
index bcec59e..50e7b49 100644
--- a/scraper/brave.php
+++ b/scraper/brave.php
@@ -1982,8 +1982,6 @@ class brave{
as $result
){
- print_r($result);
-
$out["image"][] = [
"title" => $result["title"],
"source" => [
diff --git a/scraper/google.php b/scraper/google.php
index af243ba..7ed3577 100644
--- a/scraper/google.php
+++ b/scraper/google.php
@@ -19,6 +19,8 @@ class google{
switch($page){
case "web":
+ case "videos":
+ case "news":
return [
"country" => [ // gl=<country>
"display" => "Country",
@@ -619,37 +621,60 @@ class google{
"zh-TW" => "Chinese (Traditional)"
]
],
- "newer" => [ // &sort=review-date:r:20090301:20090430
- "display" => "Newer than",
- "option" => "_DATE"
- ],
- "older" => [
- "display" => "Older than",
- "option" => "_DATE"
+ "time" => [ // tbs=qrd:<size>
+ "display" => "Time posted",
+ "option" => [
+ "any" => "Any time",
+ "d" => "Past 24 hours",
+ "w" => "Past week",
+ "m" => "Past month",
+ "y" => "Past year"
+ ]
],
- "size" => [ // tbs=isz:<size>
+ "size" => [
"display" => "Size",
"option" => [
+ // tbs=isz:<size>
"any" => "Any size",
"l" => "Large",
"m" => "Medium",
- "i" => "Icon"
+ "i" => "Icon",
+ // from here
+ // tbz:lt,islt:<size>
+ "qsvga" => "Larger than 400x300",
+ "vga" => "Larger than 640x480",
+ "svga" => "Larger than 800x600", // distinct key: a second "qsvga" would replace the 400x300 entry above
+ "xga" => "Larger than 1024x768",
+ "2mp" => "Larger than 2MP",
+ "4mp" => "Larger than 4MP",
+ "6mp" => "Larger than 6MP",
+ "8mp" => "Larger than 8MP",
+ "10mp" => "Larger than 10MP",
+ "12mp" => "Larger than 12MP",
+ "15mp" => "Larger than 15MP",
+ "20mp" => "Larger than 20MP",
+ "40mp" => "Larger than 40MP",
+ "70mp" => "Larger than 70MP"
]
],
- "colortype" => [ // imgColorType=<color>
- "display" => "Color type",
+ "ratio" => [ // tbs=iar:<size>
+ "display" => "Aspect ratio",
"option" => [
- "any" => "Any color type",
- "color" => "Colored",
- "gray" => "Gray",
- "mono" => "Black & white",
- "trans" => "Transparent"
+ "any" => "Any ratio",
+ "t" => "Tall",
+ "s" => "Square",
+ "w" => "Wide",
+ "xw" => "Panoramic"
]
],
- "color" => [ // imgDominantColor=<color>
+ "color" => [ // tbs=ic:<color>
"display" => "Color",
"option" => [
"any" => "Any color",
+ "color" => "Full color",
+ "gray" => "Black & white",
+ "trans" => "Transparent",
+ // from there, its ic:specific,isc:<color>
"red" => "Red",
"orange" => "Orange",
"yellow" => "Yellow",
@@ -664,7 +689,7 @@ class google{
"brown" => "Brown"
]
],
- "type" => [ // imgType=<type>
+ "type" => [ // tbs=itp:<type>
"display" => "Type",
"option" => [
"any" => "Any type",
@@ -675,10 +700,24 @@ class google{
"animated" => "Animated"
]
],
+ "format" => [ // tbs=ift:<format>
+ "display" => "Format",
+ "option" => [
+ "any" => "Any format",
+ "jpg" => "JPG",
+ "gif" => "GIF",
+ "png" => "PNG",
+ "bmp" => "BMP",
+ "svg" => "SVG",
+ "webp" => "WEBP",
+ "ico" => "ICO",
+ "craw" => "RAW"
+ ]
+ ],
"rights" => [ // tbs=il:<rights>
"display" => "Usage rights",
"option" => [
- "any" => "No license",
+ "any" => "Any license",
"cl" => "Creative Commons licenses",
"ol" => "Commercial & other licenses"
]
@@ -802,1402 +841,148 @@ class google{
"news" => [],
"related" => []
];
+ }
+
+
+ public function image($get){
- $styles =
- $this->fuckhtml
- ->getElementsByTagName("style");
-
- $this->computedstyle = [];
- $this->ask = [];
-
- foreach($styles as $style){
-
- $this->computedstyle =
- array_merge(
- $this->computedstyle,
- $this->parsestyles($style["innerHTML"])
- );
- }
-
- // get images in javascript var
- preg_match(
- '/google\.ldi=({[^}]+})/',
- $html,
- $this->js_image
- );
-
- if(count($this->js_image) !== 0){
+ // generate parameters
+ if($get["npt"]){
- $this->js_image = json_decode($this->js_image[1], true);
+ $params =
+ json_decode(
+ $this->nextpage->get(
+ $get["npt"],
+ "images"
+ ),
+ true
+ );
}else{
- $this->js_image = [];
- }
-
- // additional js_images present in <script> tags
- // ugh i fucking hate you
- $scripts =
- $this->fuckhtml
- ->getElementsByTagName("script");
-
- foreach($scripts as $script){
+ $search = $get["s"];
+ $country = $get["country"];
+ $nsfw = $get["nsfw"];
+ $lang = $get["lang"];
+ $time = $get["time"];
+ $size = $get["size"];
+ $ratio = $get["ratio"];
+ $color = $get["color"];
+ $type = $get["type"];
+ $format = $get["format"];
+ $rights = $get["rights"];
- if(!isset($script["innerHTML"])){
-
- continue;
- }
-
- preg_match_all(
- '/var s=\'(data:image[^\']+)\';var i=\[\'([^\']+)\'];/',
- $script["innerHTML"],
- $image_grep
- );
+ $params = [
+ "q" => $search,
+ "tbm" => "isch"
+ ];
- if(count($image_grep[0]) !== 0){
+ // country
+ if($country != "any"){
- $this->js_image[trim($image_grep[2][0])] =
- $this->fuckhtml
- ->getTextContent(
- $image_grep[1][0]
- );
+ $params["gl"] = $country;
}
- // even more javascript crap
- // "People also ask" node is loaded trough javascript
- preg_match_all(
- '/window\.jsl\.dh\(\'([^\']+)\',\'(.+)\'\);/',
- $script["innerHTML"],
- $ask_grep
- );
+ // nsfw
+ $params["safe"] = $nsfw == "yes" ? "off" : "active";
- for($i=0; $i<count($ask_grep[0]); $i++){
+ // language
+ if($lang != "any"){
- $this->ask[trim($ask_grep[1][$i])] =
- stripcslashes(
- $ask_grep[2][$i]
- );
+ $params["lr"] = "lang_" . $lang;
}
- }
-
- // get nodes
- // fuck you google!!!!!!!!!!!!!!
-
- $containers =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->findstyles(
- [
- "background-color" => "#fff",
- "margin-bottom" => "10px",
- "-webkit-box-shadow" => "0 1px 6px rgba(32,33,36,0.28)",
- "border-radius" => "8px"
- ],
- self::is_class
- ),
- "div"
- );
-
- foreach($containers as $container){
- $this->fuckhtml->load($container);
+ $tbs = [];
- // get link at the top
- $link =
- $this->fuckhtml
- ->getElementsByTagName(
- "a"
- );
-
- if(count($link) !== 0){
+ // time
+ if($time != "any"){
- $link =
- $this->decodeurl(
- $link
- [0]
- ["attributes"]
- ["href"]
- );
+ $tbs[] = "qdr:" . $time; // time-range token is qdr:<d|w|m|y>; the misspelt "qrd:" would not filter anything
}
- /*
- Check for carousel presence
- */
- $carousel =
- $this->fuckhtml
- ->getElementsByClassName(
- "pcitem",
- "div"
- );
-
- $title =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->findstyles(
- [
- "color" => "#1967d2",
- "font-size" => "20px",
- "line-height" => "26px"
- ],
- self::is_class
- ),
- "div"
- );
-
- if(count($carousel) !== 0){
-
- $carousel_title =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->findstyles(
- [
- "font-size" => "16px",
- "line-height" => "20px",
- "font-weight" => "400"
- ],
- self::is_class
- ),
- "div"
- );
+ // size
+ if($size != "any"){
- $sublink = []; // twitter carousel sublinks
- foreach($carousel as $item){
-
- $this->fuckhtml->load($item);
-
- $url =
- $this->decodeurl(
- $this->fuckhtml
- ->getElementsByTagName(
- "a"
- )[0]
- ["attributes"]
- ["href"]
- );
-
- // detect if its a twitter carousel or
- // a list of news articles
-
- $grey_node =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->findstyles(
- [
- "white-space" => "pre-line",
- "word-wrap" => "break-word"
- ],
- self::is_class
- ),
- "div"
- );
+ if(
+ in_array(
+ $size,
+ ["l", "m", "i"] // the isz: values the size filter offers (Large, Medium, Icon); everything else goes through islt:
+ )
+ ){
- if(count($carousel_title) !== 0){
-
- switch(
- strtolower(
- $this->fuckhtml
- ->getTextContent(
- $carousel_title[0]
- )
- )
- ){
-
- case "top stories":
- $img =
- $this->fuckhtml
- ->getElementsByTagName("img");
-
- if(
- count($img) !== 0 &&
- isset($img[0]["attributes"]["id"]) &&
- isset($this->js_image[$img[0]["attributes"]["id"]])
- ){
-
- $img = [
- "url" => $this->getimage($img[0]["attributes"]["id"]),
- "ratio" => "16:9"
- ];
- }else{
-
- $img = [
- "url" => null,
- "ratio" => null
- ];
- }
-
- /*
- Is a news node
- */
- $out["news"][] = [
- "title" =>
- $this->fuckhtml
- ->getTextContent(
- $grey_node[0]
- ),
- "description" => null,
- "date" =>
- strtotime(
- explode(
- "\n",
- $grey_node[1]["innerHTML"]
- )[1]
- ),
- "thumb" => $img,
- "url" => $url
- ];
- break;
-
- case "images":
-
- /*
- We found an image
- */
- $imagedata =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->findstyles(
- [
- "display" => "block",
- "background-color" => "#fff",
- "border-radius" => "8px",
- "-webkit-box-shadow" => "0 1px 6px rgba(32, 33, 36, 0.28)",
- "overflow" => "hidden"
- ],
- self::is_class
- ),
- "a"
- );
-
- if(count($imagedata) === 0){
-
- break;
- }
-
- $imagedata = $imagedata[0];
-
- // https://www.google.com/imgres?imgurl=https://upload.wikimedia.org/wikipedia/commons/thumb/9/9d/Joe_Biden_presidential_portrait_%2528cropped%2529.jpg/220px-Joe_Biden_presidential_portrait_%2528cropped%2529.jpg&imgrefurl=https://en.wikipedia.org/wiki/President_of_the_United_States&h=293&w=220&tbnid=kkQHBIAMuTitdM&q=who+is+the+president+of+the+united+states&tbnh=115&tbnw=86&usg=AI4_-kQVKi-K2zTGmVkS75_Fo6VldpPxsg&vet=1&docid=d2vgvyYSkU0hiM&sa=X&ved=2ahUKEwjKrMT17KyAAxV1j4kEHRAVCoYQ9QF6BAgFEAQ
- parse_str(
- parse_url(
- $this->fuckhtml
- ->getTextContent(
- $imagedata["attributes"]["href"]
- ),
- PHP_URL_QUERY
- ),
- $params
- );
-
- $image =
- $this->fuckhtml
- ->getElementsByTagName("img")[0];
-
- if(isset($this->js_image[$image["attributes"]["id"]])){
-
- $thumbimg = $this->getimage($image["attributes"]["id"]);
- }else{
-
- $thumbimg =
- $this->fuckhtml
- ->getTextContent(
- $image["attributes"]["src"]
- );
- }
-
- $out["image"][] = [
- "title" =>
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $image["attributes"]["alt"]
- )
- ),
- "source" => [
- [
- "url" => $params["imgurl"],
- "width" => (int)$params["w"],
- "height" => (int)$params["h"]
- ],
- [
- "url" => $thumbimg,
- "width" => (int)$params["tbnw"],
- "height" => (int)$params["tbnh"]
- ]
- ],
- "url" => $params["imgrefurl"]
- ];
- break;
- }
- }else{
-
- /*
- Is a web node (twitter-like)
- create a link -> sublink structure and
- ignore images
- */
-
- switch(count($grey_node)){
-
- case 0:
- continue 2;
-
- case 1:
- $sublink_title = $grey_node[0];
- $sublink_description = null;
- break;
-
- case 2:
- $sublink_title = $grey_node[1];
- $sublink_description =
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $grey_node[0]
- )
- );
- break;
- }
-
- $sublink_url =
- $this->decodeurl(
- $this->fuckhtml
- ->getTextContent(
- $this->fuckhtml
- ->getElementsByTagName(
- "a"
- )[0]
- ["attributes"]
- ["href"]
- )
- );
-
- if($link == $sublink_url){
-
- continue;
- }
-
- $sublink_title =
- explode(
- " • ",
- $this->fuckhtml
- ->getTextContent(
- $sublink_title["innerHTML"]
- )
- );
-
- if(count($sublink_title) !== 1){
-
- $date = strtotime($sublink_title[1]);
- }else{
-
- $date = null;
- }
-
- $sublink_title = $this->titledots($sublink_title[0]);
-
- $sublink[] = [
- "title" => $sublink_title,
- "date" => $date,
- "description" => $sublink_description,
- "url" => $sublink_url
- ];
- }
- }
-
- // if it was a web node
- if(count($sublink) !== 0){
+ $tbs[] = "isz:" . $size;
+ }else{
- $out["web"][] = [
- "title" =>
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $title[0]
- )
- ),
- "description" => null,
- "url" => $url,
- "date" => null,
- "type" => "web",
- "thumb" => [
- "url" => null,
- "ratio" => null
- ],
- "sublink" => $sublink,
- "table" => []
- ];
+ $tbs[] = "tbz:lt";
+ $tbs[] = "islt:" . $size;
}
-
- continue;
}
- $people_title =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->findstyles(
- [
- "font-weight" => "bold",
- "font-size" => "16px",
- "color" => "#000",
- "margin" => "0",
- "padding" => "12px 16px 0 16px"
- ],
- self::is_class
- ),
- "div"
- );
-
- if(
- count($people_title) !== 0 &&
- strtolower(
- $this->fuckhtml
- ->getTextContent(
- $people_title[0]
- )
- ) == "people also ask"
- ){
- /*
- Parse "people also ask" node
- */
-
- $div =
- $this->fuckhtml
- ->getElementsByTagName("div");
-
- // add suggestions
- $suggestions =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->findstyles(
- [
- "display" => "inline-block",
- "padding-right" => "26px"
- ],
- self::is_class
- ),
- $div
- );
-
- foreach($suggestions as $suggestion){
-
- $out["related"][] =
- $this->fuckhtml
- ->getTextContent($suggestion);
- }
+ // ratio
+ if($ratio != "any"){
- // parse websites
- foreach($div as $d){
-
- if(
- isset($d["attributes"]["id"]) &&
- strpos(
- $d["attributes"]["id"],
- "accdef_"
- ) !== false
- ){
-
- $this->fuckhtml->load(
- $this->ask[
- $d["attributes"]["id"]
- ]
- );
-
- $description =
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $this->fuckhtml
- ->getElementsByClassName(
- $this->findstyles(
- [
- "white-space" => "pre-line",
- "word-wrap" => "break-word"
- ],
- self::is_class
- ),
- "div"
- )[0]
- )
- );
-
- $a =
- $this->fuckhtml
- ->getElementsByTagName("a")
- [0];
-
- $this->fuckhtml->load($a);
-
- $out["web"][] = [
- "title" =>
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $this->fuckhtml
- ->getElementsByTagName("span")[0]
- )
- ),
- "description" => $description,
- "url" =>
- $this->decodeurl(
- $this->fuckhtml
- ->getTextContent(
- $a
- ["attributes"]
- ["href"]
- )
- ),
- "date" => null,
- "type" => "web",
- "thumb" => [
- "url" => null,
- "ratio" => null
- ],
- "sublink" => [],
- "table" => []
- ];
- }
- }
-
- continue;
+ $tbs[] = "iar:" . $ratio;
}
- if(count($title) !== 0){
-
- /*
- Get WEB search results
- */
-
- $thumb =
- $this->fuckhtml
- ->getElementsByTagName("img");
+ // color
+ if($color != "any"){
if(
- count($thumb) !== 0 &&
- isset($this->js_image[$thumb[0]["attributes"]["id"]])
+ in_array(
+ $color,
+ ["color", "gray", "trans"]
+ )
){
- $thumb = [
- "url" => $this->getimage($thumb[0]["attributes"]["id"]),
- "ratio" => "1:1"
- ];
-
- if(parse_url($thumb["url"], PHP_URL_HOST) == "i.ytimg.com"){
-
- $thumb = [
- "url" =>
- str_replace(
- "default.jpg",
- "maxresdefault.jpg",
- $thumb["url"]
- ),
- "ratio" => "16:9"
- ];
- }
+ $tbs[] = "ic:" . $color;
}else{
- $thumb = [
- "url" => null,
- "ratio" => null
- ];
- }
-
- // this contains description, sublinks
- $inner_category =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->findstyles(
- [
- "white-space" => "pre-line",
- "word-wrap" => "break-word"
- ],
- self::is_class
- ),
- "div"
- );
-
- // set empty values
- $description = null;
- $table = [];
- $sublinks = [];
- $date = null;
-
- foreach($inner_category as $category){
-
- if($category["level"] !== 6){
-
- // enterring protocol 6
- // and u dont seem to understaaaaandddddd
- continue;
- }
-
- $this->fuckhtml->load($category);
-
- // check if its a table
- preg_match(
- '/^[A-z0-9 ]+: <span/',
- $category["innerHTML"],
- $tablematch
- );
-
- if(count($tablematch) !== 0){
-
- $categories = explode("<br>", $category["innerHTML"]);
-
- foreach($categories as $cat){
-
- $container["innerHTML"] = str_replace($cat, "", $container["innerHTML"]);
-
- $cat = explode(":", $cat, 2);
-
- $name =
- $this->fuckhtml
- ->getTextContent(
- $cat[0]
- );
-
- if(strtolower($name) != "posted"){
-
- $table[$name] =
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $cat[1]
- )
- );
- }else{
-
- $date =
- strtotime(
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $cat[1]
- )
- )
- );
- }
- }
- continue;
- }
-
- $spans =
- $this->fuckhtml
- ->getElementsByTagName("span");
-
- $encounter_rating = false;
- foreach($spans as $span){
-
- // replace element with nothing
- if(empty($description)){
- $category["innerHTML"] =
- str_replace(
- $span["outerHTML"],
- "",
- $category["innerHTML"]
- );
- }
-
- if($encounter_rating !== false){
-
- switch($encounter_rating){
-
- case 3:
- $table["Votes"] =
- number_format(
- str_replace(
- [
- "(",
- ")",
- ","
- ],
- "",
- $this->fuckhtml
- ->getTextContent(
- $span["innerHTML"]
- )
- )
- );
- break;
-
- case 6:
- $table["Price"] =
- $this->fuckhtml
- ->getTextContent(
- $span["innerHTML"]
- );
- break;
-
- case 8:
- $table["Support"] =
- $this->fuckhtml
- ->getTextContent(
- $span["innerHTML"]
- );
- break;
- }
-
- $encounter_rating++;
- }
-
- // get rating
- if(isset($span["attributes"]["aria-hidden"])){
-
- $table["Rating"] = $span["innerHTML"];
- $encounter_rating = 0;
- continue;
- }
- }
-
- if(empty($description)){
-
- $description =
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $category
- )
- );
- }
+ $tbs[] = "ic:specific";
+ $tbs[] = "isc:" . $color;
}
-
- // get sublinks
- $this->fuckhtml->load($container["innerHTML"]);
-
- $as =
- $this->fuckhtml->getElementsByTagName("a");
-
- foreach($as as $a){
-
- $this->fuckhtml->load($a);
-
- $detect =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->findstyles(
- [
- "color" => "#1967d2",
- "font-size" => "14px",
- "line-height" => "20px"
- ],
- self::is_class
- ),
- "span"
- );
-
- if(count($detect) !== 0){
-
- $sublinks[] = [
- "title" =>
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $a
- )
- ),
- "date" => null,
- "description" => null,
- "url" =>
- $this->decodeurl(
- $a["attributes"]["href"]
- )
- ];
- }
- }
-
- $data = [
- "title" =>
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $title[0]
- )
- ),
- "description" => $description,
- "url" => $link,
- "date" => $date,
- "type" => "web",
- "thumb" => $thumb,
- "sublink" => $sublinks,
- "table" => $table
- ];
-
- $out["web"][] = $data;
-
- continue;
}
- /*
- Check related searches node
- */
- $relateds =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->findstyles(
- [
- "display" => "block",
- "position" => "relative",
- "width" => "100%"
- ],
- self::is_class
- ),
- "a"
- );
-
- if(count($relateds) !== 0){
+ // type
+ if($type != "any"){
- foreach($relateds as $related){
-
- $out["related"][] =
- $this->fuckhtml
- ->getTextContent(
- $related
- );
- }
-
- continue;
+ $tbs[] = "itp:" . $type;
}
- /*
- Check for spelling autocorrect
- */
- $spelling =
- $this->fuckhtml
- ->getElementById(
- "scl"
- );
-
- if($spelling){
+ // format
+ if($format != "any"){
- $out["spelling"] = [
- "type" => "including",
- "using" =>
- $this->fuckhtml
- ->getTextContent(
- $spelling
- ),
- "correction" => $search
- ];
+ $tbs[] = "ift:" . $format;
}
- /*
- Get next page
- */
- $nextpage =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->findstyles(
- [
- "-webkit-box-flex" => "1",
- "display" => "block"
- ],
- self::is_class
- ),
- "a"
- );
-
- if(count($nextpage) !== 0){
-
- $out["npt"] =
- $this->nextpage
- ->store(
- explode(
- "?",
- $this->fuckhtml
- ->getTextContent(
- $nextpage[0]
- ["attributes"]
- ["href"]
- )
- )[1],
- "web"
- );
+ // rights
+ if($rights != "any"){
- continue;
+ $tbs[] = "il:" . $rights;
}
- /*
- Check for DMCA complaint div
- */
- $dmca_table = false;
-
- $text =
- $this->fuckhtml
- ->getTextContent($container);
-
- if(
- stripos(
- $text,
- "In response to a complaint we received under the US Digital Millennium Copyright Act, we have removed"
- ) !== false
- ||
- stripos(
- $text,
- "In response to multiple complaints we received under the US Digital Millennium Copyright Act, we have removed"
- ) !== false
- ){
-
- $as =
- $this->fuckhtml
- ->getElementsByTagName("a");
-
- array_shift($as);
-
- $dmca_table = [
- "title" => "Removed results",
- "description" => [
- [
- "type" => "text",
- "value" => "Google removed results due to DMCA complaints. You can view the removed links by visiting these:\n\n"
- ]
- ],
- "url" => "https://support.google.com/legal/answer/1120734?visit_id=638260070062978894-2242290953",
- "thumb" => null,
- "table" => [],
- "sublink" => []
- ];
-
- $i = 0;
- $c = count($as);
-
- foreach($as as $a){
-
- $i++;
- $u =
- $this->decodeurl(
- $a["attributes"]["href"]
- );
-
- $dmca_table["description"][] = [
- "type" => "link",
- "url" => $u,
- "value" => $u
- ];
-
- if($i !== $c){
-
- $dmca_table["description"][] = [
- "type" => "text",
- "value" => "\n"
- ];
- }
- }
+ // append tbs
+ if(count($tbs) !== 0){
- continue;
- }
-
- /*
- Parse instant answers with parts
- */
- $parts =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->findstyles(
- [
- "padding" => "12px 16px 12px"
- ],
- self::is_class
- ),
- "div"
- );
-
- if(count($parts) !== 0){
-
- $table = [
- "title" => null,
- "description" => [],
- "url" => null,
- "thumb" => null,
- "table" => [],
- "sublink" => []
- ];
-
- // get thumb
- $thumb =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->findstyles(
- [
- "float" => "right",
- "padding-left" => "16px"
- ],
- self::is_class
- ),
- "div"
- );
-
- if(count($thumb) !== 0){
-
- $this->fuckhtml->load($thumb[0]);
-
- $img =
- $this->fuckhtml
- ->getElementsByTagName("img");
-
- if(count($img) !== 0){
-
- $table["thumb"] =
- $this->getimage(
- $img[0]["attributes"]["id"]
- );
- }
-
- $this->fuckhtml->load($container);
- }
-
- $h =
- $this->fuckhtml
- ->getElementsByTagName("h3");
-
- if(count($h) === 0){
-
- $h =
- $this->fuckhtml
- ->getElementsByTagName("h2");
- }
-
- if(count($h) !== 0){
- // set title + subtext for when a word definition
- // appears
- $h = $h[0];
-
- $table["title"] =
- $this->fuckhtml
- ->getTextContent(
- $h
- );
-
- $parts[0]["innerHTML"] =
- str_replace(
- $h["outerHTML"],
- "",
- $parts[0]["innerHTML"]
- );
-
- $table["description"][] =
- [
- "type" => "quote",
- "value" =>
- $this->fuckhtml
- ->getTextContent(
- $parts[0]
- )
- ];
- }else{
-
- // parse it as a wikipedia header
-
- }
-
- // get table elements
- $tables =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->findstyles(
- [
- "display" => "table",
- "width" => "100%",
- "padding-right" => "16px",
- "-webkit-box-sizing" => "border-box"
- ],
- self::is_class
- ),
- "div"
- );
-
- foreach($tables as $tbl){
-
- $this->fuckhtml->load($tbl);
-
- $images =
- $this->fuckhtml
- ->getElementsByTagName("img");
-
- if(count($images) !== 0){
-
- $image = $this->getimage($images[0]["attributes"]["id"]);
-
- $text =
- $this->fuckhtml
- ->getTextContent(
- $tbl
- );
-
- $table["description"][] = [
- "type" => "link",
- "value" => $text,
- "url" => "?s=" . urlencode($text) . "&scraper=google"
- ];
-
- $table["description"][] = [
- "type" => "image",
- "url" => $image
- ];
- }
-
- }
-
- $audio =
- $this->fuckhtml
- ->getElementsByTagName("audio");
-
- if(count($audio) !== 0){
-
- $table["description"][] = [
- "type" => "audio",
- "url" =>
- str_replace(
- "http://",
- "https://",
- $this->fuckhtml
- ->getTextContent(
- $audio[0]["attributes"]["src"]
- )
- )
- ];
- }
-
- if(count($parts) >= 2){
-
- $this->fuckhtml->load($parts[1]);
-
- $parts =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->findstyles(
- [
- "padding-bottom" => "12px"
- ],
- self::is_class
- ),
- "div"
- );
-
- foreach($parts as $part){
-
- $this->fuckhtml->load($part);
-
- $lists =
- $this->fuckhtml
- ->getElementsByTagName("ol");
-
- if(count($lists) !== 0){
-
- foreach($lists as $list){
-
- $this->fuckhtml->load($list);
-
- $list_items =
- $this->fuckhtml
- ->getElementsByTagName("li");
-
- $index = 0;
-
- if(count($list_items) !== 0){
-
- foreach($list_items as $list_item){
-
- $index++;
-
- $this->fuckhtml->load($list_item);
-
- $list_subitems =
- $this->fuckhtml
- ->getElementsByTagName("div");
-
- foreach($list_subitems as $subitem){
-
- if($subitem["level"] !== 1){ continue; }
-
- $this->fuckhtml->load($subitem);
-
- $spans =
- $this->fuckhtml
- ->getElementsByTagName("span");
-
- if(count($spans) !== 0){
-
- $type = "quote";
- }else{
-
- $type = "text";
- }
-
- $value =
- $this->fuckhtml
- ->getTextContent(
- $subitem
- );
-
- if($type == "text"){
-
- $value = $index . ". " . $value;
- }
-
- $table["description"][] = [
- "type" => $type,
- "value" => $value
- ];
- }
- }
- }
- }
-
- continue;
- }
-
- // get title
- $spans =
- $this->fuckhtml
- ->getElementsByTagName("span");
-
- if(count($spans) !== 0){
-
- foreach($spans as $span){
-
- $part["innerHTML"] =
- str_replace(
- $span["outerHTML"],
- "",
- $part["innerHTML"]
- );
- }
-
- if(
- $this->fuckhtml
- ->getTextContent(
- $part
- )
- == ""
- ){
-
- $table["description"][] = [
- "type" => "title",
- "value" =>
- $this->fuckhtml
- ->getTextContent(
- $spans[0]
- )
- ];
-
- continue;
- }
- }
-
- // fallback to getting non-numbered list
- $nlist =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->findstyles(
- [
- "white-space" => "pre-line",
- "word-wrap" => "break-word"
- ],
- self::is_class
- ),
- "div"
- );
-
- if(count($nlist) !== 0){
-
- foreach($nlist as $nlist_item){
-
- $text =
- $this->fuckhtml
- ->getTextContent($nlist_item);
-
- if($text == ""){
-
- continue;
- }
-
- $this->fuckhtml->load($nlist_item);
-
- $spans =
- $this->fuckhtml
- ->getElementsByTagName("span");
-
- if(count($spans) !== 0){
-
- // is a quote node
- $type = "quote";
- }else{
-
- $type = "text";
- }
-
- $table["description"][] = [
- "type" => $type,
- "value" => $text
- ];
- }
- }
- }
- }
-
- $out["answer"][] = $table;
+ $params["tbs"] =
+ implode(",", $tbs);
}
}
- if($dmca_table){
-
- $out["answer"][] = $dmca_table;
- }
-
- return $out;
- }
-
- public function image($get){
-
- $search = $get["s"];
- $country = $get["country"];
- $nsfw = $get["nsfw"];
- $lang = $get["lang"];
- $size = $get["size"];
- $colortype = $get["colortype"];
- $color = $get["color"];
- $type = $get["type"];
- $rights = $get["rights"];
- $older = $get["older"];
- $newer = $get["newer"];
-
- $params = [];
-
- // country
- if($country != "any"){
-
- $params["gl"] = $country;
- }
-
- // nsfw
- $params["safe"] = $nsfw == "yes" ? "off" : "active";
-
- // language
- if($lang != "any"){
-
- $params["lr"] = "lang_" . $lang;
- }
-
- // &sort=review-date:r:20090301:20090430
- $older = $older === false ? false : date("Ymd", $older);
- $newer = $newer === false ? false : date("Ymd", $newer);
-
- if(
- $older !== false &&
- $newer === false
- ){
-
- $newer = date("Ymd", time());
- }
-
- if(
- $older !== false ||
- $newer !== false
- ){
-
- $params["sort"] = "review-date:r:" . $older . ":" . $newer;
- }
-
+ /*
$handle = fopen("scraper/google-img.html", "r");
$html = fread($handle, filesize("scraper/google-img.html"));
- fclose($handle);
+ fclose($handle);*/
+
+ // scrape images
+ try{
+ $html =
+ $this->get(
+ "https://www.google.com/search",
+ $params
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to get search page");
+ }
$this->fuckhtml->load($html);
@@ -2214,63 +999,6 @@ class google{
"div"
);
- // get next page
- // https://www.google.com/search
- // ?q=higurashi
- // &tbm=isch
- // &async=_id%3Aislrg_c%2C_fmt%3Ahtml
- // &asearch=ichunklite
- // &ved=0ahUKEwidjYXJqJSAAxWrElkFHZ07CDwQtDIIQygA
- $ved =
- $this->fuckhtml
- ->getElementById("islrg", "div");
-
- if($ved){
-
- $ved =
- $this->fuckhtml
- ->getTextContent(
- $ved["attributes"]["data-ved"]
- );
-
- // &vet=1{$ved}..i (10ahUKEwidjYXJqJSAAxWrElkFHZ07CDwQtDIIQygA..i)
-
- /*
- These 2 are handled by us
- start = start + number of results
- ijn = current page number
- */
- // &start=100
- // &ijn=1
-
- // &imgvl=CAEY7gQgBSj3Aji8VTjXVUC4AUC3AUgAYNdV
- preg_match(
- '/var e=\'([A-z0-9]+)\';/',
- $html,
- $imgvl
- );
-
- $imgvl = $imgvl[1];
-
- $out["npt"] =
- $this->nextpage->store(
- json_encode(
- [
- "q" => $get["s"],
- "tbm" => "isch",
- "async" => "_id:islrg_c,_fmt:html",
- "asearch" => "ichunklite",
- "ved" => $ved,
- "vet" => "1" . $ved . "..i",
- "start" => 100,
- "ijn" => 1,
- "imgvl" => $imgvl
- ]
- ),
- "images"
- );
- }
-
foreach($images as $image){
$this->fuckhtml->load($image);
@@ -2330,9 +1058,192 @@ class google{
];
}
+ // get next page
+ // https://www.google.com/search
+ // ?q=higurashi
+ // &tbm=isch
+ // &async=_id%3Aislrg_c%2C_fmt%3Ahtml
+ // &asearch=ichunklite
+ // &ved=0ahUKEwidjYXJqJSAAxWrElkFHZ07CDwQtDIIQygA
+
+ if(count($out["image"]) !== 100){
+
+ // no more results
+ return $out;
+ }
+
+ if($get["npt"]){
+
+ // update nextpage information
+ $params["start"] = (int)$params["start"] + count($out["image"]);
+ $params["ijn"] = (int)$params["ijn"] + 1;
+
+ $out["npt"] =
+ $this->nextpage->store(
+ json_encode($params),
+ "images"
+ );
+ }else{
+
+ // scrape nextpage information
+ $this->fuckhtml->load($html);
+
+ $ved =
+ $this->fuckhtml
+ ->getElementById("islrg", "div");
+
+ if($ved){
+
+ $ved =
+ $this->fuckhtml
+ ->getTextContent(
+ $ved["attributes"]["data-ved"]
+ );
+
+ // &vet=1{$ved}..i (10ahUKEwidjYXJqJSAAxWrElkFHZ07CDwQtDIIQygA..i)
+
+ /*
+ These 2 are handled by us
+ start = start + number of results
+ ijn = current page number
+ */
+ // &start=100
+ // &ijn=1
+
+ // &imgvl=CAEY7gQgBSj3Aji8VTjXVUC4AUC3AUgAYNdV
+ preg_match(
+ '/var e=\'([A-z0-9]+)\';/',
+ $html,
+ $imgvl
+ );
+
+ $imgvl = $imgvl[1];
+
+ $params["async"] = "_id:islrg_c,_fmt:html";
+ $params["asearch"] = "ichunklite";
+ $params["ved"] = $ved;
+ $params["vet"] = "1" . $ved . "..i";
+ $params["start"] = 100;
+ $params["ijn"] = 1;
+ $params["imgvl"] = $imgvl;
+
+ $out["npt"] =
+ $this->nextpage->store(
+ json_encode($params),
+ "images"
+ );
+ }
+ }
+
return $out;
}
+ private function hms2int($time){ // convert "H:M:S", "M:S" or "S" text into a total number of seconds
+
+ $parts = explode(":", $time, 3); // at most 3 pieces; anything after a 2nd colon stays in the last piece
+ $time = 0; // $time is reused as the running total
+
+ if(count($parts) === 3){
+
+ // hours
+ $time = $time + ((int)$parts[0] * 3600);
+ array_shift($parts); // drop the hours so the minutes become $parts[0]
+ }
+
+ if(count($parts) === 2){
+
+ // minutes
+ $time = $time + ((int)$parts[0] * 60);
+ array_shift($parts); // drop the minutes so the seconds become $parts[0]
+ }
+
+ // seconds
+ $time = $time + (int)$parts[0]; // (int) cast: non-numeric text counts as 0
+
+ return $time;
+ }
+
+ private function loadjavascriptcrap($html){ // load $html into the parser and fill $this->computedstyle, $this->js_image and $this->ask from it
+
+ $this->fuckhtml->load($html);
+
+ $styles =
+ $this->fuckhtml
+ ->getElementsByTagName("style");
+
+ $this->computedstyle = []; // both maps start empty on every call
+ $this->ask = [];
+
+ foreach($styles as $style){
+
+ $this->computedstyle = // accumulate what parsestyles() returns for each <style> tag
+ array_merge(
+ $this->computedstyle,
+ $this->parsestyles($style["innerHTML"])
+ );
+ }
+
+ // get images in javascript var
+ preg_match( // the capture stops at the first "}" after "google.ldi="
+ '/google\.ldi=({[^}]+})/',
+ $html,
+ $this->js_image
+ );
+
+ if(count($this->js_image) !== 0){
+
+ $this->js_image = json_decode($this->js_image[1], true); // [1] is the captured {...} text, decoded to an associative array
+ }else{
+
+ $this->js_image = [];
+ }
+
+ // additional js_images present in <script> tags
+ // ugh i fucking hate you
+ $scripts =
+ $this->fuckhtml
+ ->getElementsByTagName("script");
+
+ foreach($scripts as $script){
+
+ if(!isset($script["innerHTML"])){
+
+ continue;
+ }
+
+ preg_match_all( // group 1 = data:image URI, group 2 = the first quoted entry of the "i" array
+ '/var s=\'(data:image[^\']+)\';var i=\[\'([^\']+)\'];/',
+ $script["innerHTML"],
+ $image_grep
+ );
+
+ if(count($image_grep[0]) !== 0){
+
+ $this->js_image[trim($image_grep[2][0])] = // only the first match of each script is stored, keyed by group 2
+ $this->fuckhtml
+ ->getTextContent(
+ $image_grep[1][0]
+ );
+ }
+
+ // even more javascript crap
+ // "People also ask" node is loaded trough javascript
+ preg_match_all( // group 1 = first argument of window.jsl.dh(), group 2 = its second (quoted) argument
+ '/window\.jsl\.dh\(\'([^\']+)\',\'(.+)\'\);/',
+ $script["innerHTML"],
+ $ask_grep
+ );
+
+ for($i=0; $i<count($ask_grep[0]); $i++){
+
+ $this->ask[trim($ask_grep[1][$i])] = // every match is kept: trimmed group 1 => unescaped group 2
+ stripcslashes(
+ $ask_grep[2][$i]
+ );
+ }
+ }
+ }
+
private function findstyles($rules, $is){
ksort($rules);
diff --git a/settings.php b/settings.php
index 7b7da01..c53599f 100644
--- a/settings.php
+++ b/settings.php
@@ -103,11 +103,11 @@ $settings = [
[
"value" => "brave",
"text" => "Brave"
- ]/*,
+ ],
[
"value" => "google",
"text" => "Google"
- ]*/
+ ]
]
],
[
@@ -147,8 +147,8 @@ $settings = [
[
"value" => "brave",
"text" => "Brave"
- ],
- /*[
+ ],/*
+ [
"value" => "google",
"text" => "Google"
],*/