From bca265aea67ec62499aaa113a6490ce9ec7fe730 Mon Sep 17 00:00:00 2001 From: lolcat Date: Sat, 22 Jul 2023 14:41:14 -0400 Subject: still missing things on google scraper --- lib/frontend.php | 1282 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1282 insertions(+) create mode 100644 lib/frontend.php (limited to 'lib/frontend.php') diff --git a/lib/frontend.php b/lib/frontend.php new file mode 100644 index 0000000..3be912b --- /dev/null +++ b/lib/frontend.php @@ -0,0 +1,1282 @@ + $value){ + + $html = + str_replace( + "{%{$key}%}", + $value, + $html + ); + } + + return trim($html); + } + + public function getthemeclass($raw = true){ + + if( + isset($_COOKIE["theme"]) && + $_COOKIE["theme"] == "cream" + ){ + + $body_class = "theme-white "; + }else{ + + $body_class = ""; + } + + if( + $raw && + $body_class != "" + ){ + + return ' class="' . rtrim($body_class) . '"'; + } + + return $body_class; + } + + public function loadheader(array $get, array $filters, string $page){ + + echo + $this->load("header.html", [ + "title" => trim($get["s"] . " ({$page})"), + "description" => ucfirst($page) . ' search results for "' . htmlspecialchars($get["s"]) . '"', + "index" => "no", + "search" => htmlspecialchars($get["s"]), + "tabs" => $this->generatehtmltabs($page, $get["s"]), + "filters" => $this->generatehtmlfilters($filters, $get), + "body_class" => $this->getthemeclass() + ]); + + if( + preg_match( + '/bot|wget|curl|python-requests|scrapy|feedfetcher|go-http-client|ruby|universalfeedparser|yahoo\! slurp|spider|rss/i', + $_SERVER["HTTP_USER_AGENT"] + ) + ){ + + // bot detected !! + echo + $this->drawerror( + "Tshh, blocked!", + 'You were blocked from viewing this page. If you wish to scrape data from 4get, please consider running your own 4get instance or using the API.', + ); + die(); + } + } + + public function drawerror($title, $error){ + + return + $this->load("search.html", [ + "class" => "", + "right-left" => "", + "right-right" => "", + "left" => + '
' . + '

' . htmlspecialchars($title) . '

' . + $error . + '
' + ]); + } + + public function drawtextresult($site, $greentext = null, $duration = null, $keywords, $tabindex = true){ + + $payload = + '
'; + + // add favicon, link and archive links + $payload .= $this->drawlink($site["url"]); + + /* + Draw title + description + filetype + */ + $payload .= + '' . + 'thumb'; + + if($duration !== null){ + + $payload .= + '
' . + htmlspecialchars($duration) . + '
'; + } + + $payload .= + '
'; + } + + $payload .= + '
'; + + if( + isset($site["type"]) && + $site["type"] != "web" + ){ + + $payload .= '
' . strtoupper($site["type"]) . '
'; + } + + $payload .= + htmlspecialchars($site["title"]) . + '
'; + + if($greentext !== null){ + + $payload .= + '
' . + htmlspecialchars($greentext) . + '
'; + } + + if($site["description"] !== null){ + + $payload .= + '
' . + $this->highlighttext($keywords, $site["description"]) . + '
'; + } + + $payload .= '
'; + + /* + Sublinks + */ + if( + isset($site["sublink"]) && + !empty($site["sublink"]) + ){ + + usort($site["sublink"], function($a, $b){ + + return strlen($a["description"]) > strlen($b["description"]); + }); + + $payload .= + ''; + } + + if( + isset($site["table"]) && + !empty($site["table"]) + ){ + + $payload .= ''; + + foreach($site["table"] as $title => $value){ + + $payload .= + '' . + '' . + '' . + ''; + } + + $payload .= '
' . htmlspecialchars($title) . '' . htmlspecialchars($value) . '
'; + } + + return $payload . ''; + } + + public function highlighttext($keywords, $text){ + + $text = htmlspecialchars($text); + + $keywords = explode(" ", $keywords); + $regex = []; + + foreach($keywords as $word){ + + $regex[] = "\b" . preg_quote($word, "/") . "\b"; + } + + $regex = "/" . implode("|", $regex) . "/i"; + + return + preg_replace( + $regex, + '${0}', + $text + ); + } + + function highlightcode($text){ + + // https://www.php.net/highlight_string + ini_set("highlight.comment", "c-comment"); + ini_set("highlight.default", "c-default"); + ini_set("highlight.html", "c-default"); + ini_set("highlight.keyword", "c-keyword"); + ini_set("highlight.string", "c-string"); + + $text = + trim( + preg_replace( + '/<\/span>$/', + "", // remove stray ending span because of the ', + ' ' + ], + [ + "\n", // replace
with newlines + " " // replace html entity to space + ], + str_replace( + [ + // leading \n<?php ", + "", + "" + ], + "", + highlight_string("', '', $text); + } + + return $text; + } + + public function drawlink($link){ + + /* + Add favicon + */ + $host = parse_url($link); + $esc = + explode( + ".", + $host["host"], + 2 + ); + + if( + count($esc) === 2 && + $esc[0] == "www" + ){ + + $esc = $esc[1]; + }else{ + + $esc = $esc[0]; + } + + $esc = substr($esc, 0, 2); + + $urlencode = urlencode($link); + + $payload = + '
' . + '' . + '
'; + + /* + Add archive links + */ + if( + $host["host"] == "boards.4chan.org" || + $host["host"] == "boards.4channel.org" + ){ + + $archives = []; + $path = explode("/", $host["path"]); + $count = count($path); + // /pol/thread/417568063/post-shitty-memes-if-you-want-to + + if($count !== 0){ + + $isboard = true; + + switch($path[1]){ + + case "con": + break; + + case "q": + $archives[] = "desuarchive.org"; + break; + + case "qa": + $archives[] = "desuarchive.org"; + break; + + case "qb": + $archives[] = "arch.b4k.co"; + break; + + case "trash": + $archives[] = "desuarchive.org"; + break; + + case "a": + $archives[] = "desuarchive.org"; + break; + + case "c": + $archives[] = "desuarchive.org"; + break; + + case "w": + break; + + case "m": + $archives[] = "desuarchive.org"; + break; + + case "cgl": + $archives[] = "desuarchive.org"; + $archives[] = "warosu.org"; + break; + + case "cm": + $archives[] = "boards.fireden.net"; + break; + + case "f": + $archives[] = "archive.4plebs.org"; + break; + + case "n": + break; + + case "jp": + $archives[] = "warosu.org"; + break; + + case "vt": + $archives[] = "warosu.org"; + break; + + case "v": + $archives[] = "boards.fireden.net"; + $archives[] = "arch.b4k.co"; + break; + + case "vg": + $archives[] = "boards.fireden.net"; + $archives[] = "arch.b4k.co"; + break; + + case "vm": + $archives[] = "arch.b4k.co"; + break; + + case "vmg": + $archives[] = "arch.b4k.co"; + break; + + case "vp": + $archives[] = "arch.b4k.co"; + break; + + case "vr": + $archives[] = "desuarchive.org"; + $archives[] = "warosu.org"; + break; + + case "vrpg": + $archives[] = "arch.b4k.co"; + break; + + case "vst": + $archives[] = "arch.b4k.co"; + break; + + case "co": + $archives[] = "desuarchive.org"; + break; + + case "g": + $archives[] = "desuarchive.org"; + $archives[] = "arch.b4k.co"; + break; + + case "tv": + $archives[] = "archive.4plebs.org"; + break; + + case "k": + $archives[] = "desuarchive.org"; + break; + + case "o": + $archives[] = "archive.4plebs.org"; + break; + + case "an": + $archives[] = "desuarchive.org"; + break; + + case "tg": + $archives[] = "desuarchive.org"; + $archives[] = "archive.4plebs.org"; + break; + + case "sp": + $archives[] = "archive.4plebs.org"; + break; + + case "xs": + $archives[] = "eientei.xyz"; + break; + + case "pw": + break; + + case "sci": + $archives[] = "boards.fireden.net"; + $archives[] = "warosu.org"; + $archives[] = "eientei.xyz"; + break; + + case "his": + $archives[] = "desuarchive.org"; + break; + + case "int": + $archives[] = "desuarchive.org"; + break; + + case "out": + break; + + case "toy": + break; + + case "i": + $archives[] = "archiveofsins.com"; + $archives[] = "eientei.xyz"; + break; + + case "po": + break; + + case "p": + break; + + case "ck": + $archives[] = "warosu.org"; + break; + + case "ic": + $archives[] = "boards.fireden.net"; + $archives[] = "warosu.org"; + break; + + case "wg": + break; + + case "lit": + $archives[] = "warosu.org"; + break; + + case "mu": + $archives[] = "desuarchive.org"; + break; + + case "fa": + $archives[] = "warosu.org"; + break; + + case "3": + $archives[] = "warosu.org"; + $archives[] = "eientei.xyz"; + break; + + case "gd": + break; + + case "diy": + $archives[] = "warosu.org"; + break; + + case "wsg": + $archives[] = "desuarchive.org"; + break; + + case "qst": + break; + + case "biz": + $archives[] = "warosu.org"; + break; + + case "trv": + $archives[] = "archive.4plebs.org"; + break; + + case "fit": + $archives[] = "desuarchive.org"; + break; + + case "x": + $archives[] = "archive.4plebs.org"; + break; + + case "adv": + $archives[] = "archive.4plebs.org"; + break; + + case "lgbt": + $archives[] = "archiveofsins.com"; + break; + + case "mlp": + $archives[] = "desuarchive.org"; + $archives[] = "arch.b4k.co"; + break; + + case "news": + break; + + case "wsr": + break; + + case "vip": + break; + + case "b": + $archives[] = "thebarchive.com"; + break; + + case "r9k": + $archives[] = "desuarchive.org"; + break; + + case "pol": + $archives[] = "archive.4plebs.org"; + break; + + case "bant": + $archives[] = "thebarchive.com"; + break; + + case "soc": + $archives[] = "archiveofsins.com"; + break; + + case "s4s": + $archives[] = "archive.4plebs.org"; + break; + + case "s": + $archives[] = "archiveofsins.com"; + break; + + case "hc": + $archives[] = "archiveofsins.com"; + break; + + case "hm": + $archives[] = "archiveofsins.com"; + break; + + case "h": + $archives[] = "archiveofsins.com"; + break; + + case "e": + break; + + case "u": + $archives[] = "archiveofsins.com"; + break; + + case "d": + $archives[] = "desuarchive.org"; + break; + + case "y": + $archives[] = "boards.fireden.net"; + break; + + case "t": + $archives[] = "archiveofsins.com"; + break; + + case "hr": + $archives[] = "archive.4plebs.org"; + break; + + case "gif": + break; + + case "aco": + $archives[] = "desuarchive.org"; + break; + + case "r": + $archives[] = "archiveofsins.com"; + break; + + default: + $isboard = false; + break; + } + + if($isboard === true){ + + $archives[] = "archived.moe"; + } + + $trail = ""; + + if( + isset($path[2]) && + isset($path[3]) && + $path[2] == "thread" + ){ + + $trail .= "/" . $path[1] . "/thread/" . $path[3]; + }elseif($isboard){ + + $trail = "/" . $path[1] . "/"; + } + + for($i=0; $i' . + '' . $archives[$i][0] . $archives[$i][1] . '' . + $archives[$i] . + ''; + } + } + } + + $payload .= + 'goGoogle cache' . + 'arArchive.org' . + 'arArchive.is' . + 'biBing cache' . + 'meMegalodon' . + '
'; + + /* + Draw link + */ + $parts = explode("/", $link); + $clickurl = ""; + + // remove trailing / + $c = count($parts) - 1; + if($parts[$c] == ""){ + + $parts[$c - 1] = $parts[$c - 1] . "/"; + unset($parts[$c]); + } + + // merge https://site together + $parts = [ + $parts[0] . $parts[1] . '//' . $parts[2], + ...array_slice($parts, 3, count($parts) - 1) + ]; + + $c = count($parts); + for($i=0; $i<$c; $i++){ + + if($i !== 0){ $clickurl .= "/"; } + + $clickurl .= $parts[$i]; + + if($i === $c - 1){ + + $parts[$i] = rtrim($parts[$i], "/"); + } + + $payload .= + '' . + htmlspecialchars(urldecode($parts[$i])) . + ''; + + if($i !== $c - 1){ + + $payload .= ''; + } + } + + return $payload . '
'; + } + + public function getscraperfilters($page){ + + $get_scraper = null; + + switch($page){ + + case "web": + $get_scraper = isset($_COOKIE["scraper_web"]) ? $_COOKIE["scraper_web"] : null; + break; + + case "images": + $get_scraper = isset($_COOKIE["scraper_images"]) ? $_COOKIE["scraper_images"] : null; + break; + + case "videos": + $get_scraper = isset($_COOKIE["scraper_videos"]) ? $_COOKIE["scraper_videos"] : null; + break; + + case "news": + $get_scraper = isset($_COOKIE["scraper_news"]) ? $_COOKIE["scraper_news"] : null; + break; + } + + if( + isset($_GET["scraper"]) && + is_string($_GET["scraper"]) + ){ + + $get_scraper = $_GET["scraper"]; + }else{ + + if( + isset($_GET["npt"]) && + is_string($_GET["npt"]) + ){ + + $get_scraper = explode(".", $_GET["npt"], 2)[0]; + + $get_scraper = + preg_replace( + '/[0-9]+$/', + "", + $get_scraper + ); + } + } + + // add search field + $filters = + [ + "s" => [ + "option" => "_SEARCH" + ] + ]; + + // define default scrapers + switch($page){ + + case "web": + $filters["scraper"] = [ + "display" => "Scraper", + "option" => [ + "ddg" => "DuckDuckGo", + "brave" => "Brave", + "google" => "Google", + "mojeek" => "Mojeek", + "marginalia" => "Marginalia", + "wiby" => "wiby" + ] + ]; + break; + + case "images": + $filters["scraper"] = [ + "display" => "Scraper", + "option" => [ + "ddg" => "DuckDuckGo", + "yandex" => "Yandex", + "google" => "Google" + ] + ]; + break; + + case "videos": + $filters["scraper"] = [ + "display" => "Scraper", + "option" => [ + "yt" => "YouTube", + "ddg" => "DuckDuckGo", + "google" => "Google" + ] + ]; + break; + + case "news": + $filters["scraper"] = [ + "display" => "Scraper", + "option" => [ + "ddg" => "DuckDuckGo", + "brave" => "Brave", + "google" => "Google", + "mojeek" => "Mojeek" + ] + ]; + break; + } + + // get scraper name from user input, or default out to preferred scraper + $scraper_out = null; + $first = true; + + foreach($filters["scraper"]["option"] as $scraper_name => $scraper_pretty){ + + if($first === true){ + + $first = $scraper_name; + } + + if($scraper_name == $get_scraper){ + + $scraper_out = $scraper_name; + } + } + + if($scraper_out === null){ + + $scraper_out = $first; + } + + switch($scraper_out){ + + case "ddg": + include "scraper/ddg.php"; + $lib = new ddg(); + break; + + case "brave": + include "scraper/brave.php"; + $lib = new brave(); + break; + + case "yt"; + include "scraper/youtube.php"; + $lib = new youtube(); + break; + + case "yandex": + include "scraper/yandex.php"; + $lib = new yandex(); + break; + + case "google": + include "scraper/google.php"; + $lib = new google(); + break; + + case "mojeek": + include "scraper/mojeek.php"; + $lib = new mojeek(); + break; + + case "marginalia": + include "scraper/marginalia.php"; + $lib = new marginalia(); + break; + + case "wiby": + include "scraper/wiby.php"; + $lib = new wiby(); + break; + } + + // set scraper on $_GET + $_GET["scraper"] = $scraper_out; + + // set nsfw on $_GET + if( + isset($_COOKIE["nsfw"]) && + !isset($_GET["nsfw"]) + ){ + + $_GET["nsfw"] = $_COOKIE["nsfw"]; + } + + return + [ + $lib, + array_merge_recursive( + $filters, + $lib->getfilters($page) + ) + ]; + } + + public function parsegetfilters($parameters, $whitelist){ + + $sanitized = []; + + // add npt token + if( + isset($parameters["npt"]) && + is_string($parameters["npt"]) + ){ + + $sanitized["npt"] = $parameters["npt"]; + }else{ + + $sanitized["npt"] = false; + } + + // we're iterating over $whitelist, so + // you can't polluate $sanitized with useless + // parameters + foreach($whitelist as $parameter => $value){ + + if(isset($parameters[$parameter])){ + + if(!is_string($parameters[$parameter])){ + + $sanitized[$parameter] = null; + continue; + } + + // parameter is already set, use that value + $sanitized[$parameter] = $parameters[$parameter]; + }else{ + + // parameter is not set, add it + if(is_string($value["option"])){ + + // special field: set default value manually + switch($value["option"]){ + + case "_DATE": + // no date set + $sanitized[$parameter] = false; + break; + + case "_SEARCH": + // no search set + $sanitized[$parameter] = ""; + break; + } + + }else{ + + // set a default value + $sanitized[$parameter] = array_keys($value["option"])[0]; + } + } + + // sanitize input + if(is_array($value["option"])){ + if( + !in_array( + $sanitized[$parameter], + $keys = array_keys($value["option"]) + ) + ){ + + $sanitized[$parameter] = $keys[0]; + } + }else{ + + // sanitize search & string + switch($value["option"]){ + + case "_DATE": + if($sanitized[$parameter] !== false){ + + $sanitized[$parameter] = strtotime($sanitized[$parameter]); + if($sanitized[$parameter] <= 0){ + + $sanitized[$parameter] = false; + } + } + break; + + case "_SEARCH": + + // get search string & bang + $sanitized[$parameter] = trim($sanitized[$parameter]); + $sanitized["bang"] = ""; + + if( + strlen($sanitized[$parameter]) !== 0 && + $sanitized[$parameter][0] == "!" + ){ + + $sanitized[$parameter] = explode(" ", $sanitized[$parameter], 2); + + $sanitized["bang"] = trim($sanitized[$parameter][0]); + + if(count($sanitized[$parameter]) === 2){ + + $sanitized[$parameter] = trim($sanitized[$parameter][1]); + }else{ + + $sanitized[$parameter] = ""; + } + + $sanitized["bang"] = ltrim($sanitized["bang"], "!"); + } + + $sanitized[$parameter] = ltrim($sanitized[$parameter], "! \n\r\t\v\x00"); + } + } + } + + // invert dates if needed + if( + isset($sanitized["older"]) && + isset($sanitized["newer"]) && + $sanitized["newer"] !== false && + $sanitized["older"] !== false && + $sanitized["newer"] > $sanitized["older"] + ){ + + // invert + [ + $sanitized["older"], + $sanitized["newer"] + ] = [ + $sanitized["newer"], + $sanitized["older"] + ]; + } + + return $sanitized; + } + + public function s_to_timestamp($seconds){ + + if(is_string($seconds)){ + + return "LIVE"; + } + + return ($seconds >= 60) ? ltrim(gmdate("H:i:s", $seconds), ":0") : gmdate("0:s", $seconds); + } + + public function generatehtmltabs($page, $query){ + + $html = null; + + foreach(["web", "images", "videos", "news"] as $type){ + + $html .= '' . ucfirst($type) . ''; + } + + return $html; + } + + public function generatehtmlfilters($filters, $params){ + + $html = null; + + foreach($filters as $filter_name => $filter_values){ + + if(!isset($filter_values["display"])){ + + continue; + } + + $output = true; + $tmp = + '
' . + '
' . htmlspecialchars($filter_values["display"]) . '
'; + + if(is_array($filter_values["option"])){ + + $tmp .= ''; + }else{ + + switch($filter_values["option"]){ + + case "_DATE": + $tmp .= ' $value){ + + if( + $value == null || + $value == false || + $key == "npt" || + $key == "extendedsearch" || + $value == "any" || + $value == "all" || + ( + $ommit === true && + $key == "s" + ) + ){ + + continue; + } + + $out[$key] = $value; + } + + return http_build_query($out); + } + + public function htmlnextpage($gets, $npt, $page){ + + $query = $this->buildquery($gets); + + return $page . "?" . $query . "&npt=" . $npt; + } +} -- cgit v1.2.3