summary refs log tree commit diff
path: root/scraper
diff options
context:
space:
mode:
Diffstat (limited to 'scraper')
-rw-r--r-- scraper/ddg.php 2
-rw-r--r-- scraper/google.php 1065
2 files changed, 830 insertions, 237 deletions
diff --git a/scraper/ddg.php b/scraper/ddg.php
index c9c28af..1ce8e18 100644
--- a/scraper/ddg.php
+++ b/scraper/ddg.php
@@ -679,7 +679,7 @@ class ddg{
Check for wordnik results
*/
preg_match(
- '/nrj\(\'\/js\/spice\/dictionary\/definition\/([^\']+)\'\)/',
+ '/nrj\(\'\/js\/spice\/dictionary\/definition\/([^\'\)]+)/',
$js,
$wordnik
);
diff --git a/scraper/google.php b/scraper/google.php
index 6a746f7..df10754 100644
--- a/scraper/google.php
+++ b/scraper/google.php
@@ -18,37 +18,44 @@ class google{
switch($page){
- case "web": return [];/*
+ case "web":
return [
- "country" => [
+ "country" => [ // gl=<country>
"display" => "Country",
"option" => [
- "zz" => "Instance region",
+ "any" => "Instance's country",
"af" => "Afghanistan",
"al" => "Albania",
"dz" => "Algeria",
"as" => "American Samoa",
"ad" => "Andorra",
"ao" => "Angola",
- "ag" => "Antigua & Barbuda",
+ "ai" => "Anguilla",
+ "aq" => "Antarctica",
+ "ag" => "Antigua and Barbuda",
"ar" => "Argentina",
"am" => "Armenia",
+ "aw" => "Aruba",
"au" => "Australia",
"at" => "Austria",
"az" => "Azerbaijan",
"bs" => "Bahamas",
"bh" => "Bahrain",
"bd" => "Bangladesh",
+ "bb" => "Barbados",
"by" => "Belarus",
"be" => "Belgium",
"bz" => "Belize",
"bj" => "Benin",
+ "bm" => "Bermuda",
"bt" => "Bhutan",
"bo" => "Bolivia",
- "ba" => "Bosnia & Herzegovina",
+ "ba" => "Bosnia and Herzegovina",
"bw" => "Botswana",
+ "bv" => "Bouvet Island",
"br" => "Brazil",
- "bn" => "Brunei",
+ "io" => "British Indian Ocean Territory",
+ "bn" => "Brunei Darussalam",
"bg" => "Bulgaria",
"bf" => "Burkina Faso",
"bi" => "Burundi",
@@ -56,19 +63,24 @@ class google{
"cm" => "Cameroon",
"ca" => "Canada",
"cv" => "Cape Verde",
+ "ky" => "Cayman Islands",
"cf" => "Central African Republic",
"td" => "Chad",
"cl" => "Chile",
+ "cn" => "China",
+ "cx" => "Christmas Island",
+ "cc" => "Cocos (Keeling) Islands",
"co" => "Colombia",
- "cg" => "Congo - Brazzaville",
- "cd" => "Congo - Kinshasa",
+ "km" => "Comoros",
+ "cg" => "Congo",
+ "cd" => "Congo, the Democratic Republic",
"ck" => "Cook Islands",
"cr" => "Costa Rica",
- "ci" => "Côte d’Ivoire",
+ "ci" => "Cote D'ivoire",
"hr" => "Croatia",
"cu" => "Cuba",
"cy" => "Cyprus",
- "cz" => "Czechia",
+ "cz" => "Czech Republic",
"dk" => "Denmark",
"dj" => "Djibouti",
"dm" => "Dominica",
@@ -76,11 +88,18 @@ class google{
"ec" => "Ecuador",
"eg" => "Egypt",
"sv" => "El Salvador",
+ "gq" => "Equatorial Guinea",
+ "er" => "Eritrea",
"ee" => "Estonia",
"et" => "Ethiopia",
+ "fk" => "Falkland Islands (Malvinas)",
+ "fo" => "Faroe Islands",
"fj" => "Fiji",
"fi" => "Finland",
"fr" => "France",
+ "gf" => "French Guiana",
+ "pf" => "French Polynesia",
+ "tf" => "French Southern Territories",
"ga" => "Gabon",
"gm" => "Gambia",
"ge" => "Georgia",
@@ -89,86 +108,111 @@ class google{
"gi" => "Gibraltar",
"gr" => "Greece",
"gl" => "Greenland",
+ "gd" => "Grenada",
+ "gp" => "Guadeloupe",
+ "gu" => "Guam",
"gt" => "Guatemala",
- "gg" => "Guernsey",
+ "gn" => "Guinea",
+ "gw" => "Guinea-Bissau",
"gy" => "Guyana",
"ht" => "Haiti",
+ "hm" => "Heard Island and Mcdonald Islands",
+ "va" => "Holy See (Vatican City State)",
"hn" => "Honduras",
"hk" => "Hong Kong",
"hu" => "Hungary",
"is" => "Iceland",
"in" => "India",
"id" => "Indonesia",
+ "ir" => "Iran, Islamic Republic",
"iq" => "Iraq",
"ie" => "Ireland",
- "im" => "Isle of Man",
"il" => "Israel",
"it" => "Italy",
"jm" => "Jamaica",
"jp" => "Japan",
- "je" => "Jersey",
"jo" => "Jordan",
"kz" => "Kazakhstan",
"ke" => "Kenya",
"ki" => "Kiribati",
+ "kp" => "Korea, Democratic People's Republic",
+ "kr" => "Korea, Republic",
"kw" => "Kuwait",
"kg" => "Kyrgyzstan",
- "la" => "Laos",
+ "la" => "Lao People's Democratic Republic",
"lv" => "Latvia",
"lb" => "Lebanon",
"ls" => "Lesotho",
- "ly" => "Libya",
+ "lr" => "Liberia",
+ "ly" => "Libyan Arab Jamahiriya",
"li" => "Liechtenstein",
"lt" => "Lithuania",
"lu" => "Luxembourg",
+ "mo" => "Macao",
+ "mk" => "Macedonia, the Former Yugosalv Republic",
"mg" => "Madagascar",
"mw" => "Malawi",
"my" => "Malaysia",
"mv" => "Maldives",
"ml" => "Mali",
"mt" => "Malta",
+ "mh" => "Marshall Islands",
+ "mq" => "Martinique",
+ "mr" => "Mauritania",
"mu" => "Mauritius",
+ "yt" => "Mayotte",
"mx" => "Mexico",
- "fm" => "Micronesia",
- "md" => "Moldova",
+ "fm" => "Micronesia, Federated States",
+ "md" => "Moldova, Republic",
+ "mc" => "Monaco",
"mn" => "Mongolia",
- "me" => "Montenegro",
+ "ms" => "Montserrat",
"ma" => "Morocco",
"mz" => "Mozambique",
- "mm" => "Myanmar (Burma)",
+ "mm" => "Myanmar",
"na" => "Namibia",
"nr" => "Nauru",
"np" => "Nepal",
"nl" => "Netherlands",
+ "an" => "Netherlands Antilles",
+ "nc" => "New Caledonia",
"nz" => "New Zealand",
"ni" => "Nicaragua",
"ne" => "Niger",
"ng" => "Nigeria",
"nu" => "Niue",
- "mk" => "North Macedonia",
+ "nf" => "Norfolk Island",
+ "mp" => "Northern Mariana Islands",
"no" => "Norway",
"om" => "Oman",
"pk" => "Pakistan",
- "ps" => "Palestine",
+ "pw" => "Palau",
+ "ps" => "Palestinian Territory, Occupied",
"pa" => "Panama",
"pg" => "Papua New Guinea",
"py" => "Paraguay",
"pe" => "Peru",
"ph" => "Philippines",
- "pn" => "Pitcairn Islands",
+ "pn" => "Pitcairn",
"pl" => "Poland",
"pt" => "Portugal",
"pr" => "Puerto Rico",
"qa" => "Qatar",
+ "re" => "Reunion",
"ro" => "Romania",
- "ru" => "Russia",
+ "ru" => "Russian Federation",
"rw" => "Rwanda",
+ "sh" => "Saint Helena",
+ "kn" => "Saint Kitts and Nevis",
+ "lc" => "Saint Lucia",
+ "pm" => "Saint Pierre and Miquelon",
+ "vc" => "Saint Vincent and the Grenadines",
"ws" => "Samoa",
"sm" => "San Marino",
- "st" => "São Tomé & Príncipe",
+ "st" => "Sao Tome and Principe",
"sa" => "Saudi Arabia",
"sn" => "Senegal",
- "rs" => "Serbia",
+ "cs" => "Serbia and Montenegro",
"sc" => "Seychelles",
"sl" => "Sierra Leone",
"sg" => "Singapore",
@@ -177,36 +221,46 @@ class google{
"sb" => "Solomon Islands",
"so" => "Somalia",
"za" => "South Africa",
- "kr" => "South Korea",
+ "gs" => "South Georgia and the South Sandwich Islands",
"es" => "Spain",
"lk" => "Sri Lanka",
- "sh" => "St. Helena",
- "vc" => "St. Vincent & Grenadines",
+ "sd" => "Sudan",
"sr" => "Suriname",
+ "sj" => "Svalbard and Jan Mayen",
+ "sz" => "Swaziland",
"se" => "Sweden",
"ch" => "Switzerland",
- "tw" => "Taiwan",
+ "sy" => "Syrian Arab Republic",
+ "tw" => "Taiwan, Province of China",
"tj" => "Tajikistan",
- "tz" => "Tanzania",
+ "tz" => "Tanzania, United Republic",
"th" => "Thailand",
"tl" => "Timor-Leste",
"tg" => "Togo",
+ "tk" => "Tokelau",
"to" => "Tonga",
- "tt" => "Trinidad & Tobago",
+ "tt" => "Trinidad and Tobago",
"tn" => "Tunisia",
- "tr" => "Türkiye",
+ "tr" => "Turkey",
"tm" => "Turkmenistan",
- "vi" => "U.S. Virgin Islands",
+ "tc" => "Turks and Caicos Islands",
+ "tv" => "Tuvalu",
"ug" => "Uganda",
"ua" => "Ukraine",
"ae" => "United Arab Emirates",
- "gb" => "United Kingdom",
+ "uk" => "United Kingdom",
"us" => "United States",
+ "um" => "United States Minor Outlying Islands",
"uy" => "Uruguay",
"uz" => "Uzbekistan",
"vu" => "Vanuatu",
"ve" => "Venezuela",
- "vn" => "Vietnam",
+ "vn" => "Viet Nam",
+ "vg" => "Virgin Islands, British",
+ "vi" => "Virgin Islands, U.S.",
+ "wf" => "Wallis and Futuna",
+ "eh" => "Western Sahara",
+ "ye" => "Yemen",
"zm" => "Zambia",
"zw" => "Zimbabwe"
]
@@ -214,81 +268,60 @@ class google{
"nsfw" => [
"display" => "NSFW",
"option" => [
- "yes" => "Yes",
- "no" => "No"
+ "yes" => "Yes", // safe=off
+ "no" => "No" // safe=active
]
],
- "lang" => [ // prefix with lang_
+ "lang" => [ // lr=<lang> (prefix lang with "lang_")
"display" => "Language",
"option" => [
"any" => "Any language",
- "af" => "Afrikaans",
- "ca" => "català",
- "cs" => "čeština",
- "da" => "dansk",
- "de" => "Deutsch",
- "et" => "eesti",
+ "ar" => "Arabic",
+ "bg" => "Bulgarian",
+ "ca" => "Catalan",
+ "cs" => "Czech",
+ "da" => "Danish",
+ "de" => "German",
+ "el" => "Greek",
"en" => "English",
- "es" => "español",
- "eo" => "esperanto",
- "tl" => "Filipino",
- "fr" => "français",
- "hr" => "hrvatski",
- "id" => "Indonesia",
- "is" => "íslenska",
- "it" => "italiano",
- "sw" => "Kiswahili",
- "lv" => "latviešu",
- "lt" => "lietuvių",
- "hu" => "magyar",
- "nl" => "Nederlands",
- "no" => "norsk",
- "pl" => "polski",
- "pt" => "português",
- "ro" => "română",
- "sk" => "slovenčina",
- "sl" => "slovenščina",
- "fi" => "suomi",
- "sv" => "svenska",
- "vi" => "Tiếng Việt",
- "tr" => "Türkçe",
- "el" => "Ελληνικά",
- "be" => "беларуская",
- "bg" => "български",
- "ru" => "русский",
- "sr" => "српски",
- "uk" => "українська",
- "hy" => "հայերեն",
- "iw" => "עברית",
- "ar" => "العربية",
- "fa" => "فارسی",
- "hi" => "हिन्दी",
- "th" => "ไทย",
- "ko" => "한국어",
- "zh-CN" => "中文 (简体)",
- "zh-TW" => "中文 (繁體)",
- "ja" => "日本語"
+ "es" => "Spanish",
+ "et" => "Estonian",
+ "fi" => "Finnish",
+ "fr" => "French",
+ "hr" => "Croatian",
+ "hu" => "Hungarian",
+ "id" => "Indonesian",
+ "is" => "Icelandic",
+ "it" => "Italian",
+ "iw" => "Hebrew",
+ "ja" => "Japanese",
+ "ko" => "Korean",
+ "lt" => "Lithuanian",
+ "lv" => "Latvian",
+ "nl" => "Dutch",
+ "no" => "Norwegian",
+ "pl" => "Polish",
+ "pt" => "Portuguese",
+ "ro" => "Romanian",
+ "ru" => "Russian",
+ "sk" => "Slovak",
+ "sl" => "Slovenian",
+ "sr" => "Serbian",
+ "sv" => "Swedish",
+ "tr" => "Turkish",
+ "zh-CN" => "Chinese (Simplified)",
+ "zh-TW" => "Chinese (Traditional)"
]
],
- "time" => [
- "display" => "Time posted",
- "option" => [
- "any" => "Any time",
- "h" => "Last hour",
- "d" => "Last 24 hours",
- "w" => "Last week",
- "m" => "Last month",
- "y" => "Last year"
- ]
+ "newer" => [ // &sort=review-date:r:20090301:20090430
+ "display" => "Newer than",
+ "option" => "_DATE"
],
- "verbatim" => [
- "display" => "Verbatim",
- "option" => [
- "no" => "No",
- "yes" => "Yes"
- ]
+ "older" => [
+ "display" => "Older than",
+ "option" => "_DATE"
]
- ];*/
+ ];
break;
case "images":
@@ -346,7 +379,7 @@ class google{
"co" => "Colombia",
"km" => "Comoros",
"cg" => "Congo",
- "cd" => "Congo, the Democratic Republic of the",
+ "cd" => "Congo, the Democratic Republic",
"ck" => "Cook Islands",
"cr" => "Costa Rica",
"ci" => "Cote D'ivoire",
@@ -397,7 +430,7 @@ class google{
"is" => "Iceland",
"in" => "India",
"id" => "Indonesia",
- "ir" => "Iran, Islamic Republic of",
+ "ir" => "Iran, Islamic Republic",
"iq" => "Iraq",
"ie" => "Ireland",
"il" => "Israel",
@@ -408,8 +441,8 @@ class google{
"kz" => "Kazakhstan",
"ke" => "Kenya",
"ki" => "Kiribati",
- "kp" => "Korea, Democratic People's Republic of",
- "kr" => "Korea, Republic of",
+ "kp" => "Korea, Democratic People's Republic",
+ "kr" => "Korea, Republic",
"kw" => "Kuwait",
"kg" => "Kyrgyzstan",
"la" => "Lao People's Democratic Republic",
@@ -422,7 +455,7 @@ class google{
"lt" => "Lithuania",
"lu" => "Luxembourg",
"mo" => "Macao",
- "mk" => "Macedonia, the Former Yugosalv Republic of",
+ "mk" => "Macedonia, the Former Yugosalv Republic",
"mg" => "Madagascar",
"mw" => "Malawi",
"my" => "Malaysia",
@@ -435,8 +468,8 @@ class google{
"mu" => "Mauritius",
"yt" => "Mayotte",
"mx" => "Mexico",
- "fm" => "Micronesia, Federated States of",
- "md" => "Moldova, Republic of",
+ "fm" => "Micronesia, Federated States",
+ "md" => "Moldova, Republic",
"mc" => "Monaco",
"mn" => "Mongolia",
"ms" => "Montserrat",
@@ -506,7 +539,7 @@ class google{
"sy" => "Syrian Arab Republic",
"tw" => "Taiwan, Province of China",
"tj" => "Tajikistan",
- "tz" => "Tanzania, United Republic of",
+ "tz" => "Tanzania, United Republic",
"th" => "Thailand",
"tl" => "Timor-Leste",
"tg" => "Togo",
@@ -603,14 +636,20 @@ class google{
"i" => "Icon"
]
],
- "color" => [ // tbs=ic:<color>
+ "colortype" => [ // imgColorType=<color>
+ "display" => "Color type",
+ "option" => [
+ "any" => "Any color type",
+ "color" => "Colored",
+ "gray" => "Gray",
+ "mono" => "Black & white",
+ "trans" => "Transparent"
+ ]
+ ],
+ "color" => [ // imgDominantColor=<color>
"display" => "Color",
"option" => [
"any" => "Any color",
- "gray" => "Black and white",
- "trans" => "Transparent",
- // from here, format is
- // tbs=specific,isc:<color>
"red" => "Red",
"orange" => "Orange",
"yellow" => "Yellow",
@@ -625,13 +664,15 @@ class google{
"brown" => "Brown"
]
],
- "type" => [ // tbs=itp:<type>
+ "type" => [ // imgType=<type>
"display" => "Type",
"option" => [
"any" => "Any type",
+ "face" => "Faces",
"clipart" => "Clip Art",
"lineart" => "Line Drawing",
- "animated" => "GIF"
+ "stock" => "Stock",
+ "animated" => "Animated"
]
],
"rights" => [ // tbs=il:<rights>
@@ -694,6 +735,52 @@ class google{
public function web($get){
+ $search = $get["s"];
+ $country = $get["country"];
+ $nsfw = $get["nsfw"];
+ $lang = $get["lang"];
+ $older = $get["older"];
+ $newer = $get["newer"];
+
+ $params = [
+ "num" => 20 // get 20 results
+ ];
+
+ // country
+ if($country != "any"){
+
+ $params["gl"] = $country;
+ }
+
+ // nsfw
+ $params["safe"] = $nsfw == "yes" ? "off" : "active";
+
+ // language
+ if($lang != "any"){
+
+ $params["lr"] = "lang_" . $lang;
+ }
+
+ // &sort=review-date:r:20090301:20090430
+ $older = $older === false ? false : date("Ymd", $older);
+ $newer = $newer === false ? false : date("Ymd", $newer);
+
+ if(
+ $older !== false &&
+ $newer === false
+ ){
+
+ $newer = date("Ymd", time());
+ }
+
+ if(
+ $older !== false ||
+ $newer !== false
+ ){
+
+ $params["sort"] = "review-date:r:" . $older . ":" . $newer;
+ }
+
$handle = fopen("scraper/google.html", "r");
$html = fread($handle, filesize("scraper/google.html"));
fclose($handle);
@@ -735,15 +822,44 @@ class google{
preg_match(
'/google\.ldi=({[^}]+})/',
$html,
- $js_image
+ $this->js_image
);
- if(count($js_image) !== 0){
+ if(count($this->js_image) !== 0){
- $js_image = json_decode($js_image[1], true);
+ $this->js_image = json_decode($this->js_image[1], true);
}else{
- $js_image = [];
+ $this->js_image = [];
+ }
+
+ // additional js_images present in <script> tags
+ // ugh i fucking hate you
+ $scripts =
+ $this->fuckhtml
+ ->getElementsByTagName("script");
+
+ foreach($scripts as $script){
+
+ if(!isset($script["innerHTML"])){
+
+ continue;
+ }
+
+ preg_match_all(
+ '/var s=\'(data:image[^\']+)\';var i=\[\'([^\']+)\'];/',
+ $script["innerHTML"],
+ $image_grep
+ );
+
+ if(count($image_grep[0]) !== 0){
+
+ $this->js_image[trim($image_grep[2][0])] =
+ $this->fuckhtml
+ ->getTextContent(
+ $image_grep[1][0]
+ );
+ }
}
// get nodes
@@ -859,56 +975,139 @@ class google{
if(count($carousel_title) !== 0){
- if(
- $this->fuckhtml
- ->getTextContent(
- $carousel_title[0]
+ switch(
+ strtolower(
+ $this->fuckhtml
+ ->getTextContent(
+ $carousel_title[0]
+ )
)
- == "Top stories"
){
- $img =
- $this->fuckhtml
- ->getElementsByTagName("img");
-
- if(
- count($img) !== 0 &&
- isset($img[0]["attributes"]["id"]) &&
- isset($js_image[$img[0]["attributes"]["id"]])
- ){
+ case "top stories":
+ $img =
+ $this->fuckhtml
+ ->getElementsByTagName("img");
- $img = [
- "url" => $js_image[$img[0]["attributes"]["id"]],
- "ratio" => "16:9"
- ];
- }else{
+ if(
+ count($img) !== 0 &&
+ isset($img[0]["attributes"]["id"]) &&
+ isset($this->js_image[$img[0]["attributes"]["id"]])
+ ){
+
+ $img = [
+ "url" => $this->getimage($img[0]["attributes"]["id"]),
+ "ratio" => "16:9"
+ ];
+ }else{
+
+ $img = [
+ "url" => null,
+ "ratio" => null
+ ];
+ }
- $img = [
- "url" => null,
- "ratio" => null
+ /*
+ Is a news node
+ */
+ $out["news"][] = [
+ "title" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $grey_node[0]
+ ),
+ "description" => null,
+ "date" =>
+ strtotime(
+ explode(
+ "\n",
+ $grey_node[1]["innerHTML"]
+ )[1]
+ ),
+ "thumb" => $img,
+ "url" => $url
];
- }
+ break;
- /*
- Is a news node
- */
- $out["news"][] = [
- "title" =>
+ case "images":
+
+ /*
+ We found an image
+ */
+ $imagedata =
$this->fuckhtml
- ->getTextContent(
- $grey_node[0]
- ),
- "description" => null,
- "date" =>
- strtotime(
- explode(
- "\n",
- $grey_node[1]["innerHTML"]
- )[1]
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "display" => "block",
+ "background-color" => "#fff",
+ "border-radius" => "8px",
+ "-webkit-box-shadow" => "0 1px 6px rgba(32, 33, 36, 0.28)",
+ "overflow" => "hidden"
+ ],
+ self::is_class
+ ),
+ "a"
+ );
+
+ if(count($imagedata) === 0){
+
+ break;
+ }
+
+ $imagedata = $imagedata[0];
+
+ // https://www.google.com/imgres?imgurl=https://upload.wikimedia.org/wikipedia/commons/thumb/9/9d/Joe_Biden_presidential_portrait_%2528cropped%2529.jpg/220px-Joe_Biden_presidential_portrait_%2528cropped%2529.jpg&imgrefurl=https://en.wikipedia.org/wiki/President_of_the_United_States&h=293&w=220&tbnid=kkQHBIAMuTitdM&q=who+is+the+president+of+the+united+states&tbnh=115&tbnw=86&usg=AI4_-kQVKi-K2zTGmVkS75_Fo6VldpPxsg&vet=1&docid=d2vgvyYSkU0hiM&sa=X&ved=2ahUKEwjKrMT17KyAAxV1j4kEHRAVCoYQ9QF6BAgFEAQ
+ parse_str(
+ parse_url(
+ $this->fuckhtml
+ ->getTextContent(
+ $imagedata["attributes"]["href"]
+ ),
+ PHP_URL_QUERY
),
- "thumb" => $img,
- "url" => $url
- ];
+ $params
+ );
+
+ $image =
+ $this->fuckhtml
+ ->getElementsByTagName("img")[0];
+
+ if(isset($this->js_image[$image["attributes"]["id"]])){
+
+ $thumbimg = $this->getimage($image["attributes"]["id"]);
+ }else{
+
+ $thumbimg =
+ $this->fuckhtml
+ ->getTextContent(
+ $image["attributes"]["src"]
+ );
+ }
+
+ $out["image"][] = [
+ "title" =>
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $image["attributes"]["alt"]
+ )
+ ),
+ "source" => [
+ [
+ "url" => $params["imgurl"],
+ "width" => (int)$params["w"],
+ "height" => (int)$params["h"]
+ ],
+ [
+ "url" => $thumbimg,
+ "width" => (int)$params["tbnw"],
+ "height" => (int)$params["tbnh"]
+ ]
+ ],
+ "url" => $params["imgrefurl"]
+ ];
+ break;
}
}else{
@@ -1025,12 +1224,11 @@ class google{
if(
count($thumb) !== 0 &&
- isset($js_image[$thumb[0]["attributes"]["id"]])
+ isset($this->js_image[$thumb[0]["attributes"]["id"]])
){
$thumb = [
- "url" =>
- $js_image[$thumb[0]["attributes"]["id"]],
+ "url" => $this->getimage($thumb[0]["attributes"]["id"]),
"ratio" => "1:1"
];
}else{
@@ -1085,6 +1283,8 @@ class google{
foreach($categories as $cat){
+ $container["innerHTML"] = str_replace($cat, "", $container["innerHTML"]);
+
$cat = explode(":", $cat, 2);
$table[
@@ -1139,59 +1339,6 @@ class google{
}
}
- // check if traversed div is the description
- /*
- if(
- count(
- $this->fuckhtml
- ->getElementsByTagName("*")
- ) === 0
- ){
-
- $description =
- $this->fuckhtml
- ->getTextContent($inner_category);
- }else{
-
- $this->
-
- // we need to traverse description struct
- foreach($inner_category as $category){
-
- // detect description
- $this->fuckhtml->load($category);
-
- $spans =
- $this->fuckhtml
- ->getElementsByTagName("span");
-
- $is_desc = false;
- $is_first_span = true;
-
- foreach($spans as $span){
-
- // get rating
- if(isset($span["attributes"]["aria-hidden"])){
-
- $table["Rating"] = $span["innerHTML"] . "/5";
- continue;
- }
-
- // get date posted
- if(
- $is_first_span &&
- $date_tmp = strtotime($span["innerHTML"])
- ){
-
- $date = $date_tmp;
- continue;
- }
-
- $is_first_span = false;
- }
- }
- }*/
-
// get sublinks
$this->fuckhtml->load($container["innerHTML"]);
@@ -1285,6 +1432,30 @@ class google{
$related
);
}
+
+ continue;
+ }
+
+ /*
+ Check for spelling autocorrect
+ */
+ $spelling =
+ $this->fuckhtml
+ ->getElementById(
+ "scl"
+ );
+
+ if($spelling){
+
+ $out["spelling"] = [
+ "type" => "including",
+ "using" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $spelling
+ ),
+ "correction" => $search
+ ];
}
/*
@@ -1306,16 +1477,361 @@ class google{
if(count($nextpage) !== 0){
$out["npt"] =
- explode(
- "?",
- $this->fuckhtml
- ->getTextContent(
- $nextpage[0]
- ["attributes"]
- ["href"]
+ $this->nextpage
+ ->store(
+ explode(
+ "?",
+ $this->fuckhtml
+ ->getTextContent(
+ $nextpage[0]
+ ["attributes"]
+ ["href"]
+ )
+ )[1],
+ "web"
+ );
+
+ continue;
+ }
+
+ /*
+ Check for DMCA complaint div
+ */
+ $dmca_table = false;
+
+ $text =
+ $this->fuckhtml
+ ->getTextContent($container);
+
+ if(
+ stripos(
+ $text,
+ "In response to a complaint we received under the US Digital Millennium Copyright Act, we have removed"
+ ) !== false
+ ||
+ stripos(
+ $text,
+ "In response to multiple complaints we received under the US Digital Millennium Copyright Act, we have removed"
+ ) !== false
+ ){
+
+ $as =
+ $this->fuckhtml
+ ->getElementsByTagName("a");
+
+ array_shift($as);
+
+ $dmca_table = [
+ "title" => "Removed results",
+ "description" => [
+ [
+ "type" => "text",
+ "value" => "Google removed results due to DMCA complaints. You can view the removed links by visiting these:\n\n"
+ ]
+ ],
+ "url" => "https://support.google.com/legal/answer/1120734?visit_id=638260070062978894-2242290953",
+ "thumb" => null,
+ "table" => [],
+ "sublink" => []
+ ];
+
+ $i = 0;
+ $c = count($as);
+
+ foreach($as as $a){
+
+ $i++;
+ $u =
+ $this->decodeurl(
+ $a["attributes"]["href"]
+ );
+
+ $dmca_table["description"][] = [
+ "type" => "link",
+ "url" => $u,
+ "value" => $u
+ ];
+
+ if($i !== $c){
+
+ $dmca_table["description"][] = [
+ "type" => "text",
+ "value" => "\n"
+ ];
+ }
+ }
+
+ continue;
+ }
+
+ /*
+ Fallback to parsing it as an embed
+ */
+
+ $table = [
+ "title" => null,
+ "description" => [],
+ "url" => null,
+ "thumb" => null,
+ "table" => [],
+ "sublink" => []
+ ];
+
+ $parts =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "padding" => "12px 16px 12px"
+ ],
+ self::is_class
+ ),
+ "div"
+ );
+
+ if(count($parts) === 0){
+
+ continue;
+ }
+
+ $head = $parts[0];
+
+ $h3 =
+ $this->fuckhtml
+ ->getElementsByTagName("h3");
+
+ if(count($h3) !== 0){
+
+ $h3 = $h3[0];
+
+ $table["title"] =
+ $this->fuckhtml
+ ->getTextContent(
+ $h3
+ );
+
+ $head["innerHTML"] =
+ str_replace(
+ $h3["outerHTML"],
+ "",
+ $head["innerHTML"]
+ );
+
+ $table["description"][] =
+ [
+ "type" => "quote",
+ "value" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $head
+ )
+ ];
+ }
+
+ $audio =
+ $this->fuckhtml
+ ->getElementsByTagName("audio");
+
+ if(count($audio) !== 0){
+
+ $table["description"][] = [
+ "type" => "audio",
+ "url" =>
+ str_replace(
+ "http://",
+ "https://",
+ $this->fuckhtml
+ ->getTextContent(
+ $audio[0]["attributes"]["src"]
+ )
)
- )[1];
+ ];
}
+
+ if(count($parts) >= 2){
+
+ $this->fuckhtml->load($parts[1]);
+
+ $parts =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "padding-bottom" => "12px"
+ ],
+ self::is_class
+ ),
+ "div"
+ );
+
+ foreach($parts as $part){
+
+ $this->fuckhtml->load($part);
+
+ $lists =
+ $this->fuckhtml
+ ->getElementsByTagName("ol");
+
+ if(count($lists) !== 0){
+
+ foreach($lists as $list){
+
+ $this->fuckhtml->load($list);
+
+ $list_items =
+ $this->fuckhtml
+ ->getElementsByTagName("li");
+
+ $index = 0;
+
+ if(count($list_items) !== 0){
+
+ foreach($list_items as $list_item){
+
+ $index++;
+
+ $this->fuckhtml->load($list_item);
+
+ $list_subitems =
+ $this->fuckhtml
+ ->getElementsByTagName("div");
+
+ foreach($list_subitems as $subitem){
+
+ if($subitem["level"] !== 1){ continue; }
+
+ $this->fuckhtml->load($subitem);
+
+ $spans =
+ $this->fuckhtml
+ ->getElementsByTagName("span");
+
+ if(count($spans) !== 0){
+
+ $type = "quote";
+ }else{
+
+ $type = "text";
+ }
+
+ $value =
+ $this->fuckhtml
+ ->getTextContent(
+ $subitem
+ );
+
+ if($type == "text"){
+
+ $value = $index . ". " . $value;
+ }
+
+ $table["description"][] = [
+ "type" => $type,
+ "value" => $value
+ ];
+ }
+ }
+ }
+ }
+
+ continue;
+ }
+
+ // get title
+ $spans =
+ $this->fuckhtml
+ ->getElementsByTagName("span");
+
+ if(count($spans) !== 0){
+
+ foreach($spans as $span){
+
+ $part["innerHTML"] =
+ str_replace(
+ $span["outerHTML"],
+ "",
+ $part["innerHTML"]
+ );
+ }
+
+ if(
+ $this->fuckhtml
+ ->getTextContent(
+ $part
+ )
+ == ""
+ ){
+
+ $table["description"][] = [
+ "type" => "title",
+ "value" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $spans[0]
+ )
+ ];
+
+ continue;
+ }
+ }
+
+ // fallback to getting non-numbered list
+ $nlist =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "white-space" => "pre-line",
+ "word-wrap" => "break-word"
+ ],
+ self::is_class
+ ),
+ "div"
+ );
+
+ if(count($nlist) !== 0){
+
+ foreach($nlist as $nlist_item){
+
+ $text =
+ $this->fuckhtml
+ ->getTextContent($nlist_item);
+
+ if($text == ""){
+
+ continue;
+ }
+
+ $this->fuckhtml->load($nlist_item);
+
+ $spans =
+ $this->fuckhtml
+ ->getElementsByTagName("span");
+
+ if(count($spans) !== 0){
+
+ // is a quote node
+ $type = "quote";
+ }else{
+
+ $type = "text";
+ }
+
+ $table["description"][] = [
+ "type" => $type,
+ "value" => $text
+ ];
+ }
+ }
+ }
+ }
+
+ $out["answer"][] = $table;
+ }
+
+ if($dmca_table){
+
+ $out["answer"][] = $dmca_table;
}
return $out;
@@ -1323,6 +1839,55 @@ class google{
public function image($get){
+ $search = $get["s"];
+ $country = $get["country"];
+ $nsfw = $get["nsfw"];
+ $lang = $get["lang"];
+ $size = $get["size"];
+ $colortype = $get["colortype"];
+ $color = $get["color"];
+ $type = $get["type"];
+ $rights = $get["rights"];
+ $older = $get["older"];
+ $newer = $get["newer"];
+
+ $params = [];
+
+ // country
+ if($country != "any"){
+
+ $params["gl"] = $country;
+ }
+
+ // nsfw
+ $params["safe"] = $nsfw == "yes" ? "off" : "active";
+
+ // language
+ if($lang != "any"){
+
+ $params["lr"] = "lang_" . $lang;
+ }
+
+ // &sort=review-date:r:20090301:20090430
+ $older = $older === false ? false : date("Ymd", $older);
+ $newer = $newer === false ? false : date("Ymd", $newer);
+
+ if(
+ $older !== false &&
+ $newer === false
+ ){
+
+ $newer = date("Ymd", time());
+ }
+
+ if(
+ $older !== false ||
+ $newer !== false
+ ){
+
+ $params["sort"] = "review-date:r:" . $older . ":" . $newer;
+ }
+
$handle = fopen("scraper/google-img.html", "r");
$html = fread($handle, filesize("scraper/google-img.html"));
fclose($handle);
@@ -1380,17 +1945,23 @@ class google{
$imgvl = $imgvl[1];
- $out["npt"] = [
- "q" => $get["s"],
- "tbm" => "isch",
- "async" => "_id:islrg_c,_fmt:html",
- "asearch" => "ichunklite",
- "ved" => $ved,
- "vet" => "1" . $ved . "..i",
- "start" => 100,
- "ijn" => 1,
- "imgvl" => $imgvl
- ];
+ $out["npt"] =
+ $this->nextpage->store(
+ json_encode(
+ [
+ "q" => $get["s"],
+ "tbm" => "isch",
+ "async" => "_id:islrg_c,_fmt:html",
+ "asearch" => "ichunklite",
+ "ved" => $ved,
+ "vet" => "1" . $ved . "..i",
+ "start" => 100,
+ "ijn" => 1,
+ "imgvl" => $imgvl
+ ]
+ ),
+ "images"
+ );
}
foreach($images as $image){
@@ -1529,6 +2100,29 @@ class google{
return $tags;
}
+ private function getimage($id){
+ // Look up $id in $this->js_image and return the stored image string, or null when the entry is missing or empty.
+ if(
+ isset($this->js_image[$id]) &&
+ $this->js_image[$id] != ""
+ ){
+ // Entries containing "data:image" (case-insensitive, anywhere in the string) are cut at the first literal "\x3d" sequence.
+ if(stripos($this->js_image[$id], "data:image") !== false){
+ // NOTE(review): "\x3d" looks like a JS-escaped "=" (base64 padding) carried over from the <script> source - confirm.
+ return
+ explode(
+ "\\x3d", // PHP string for a backslash followed by "x3d" (4 literal characters), not the "=" byte
+ $this->js_image[$id],
+ 2 // split at most once; only the part before the first delimiter is kept
+ )[0];
+ }
+ // Any other non-empty entry is returned unchanged.
+ return $this->js_image[$id];
+ }
+ // No usable entry for this id.
+ return null;
+ }
+
private function decodeurl($url){
preg_match(
@@ -1559,4 +2153,3 @@ class google{
return rtrim($title, ".… \t\n\r\0\x0B");
}
}
-