diff options
author | lolcat <will@lolcat.ca> | 2023-07-26 19:03:06 -0400 |
---|---|---|
committer | lolcat <will@lolcat.ca> | 2023-07-26 19:03:06 -0400 |
commit | 16ee0b368fcf24b48574172726e32a19c275d691 (patch) | |
tree | 563a53aa23bda137c3651d782990fd2e8fb39c13 /scraper/google.php | |
parent | bca265aea67ec62499aaa113a6490ce9ec7fe730 (diff) |
felt quirky, might commit later
Diffstat (limited to 'scraper/google.php')
-rw-r--r-- | scraper/google.php | 1065 |
1 files changed, 829 insertions, 236 deletions
diff --git a/scraper/google.php b/scraper/google.php index 6a746f7..df10754 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -18,37 +18,44 @@ class google{ switch($page){ - case "web": return [];/* + case "web": return [ - "country" => [ + "country" => [ // gl=<country> "display" => "Country", "option" => [ - "zz" => "Instance region", + "any" => "Instance's country", "af" => "Afghanistan", "al" => "Albania", "dz" => "Algeria", "as" => "American Samoa", "ad" => "Andorra", "ao" => "Angola", - "ag" => "Antigua & Barbuda", + "ai" => "Anguilla", + "aq" => "Antarctica", + "ag" => "Antigua and Barbuda", "ar" => "Argentina", "am" => "Armenia", + "aw" => "Aruba", "au" => "Australia", "at" => "Austria", "az" => "Azerbaijan", "bs" => "Bahamas", "bh" => "Bahrain", "bd" => "Bangladesh", + "bb" => "Barbados", "by" => "Belarus", "be" => "Belgium", "bz" => "Belize", "bj" => "Benin", + "bm" => "Bermuda", "bt" => "Bhutan", "bo" => "Bolivia", - "ba" => "Bosnia & Herzegovina", + "ba" => "Bosnia and Herzegovina", "bw" => "Botswana", + "bv" => "Bouvet Island", "br" => "Brazil", - "bn" => "Brunei", + "io" => "British Indian Ocean Territory", + "bn" => "Brunei Darussalam", "bg" => "Bulgaria", "bf" => "Burkina Faso", "bi" => "Burundi", @@ -56,19 +63,24 @@ class google{ "cm" => "Cameroon", "ca" => "Canada", "cv" => "Cape Verde", + "ky" => "Cayman Islands", "cf" => "Central African Republic", "td" => "Chad", "cl" => "Chile", + "cn" => "China", + "cx" => "Christmas Island", + "cc" => "Cocos (Keeling) Islands", "co" => "Colombia", - "cg" => "Congo - Brazzaville", - "cd" => "Congo - Kinshasa", + "km" => "Comoros", + "cg" => "Congo", + "cd" => "Congo, the Democratic Republic", "ck" => "Cook Islands", "cr" => "Costa Rica", - "ci" => "Côte d’Ivoire", + "ci" => "Cote D'ivoire", "hr" => "Croatia", "cu" => "Cuba", "cy" => "Cyprus", - "cz" => "Czechia", + "cz" => "Czech Republic", "dk" => "Denmark", "dj" => "Djibouti", "dm" => "Dominica", @@ -76,11 +88,18 @@ class google{ "ec" => "Ecuador", "eg" => "Egypt", "sv" => "El Salvador", + "gq" => "Equatorial Guinea", + "er" => "Eritrea", "ee" => "Estonia", "et" => "Ethiopia", + "fk" => "Falkland Islands (Malvinas)", + "fo" => "Faroe Islands", "fj" => "Fiji", "fi" => "Finland", "fr" => "France", + "gf" => "French Guiana", + "pf" => "French Polynesia", + "tf" => "French Southern Territories", "ga" => "Gabon", "gm" => "Gambia", "ge" => "Georgia", @@ -89,86 +108,111 @@ class google{ "gi" => "Gibraltar", "gr" => "Greece", "gl" => "Greenland", + "gd" => "Grenada", + "gp" => "Guadeloupe", + "gu" => "Guam", "gt" => "Guatemala", - "gg" => "Guernsey", + "gn" => "Guinea", + "gw" => "Guinea-Bissau", "gy" => "Guyana", "ht" => "Haiti", + "hm" => "Heard Island and Mcdonald Islands", + "va" => "Holy See (Vatican City State)", "hn" => "Honduras", "hk" => "Hong Kong", "hu" => "Hungary", "is" => "Iceland", "in" => "India", "id" => "Indonesia", + "ir" => "Iran, Islamic Republic", "iq" => "Iraq", "ie" => "Ireland", - "im" => "Isle of Man", "il" => "Israel", "it" => "Italy", "jm" => "Jamaica", "jp" => "Japan", - "je" => "Jersey", "jo" => "Jordan", "kz" => "Kazakhstan", "ke" => "Kenya", "ki" => "Kiribati", + "kp" => "Korea, Democratic People's Republic", + "kr" => "Korea, Republic", "kw" => "Kuwait", "kg" => "Kyrgyzstan", - "la" => "Laos", + "la" => "Lao People's Democratic Republic", "lv" => "Latvia", "lb" => "Lebanon", "ls" => "Lesotho", - "ly" => "Libya", + "lr" => "Liberia", + "ly" => "Libyan Arab Jamahiriya", "li" => "Liechtenstein", "lt" => "Lithuania", "lu" => "Luxembourg", + "mo" => "Macao", + "mk" => "Macedonia, the Former Yugosalv Republic", "mg" => "Madagascar", "mw" => "Malawi", "my" => "Malaysia", "mv" => "Maldives", "ml" => "Mali", "mt" => "Malta", + "mh" => "Marshall Islands", + "mq" => "Martinique", + "mr" => "Mauritania", "mu" => "Mauritius", + "yt" => "Mayotte", "mx" => "Mexico", - "fm" => "Micronesia", - "md" => "Moldova", + "fm" => "Micronesia, Federated States", + "md" => "Moldova, Republic", + "mc" => "Monaco", "mn" => "Mongolia", - "me" => "Montenegro", + "ms" => "Montserrat", "ma" => "Morocco", "mz" => "Mozambique", - "mm" => "Myanmar (Burma)", + "mm" => "Myanmar", "na" => "Namibia", "nr" => "Nauru", "np" => "Nepal", "nl" => "Netherlands", + "an" => "Netherlands Antilles", + "nc" => "New Caledonia", "nz" => "New Zealand", "ni" => "Nicaragua", "ne" => "Niger", "ng" => "Nigeria", "nu" => "Niue", - "mk" => "North Macedonia", + "nf" => "Norfolk Island", + "mp" => "Northern Mariana Islands", "no" => "Norway", "om" => "Oman", "pk" => "Pakistan", - "ps" => "Palestine", + "pw" => "Palau", + "ps" => "Palestinian Territory, Occupied", "pa" => "Panama", "pg" => "Papua New Guinea", "py" => "Paraguay", "pe" => "Peru", "ph" => "Philippines", - "pn" => "Pitcairn Islands", + "pn" => "Pitcairn", "pl" => "Poland", "pt" => "Portugal", "pr" => "Puerto Rico", "qa" => "Qatar", + "re" => "Reunion", "ro" => "Romania", - "ru" => "Russia", + "ru" => "Russian Federation", "rw" => "Rwanda", + "sh" => "Saint Helena", + "kn" => "Saint Kitts and Nevis", + "lc" => "Saint Lucia", + "pm" => "Saint Pierre and Miquelon", + "vc" => "Saint Vincent and the Grenadines", "ws" => "Samoa", "sm" => "San Marino", - "st" => "São Tomé & Príncipe", + "st" => "Sao Tome and Principe", "sa" => "Saudi Arabia", "sn" => "Senegal", - "rs" => "Serbia", + "cs" => "Serbia and Montenegro", "sc" => "Seychelles", "sl" => "Sierra Leone", "sg" => "Singapore", @@ -177,36 +221,46 @@ class google{ "sb" => "Solomon Islands", "so" => "Somalia", "za" => "South Africa", - "kr" => "South Korea", + "gs" => "South Georgia and the South Sandwich Islands", "es" => "Spain", "lk" => "Sri Lanka", - "sh" => "St. Helena", - "vc" => "St. Vincent & Grenadines", + "sd" => "Sudan", "sr" => "Suriname", + "sj" => "Svalbard and Jan Mayen", + "sz" => "Swaziland", "se" => "Sweden", "ch" => "Switzerland", - "tw" => "Taiwan", + "sy" => "Syrian Arab Republic", + "tw" => "Taiwan, Province of China", "tj" => "Tajikistan", - "tz" => "Tanzania", + "tz" => "Tanzania, United Republic", "th" => "Thailand", "tl" => "Timor-Leste", "tg" => "Togo", + "tk" => "Tokelau", "to" => "Tonga", - "tt" => "Trinidad & Tobago", + "tt" => "Trinidad and Tobago", "tn" => "Tunisia", - "tr" => "Türkiye", + "tr" => "Turkey", "tm" => "Turkmenistan", - "vi" => "U.S. Virgin Islands", + "tc" => "Turks and Caicos Islands", + "tv" => "Tuvalu", "ug" => "Uganda", "ua" => "Ukraine", "ae" => "United Arab Emirates", - "gb" => "United Kingdom", + "uk" => "United Kingdom", "us" => "United States", + "um" => "United States Minor Outlying Islands", "uy" => "Uruguay", "uz" => "Uzbekistan", "vu" => "Vanuatu", "ve" => "Venezuela", - "vn" => "Vietnam", + "vn" => "Viet Nam", + "vg" => "Virgin Islands, British", + "vi" => "Virgin Islands, U.S.", + "wf" => "Wallis and Futuna", + "eh" => "Western Sahara", + "ye" => "Yemen", "zm" => "Zambia", "zw" => "Zimbabwe" ] @@ -214,81 +268,60 @@ class google{ "nsfw" => [ "display" => "NSFW", "option" => [ - "yes" => "Yes", - "no" => "No" + "yes" => "Yes", // safe=active + "no" => "No" // safe=off ] ], - "lang" => [ // prefix with lang_ + "lang" => [ // lr=<lang> (prefix lang with "lang_") "display" => "Language", "option" => [ "any" => "Any language", - "af" => "Afrikaans", - "ca" => "català", - "cs" => "čeština", - "da" => "dansk", - "de" => "Deutsch", - "et" => "eesti", + "ar" => "Arabic", + "bg" => "Bulgarian", + "ca" => "Catalan", + "cs" => "Czech", + "da" => "Danish", + "de" => "German", + "el" => "Greek", "en" => "English", - "es" => "español", - "eo" => "esperanto", - "tl" => "Filipino", - "fr" => "français", - "hr" => "hrvatski", - "id" => "Indonesia", - "is" => "íslenska", - "it" => "italiano", - "sw" => "Kiswahili", - "lv" => "latviešu", - "lt" => "lietuvių", - "hu" => "magyar", - "nl" => "Nederlands", - "no" => "norsk", - "pl" => "polski", - "pt" => "português", - "ro" => "română", - "sk" => "slovenčina", - "sl" => "slovenščina", - "fi" => "suomi", - "sv" => "svenska", - "vi" => "Tiếng Việt", - "tr" => "Türkçe", - "el" => "Ελληνικά", - "be" => "беларуская", - "bg" => "български", - "ru" => "русский", - "sr" => "српски", - "uk" => "українська", - "hy" => "հայերեն", - "iw" => "עברית", - "ar" => "العربية", - "fa" => "فارسی", - "hi" => "हिन्दी", - "th" => "ไทย", - "ko" => "한국어", - "zh-CN" => "中文 (简体)", - "zh-TW" => "中文 (繁體)", - "ja" => "日本語" + "es" => "Spanish", + "et" => "Estonian", + "fi" => "Finnish", + "fr" => "French", + "hr" => "Croatian", + "hu" => "Hungarian", + "id" => "Indonesian", + "is" => "Icelandic", + "it" => "Italian", + "iw" => "Hebrew", + "ja" => "Japanese", + "ko" => "Korean", + "lt" => "Lithuanian", + "lv" => "Latvian", + "nl" => "Dutch", + "no" => "Norwegian", + "pl" => "Polish", + "pt" => "Portuguese", + "ro" => "Romanian", + "ru" => "Russian", + "sk" => "Slovak", + "sl" => "Slovenian", + "sr" => "Serbian", + "sv" => "Swedish", + "tr" => "Turkish", + "zh-CN" => "Chinese (Simplified)", + "zh-TW" => "Chinese (Traditional)" ] ], - "time" => [ - "display" => "Time posted", - "option" => [ - "any" => "Any time", - "h" => "Last hour", - "d" => "Last 24 hours", - "w" => "Last week", - "m" => "Last month", - "y" => "Last year" - ] + "newer" => [ // &sort=review-date:r:20090301:20090430 + "display" => "Newer than", + "option" => "_DATE" ], - "verbatim" => [ - "display" => "Verbatim", - "option" => [ - "no" => "No", - "yes" => "Yes" - ] + "older" => [ + "display" => "Older than", + "option" => "_DATE" ] - ];*/ + ]; break; case "images": @@ -346,7 +379,7 @@ class google{ "co" => "Colombia", "km" => "Comoros", "cg" => "Congo", - "cd" => "Congo, the Democratic Republic of the", + "cd" => "Congo, the Democratic Republic", "ck" => "Cook Islands", "cr" => "Costa Rica", "ci" => "Cote D'ivoire", @@ -397,7 +430,7 @@ class google{ "is" => "Iceland", "in" => "India", "id" => "Indonesia", - "ir" => "Iran, Islamic Republic of", + "ir" => "Iran, Islamic Republic", "iq" => "Iraq", "ie" => "Ireland", "il" => "Israel", @@ -408,8 +441,8 @@ class google{ "kz" => "Kazakhstan", "ke" => "Kenya", "ki" => "Kiribati", - "kp" => "Korea, Democratic People's Republic of", - "kr" => "Korea, Republic of", + "kp" => "Korea, Democratic People's Republic", + "kr" => "Korea, Republic", "kw" => "Kuwait", "kg" => "Kyrgyzstan", "la" => "Lao People's Democratic Republic", @@ -422,7 +455,7 @@ class google{ "lt" => "Lithuania", "lu" => "Luxembourg", "mo" => "Macao", - "mk" => "Macedonia, the Former Yugosalv Republic of", + "mk" => "Macedonia, the Former Yugosalv Republic", "mg" => "Madagascar", "mw" => "Malawi", "my" => "Malaysia", @@ -435,8 +468,8 @@ class google{ "mu" => "Mauritius", "yt" => "Mayotte", "mx" => "Mexico", - "fm" => "Micronesia, Federated States of", - "md" => "Moldova, Republic of", + "fm" => "Micronesia, Federated States", + "md" => "Moldova, Republic", "mc" => "Monaco", "mn" => "Mongolia", "ms" => "Montserrat", @@ -506,7 +539,7 @@ class google{ "sy" => "Syrian Arab Republic", "tw" => "Taiwan, Province of China", "tj" => "Tajikistan", - "tz" => "Tanzania, United Republic of", + "tz" => "Tanzania, United Republic", "th" => "Thailand", "tl" => "Timor-Leste", "tg" => "Togo", @@ -603,14 +636,20 @@ class google{ "i" => "Icon" ] ], - "color" => [ // tbs=ic:<color> + "colortype" => [ // imgColorType=<color> + "display" => "Color type", + "option" => [ + "any" => "Any color type", + "color" => "Colored", + "gray" => "Gray", + "mono" => "Black & white", + "trans" => "Transparent" + ] + ], + "color" => [ // imgDominantColor=<color> "display" => "Color", "option" => [ "any" => "Any color", - "gray" => "Black and white", - "trans" => "Transparent", - // from here, format is - // tbs=specific,isc:<color> "red" => "Red", "orange" => "Orange", "yellow" => "Yellow", @@ -625,13 +664,15 @@ class google{ "brown" => "Brown" ] ], - "type" => [ // tbs=itp:<type> + "type" => [ // imgType=<type> "display" => "Type", "option" => [ "any" => "Any type", + "face" => "Faces", "clipart" => "Clip Art", "lineart" => "Line Drawing", - "animated" => "GIF" + "stock" => "Stock", + "animated" => "Animated" ] ], "rights" => [ // tbs=il:<rights> @@ -694,6 +735,52 @@ class google{ public function web($get){ + $search = $get["s"]; + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $lang = $get["lang"]; + $older = $get["older"]; + $newer = $get["newer"]; + + $params = [ + "num" => 20 // get 20 results + ]; + + // country + if($country != "any"){ + + $params["gl"] = $country; + } + + // nsfw + $params["safe"] = $nsfw == "yes" ? "off" : "active"; + + // language + if($lang != "any"){ + + $params["lr"] = "lang_" . $lang; + } + + // &sort=review-date:r:20090301:20090430 + $older = $older === false ? false : date("Ymd", $older); + $newer = $newer === false ? false : date("Ymd", $newer); + + if( + $older !== false && + $newer === false + ){ + + $newer = date("Ymd", time()); + } + + if( + $older !== false || + $newer !== false + ){ + + $params["sort"] = "review-date:r:" . $older . ":" . $newer; + } + $handle = fopen("scraper/google.html", "r"); $html = fread($handle, filesize("scraper/google.html")); fclose($handle); @@ -735,15 +822,44 @@ class google{ preg_match( '/google\.ldi=({[^}]+})/', $html, - $js_image + $this->js_image ); - if(count($js_image) !== 0){ + if(count($this->js_image) !== 0){ - $js_image = json_decode($js_image[1], true); + $this->js_image = json_decode($this->js_image[1], true); }else{ - $js_image = []; + $this->js_image = []; + } + + // additional js_images present in <script> tags + // ugh i fucking hate you + $scripts = + $this->fuckhtml + ->getElementsByTagName("script"); + + foreach($scripts as $script){ + + if(!isset($script["innerHTML"])){ + + continue; + } + + preg_match_all( + '/var s=\'(data:image[^\']+)\';var i=\[\'([^\']+)\'];/', + $script["innerHTML"], + $image_grep + ); + + if(count($image_grep[0]) !== 0){ + + $this->js_image[trim($image_grep[2][0])] = + $this->fuckhtml + ->getTextContent( + $image_grep[1][0] + ); + } } // get nodes @@ -859,56 +975,139 @@ class google{ if(count($carousel_title) !== 0){ - if( - $this->fuckhtml - ->getTextContent( - $carousel_title[0] + switch( + strtolower( + $this->fuckhtml + ->getTextContent( + $carousel_title[0] + ) ) - == "Top stories" ){ - $img = - $this->fuckhtml - ->getElementsByTagName("img"); - - if( - count($img) !== 0 && - isset($img[0]["attributes"]["id"]) && - isset($js_image[$img[0]["attributes"]["id"]]) - ){ + case "top stories": + $img = + $this->fuckhtml + ->getElementsByTagName("img"); - $img = [ - "url" => $js_image[$img[0]["attributes"]["id"]], - "ratio" => "16:9" - ]; - }else{ + if( + count($img) !== 0 && + isset($img[0]["attributes"]["id"]) && + isset($this->js_image[$img[0]["attributes"]["id"]]) + ){ + + $img = [ + "url" => $this->getimage($img[0]["attributes"]["id"]), + "ratio" => "16:9" + ]; + }else{ + + $img = [ + "url" => null, + "ratio" => null + ]; + } - $img = [ - "url" => null, - "ratio" => null + /* + Is a news node + */ + $out["news"][] = [ + "title" => + $this->fuckhtml + ->getTextContent( + $grey_node[0] + ), + "description" => null, + "date" => + strtotime( + explode( + "\n", + $grey_node[1]["innerHTML"] + )[1] + ), + "thumb" => $img, + "url" => $url ]; - } + break; - /* - Is a news node - */ - $out["news"][] = [ - "title" => + case "images": + + /* + We found an image + */ + $imagedata = $this->fuckhtml - ->getTextContent( - $grey_node[0] - ), - "description" => null, - "date" => - strtotime( - explode( - "\n", - $grey_node[1]["innerHTML"] - )[1] + ->getElementsByClassName( + $this->findstyles( + [ + "display" => "block", + "background-color" => "#fff", + "border-radius" => "8px", + "-webkit-box-shadow" => "0 1px 6px rgba(32, 33, 36, 0.28)", + "overflow" => "hidden" + ], + self::is_class + ), + "a" + ); + + if(count($imagedata) === 0){ + + break; + } + + $imagedata = $imagedata[0]; + + // https://www.google.com/imgres?imgurl=https://upload.wikimedia.org/wikipedia/commons/thumb/9/9d/Joe_Biden_presidential_portrait_%2528cropped%2529.jpg/220px-Joe_Biden_presidential_portrait_%2528cropped%2529.jpg&imgrefurl=https://en.wikipedia.org/wiki/President_of_the_United_States&h=293&w=220&tbnid=kkQHBIAMuTitdM&q=who+is+the+president+of+the+united+states&tbnh=115&tbnw=86&usg=AI4_-kQVKi-K2zTGmVkS75_Fo6VldpPxsg&vet=1&docid=d2vgvyYSkU0hiM&sa=X&ved=2ahUKEwjKrMT17KyAAxV1j4kEHRAVCoYQ9QF6BAgFEAQ + parse_str( + parse_url( + $this->fuckhtml + ->getTextContent( + $imagedata["attributes"]["href"] + ), + PHP_URL_QUERY ), - "thumb" => $img, - "url" => $url - ]; + $params + ); + + $image = + $this->fuckhtml + ->getElementsByTagName("img")[0]; + + if(isset($this->js_image[$image["attributes"]["id"]])){ + + $thumbimg = $this->getimage($image["attributes"]["id"]); + }else{ + + $thumbimg = + $this->fuckhtml + ->getTextContent( + $image["attributes"]["src"] + ); + } + + $out["image"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $image["attributes"]["alt"] + ) + ), + "source" => [ + [ + "url" => $params["imgurl"], + "width" => (int)$params["w"], + "height" => (int)$params["h"] + ], + [ + "url" => $thumbimg, + "width" => (int)$params["tbnw"], + "height" => (int)$params["tbnh"] + ] + ], + "url" => $params["imgrefurl"] + ]; + break; } }else{ @@ -1025,12 +1224,11 @@ class google{ if( count($thumb) !== 0 && - isset($js_image[$thumb[0]["attributes"]["id"]]) + isset($this->js_image[$thumb[0]["attributes"]["id"]]) ){ $thumb = [ - "url" => - $js_image[$thumb[0]["attributes"]["id"]], + "url" => $this->getimage($thumb[0]["attributes"]["id"]), "ratio" => "1:1" ]; }else{ @@ -1085,6 +1283,8 @@ class google{ foreach($categories as $cat){ + $container["innerHTML"] = str_replace($cat, "", $container["innerHTML"]); + $cat = explode(":", $cat, 2); $table[ @@ -1139,59 +1339,6 @@ class google{ } } - // check if traversed div is the description - /* - if( - count( - $this->fuckhtml - ->getElementsByTagName("*") - ) === 0 - ){ - - $description = - $this->fuckhtml - ->getTextContent($inner_category); - }else{ - - $this-> - - // we need to traverse description struct - foreach($inner_category as $category){ - - // detect description - $this->fuckhtml->load($category); - - $spans = - $this->fuckhtml - ->getElementsByTagName("span"); - - $is_desc = false; - $is_first_span = true; - - foreach($spans as $span){ - - // get rating - if(isset($span["attributes"]["aria-hidden"])){ - - $table["Rating"] = $span["innerHTML"] . "/5"; - continue; - } - - // get date posted - if( - $is_first_span && - $date_tmp = strtotime($span["innerHTML"]) - ){ - - $date = $date_tmp; - continue; - } - - $is_first_span = false; - } - } - }*/ - // get sublinks $this->fuckhtml->load($container["innerHTML"]); @@ -1285,6 +1432,30 @@ class google{ $related ); } + + continue; + } + + /* + Check for spelling autocorrect + */ + $spelling = + $this->fuckhtml + ->getElementById( + "scl" + ); + + if($spelling){ + + $out["spelling"] = [ + "type" => "including", + "using" => + $this->fuckhtml + ->getTextContent( + $spelling + ), + "correction" => $search + ]; } /* @@ -1306,16 +1477,361 @@ class google{ if(count($nextpage) !== 0){ $out["npt"] = - explode( - "?", - $this->fuckhtml - ->getTextContent( - $nextpage[0] - ["attributes"] - ["href"] + $this->nextpage + ->store( + explode( + "?", + $this->fuckhtml + ->getTextContent( + $nextpage[0] + ["attributes"] + ["href"] + ) + )[1], + "web" + ); + + continue; + } + + /* + Check for DMCA complaint div + */ + $dmca_table = false; + + $text = + $this->fuckhtml + ->getTextContent($container); + + if( + stripos( + $text, + "In response to a complaint we received under the US Digital Millennium Copyright Act, we have removed" + ) !== false + || + stripos( + $text, + "In response to multiple complaints we received under the US Digital Millennium Copyright Act, we have removed" + ) !== false + ){ + + $as = + $this->fuckhtml + ->getElementsByTagName("a"); + + array_shift($as); + + $dmca_table = [ + "title" => "Removed results", + "description" => [ + [ + "type" => "text", + "value" => "Google removed results due to DMCA complaints. You can view the removed links by visiting these:\n\n" + ] + ], + "url" => "https://support.google.com/legal/answer/1120734?visit_id=638260070062978894-2242290953", + "thumb" => null, + "table" => [], + "sublink" => [] + ]; + + $i = 0; + $c = count($as); + + foreach($as as $a){ + + $i++; + $u = + $this->decodeurl( + $a["attributes"]["href"] + ); + + $dmca_table["description"][] = [ + "type" => "link", + "url" => $u, + "value" => $u + ]; + + if($i !== $c){ + + $dmca_table["description"][] = [ + "type" => "text", + "value" => "\n" + ]; + } + } + + continue; + } + + /* + Fallback to parsing it as an embed + */ + + $table = [ + "title" => null, + "description" => [], + "url" => null, + "thumb" => null, + "table" => [], + "sublink" => [] + ]; + + $parts = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "padding" => "12px 16px 12px" + ], + self::is_class + ), + "div" + ); + + if(count($parts) === 0){ + + continue; + } + + $head = $parts[0]; + + $h3 = + $this->fuckhtml + ->getElementsByTagName("h3"); + + if(count($h3) !== 0){ + + $h3 = $h3[0]; + + $table["title"] = + $this->fuckhtml + ->getTextContent( + $h3 + ); + + $head["innerHTML"] = + str_replace( + $h3["outerHTML"], + "", + $head["innerHTML"] + ); + + $table["description"][] = + [ + "type" => "quote", + "value" => + $this->fuckhtml + ->getTextContent( + $head + ) + ]; + } + + $audio = + $this->fuckhtml + ->getElementsByTagName("audio"); + + if(count($audio) !== 0){ + + $table["description"][] = [ + "type" => "audio", + "url" => + str_replace( + "http://", + "https://", + $this->fuckhtml + ->getTextContent( + $audio[0]["attributes"]["src"] + ) ) - )[1]; + ]; } + + if(count($parts) >= 2){ + + $this->fuckhtml->load($parts[1]); + + $parts = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "padding-bottom" => "12px" + ], + self::is_class + ), + "div" + ); + + foreach($parts as $part){ + + $this->fuckhtml->load($part); + + $lists = + $this->fuckhtml + ->getElementsByTagName("ol"); + + if(count($lists) !== 0){ + + foreach($lists as $list){ + + $this->fuckhtml->load($list); + + $list_items = + $this->fuckhtml + ->getElementsByTagName("li"); + + $index = 0; + + if(count($list_items) !== 0){ + + foreach($list_items as $list_item){ + + $index++; + + $this->fuckhtml->load($list_item); + + $list_subitems = + $this->fuckhtml + ->getElementsByTagName("div"); + + foreach($list_subitems as $subitem){ + + if($subitem["level"] !== 1){ continue; } + + $this->fuckhtml->load($subitem); + + $spans = + $this->fuckhtml + ->getElementsByTagName("span"); + + if(count($spans) !== 0){ + + $type = "quote"; + }else{ + + $type = "text"; + } + + $value = + $this->fuckhtml + ->getTextContent( + $subitem + ); + + if($type == "text"){ + + $value = $index . ". " . $value; + } + + $table["description"][] = [ + "type" => $type, + "value" => $value + ]; + } + } + } + } + + continue; + } + + // get title + $spans = + $this->fuckhtml + ->getElementsByTagName("span"); + + if(count($spans) !== 0){ + + foreach($spans as $span){ + + $part["innerHTML"] = + str_replace( + $span["outerHTML"], + "", + $part["innerHTML"] + ); + } + + if( + $this->fuckhtml + ->getTextContent( + $part + ) + == "" + ){ + + $table["description"][] = [ + "type" => "title", + "value" => + $this->fuckhtml + ->getTextContent( + $spans[0] + ) + ]; + + continue; + } + } + + // fallback to getting non-numbered list + $nlist = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "white-space" => "pre-line", + "word-wrap" => "break-word" + ], + self::is_class + ), + "div" + ); + + if(count($nlist) !== 0){ + + foreach($nlist as $nlist_item){ + + $text = + $this->fuckhtml + ->getTextContent($nlist_item); + + if($text == ""){ + + continue; + } + + $this->fuckhtml->load($nlist_item); + + $spans = + $this->fuckhtml + ->getElementsByTagName("span"); + + if(count($spans) !== 0){ + + // is a quote node + $type = "quote"; + }else{ + + $type = "text"; + } + + $table["description"][] = [ + "type" => $type, + "value" => $text + ]; + } + } + } + } + + $out["answer"][] = $table; + } + + if($dmca_table){ + + $out["answer"][] = $dmca_table; } return $out; @@ -1323,6 +1839,55 @@ class google{ public function image($get){ + $search = $get["s"]; + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $lang = $get["lang"]; + $size = $get["size"]; + $colortype = $get["colortype"]; + $color = $get["color"]; + $type = $get["type"]; + $rights = $get["rights"]; + $older = $get["older"]; + $newer = $get["newer"]; + + $params = []; + + // country + if($country != "any"){ + + $params["gl"] = $country; + } + + // nsfw + $params["safe"] = $nsfw == "yes" ? "off" : "active"; + + // language + if($lang != "any"){ + + $params["lr"] = "lang_" . $lang; + } + + // &sort=review-date:r:20090301:20090430 + $older = $older === false ? false : date("Ymd", $older); + $newer = $newer === false ? false : date("Ymd", $newer); + + if( + $older !== false && + $newer === false + ){ + + $newer = date("Ymd", time()); + } + + if( + $older !== false || + $newer !== false + ){ + + $params["sort"] = "review-date:r:" . $older . ":" . $newer; + } + $handle = fopen("scraper/google-img.html", "r"); $html = fread($handle, filesize("scraper/google-img.html")); fclose($handle); @@ -1380,17 +1945,23 @@ class google{ $imgvl = $imgvl[1]; - $out["npt"] = [ - "q" => $get["s"], - "tbm" => "isch", - "async" => "_id:islrg_c,_fmt:html", - "asearch" => "ichunklite", - "ved" => $ved, - "vet" => "1" . $ved . "..i", - "start" => 100, - "ijn" => 1, - "imgvl" => $imgvl - ]; + $out["npt"] = + $this->nextpage->store( + json_encode( + [ + "q" => $get["s"], + "tbm" => "isch", + "async" => "_id:islrg_c,_fmt:html", + "asearch" => "ichunklite", + "ved" => $ved, + "vet" => "1" . $ved . "..i", + "start" => 100, + "ijn" => 1, + "imgvl" => $imgvl + ] + ), + "images" + ); } foreach($images as $image){ @@ -1529,6 +2100,29 @@ class google{ return $tags; } + private function getimage($id){ + + if( + isset($this->js_image[$id]) && + $this->js_image[$id] != "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAYAAACNiR0NAAABAUlEQVR4AWMYesChoYElLjkzPj4lY3d8csZjIL4MxPNjUzPcSTYsISFLAqj5NBD/h+LPQPwbiT87NCuLh2gDgRr2QzXuT0jNMoBYksARn5zuHJ+UcR0kB6RXE2VYXHJGOlTDZmzyIJcB5e+D1CSkZDgQNBAaZv+jU1JkcKpJygiGeZ0I76a/Byq8jU9NZFqaCNTA48SE33/iDcw8TIyBt0GKQTFN0Msp6f2EIyUpo57YSIlLSrMhIg0WCIBcCfXSdlzJBsheTHQ6jEnOUgEFOLaEDbMIlhZBOYrorAdJk+nroVnvPsSgdGdoOF7HZyhZ2XPoGQoqjbCpIbt0AiejIQMArVLI7k/DXFkAAAAASUVORK5CYII=" + ){ + + if(stripos($this->js_image[$id], "data:image") !== false){ + + return + explode( + "\\x3d", + $this->js_image[$id], + 2 + )[0]; + } + + return $this->js_image[$id]; + } + + return null; + } + private function decodeurl($url){ preg_match( @@ -1559,4 +2153,3 @@ class google{ return rtrim($title, ".… \t\n\r\0\x0B"); } } - |