summary | refs | log | tree | commit | diff
path: root/scraper/google.php
diff options
context:
space:
mode:
Diffstat (limited to 'scraper/google.php')
-rw-r--r-- scraper/google.php | 3677
1 files changed, 2676 insertions, 1001 deletions
diff --git a/scraper/google.php b/scraper/google.php
index 055d12a..bf2b0e4 100644
--- a/scraper/google.php
+++ b/scraper/google.php
@@ -16,713 +16,496 @@ class google{
public function getfilters($page){
+ $base = [
+ "country" => [ // gl=<country>
+ "display" => "Country",
+ "option" => [
+ "any" => "Instance's country",
+ "af" => "Afghanistan",
+ "al" => "Albania",
+ "dz" => "Algeria",
+ "as" => "American Samoa",
+ "ad" => "Andorra",
+ "ao" => "Angola",
+ "ai" => "Anguilla",
+ "aq" => "Antarctica",
+ "ag" => "Antigua and Barbuda",
+ "ar" => "Argentina",
+ "am" => "Armenia",
+ "aw" => "Aruba",
+ "au" => "Australia",
+ "at" => "Austria",
+ "az" => "Azerbaijan",
+ "bs" => "Bahamas",
+ "bh" => "Bahrain",
+ "bd" => "Bangladesh",
+ "bb" => "Barbados",
+ "by" => "Belarus",
+ "be" => "Belgium",
+ "bz" => "Belize",
+ "bj" => "Benin",
+ "bm" => "Bermuda",
+ "bt" => "Bhutan",
+ "bo" => "Bolivia",
+ "ba" => "Bosnia and Herzegovina",
+ "bw" => "Botswana",
+ "bv" => "Bouvet Island",
+ "br" => "Brazil",
+ "io" => "British Indian Ocean Territory",
+ "bn" => "Brunei Darussalam",
+ "bg" => "Bulgaria",
+ "bf" => "Burkina Faso",
+ "bi" => "Burundi",
+ "kh" => "Cambodia",
+ "cm" => "Cameroon",
+ "ca" => "Canada",
+ "cv" => "Cape Verde",
+ "ky" => "Cayman Islands",
+ "cf" => "Central African Republic",
+ "td" => "Chad",
+ "cl" => "Chile",
+ "cn" => "China",
+ "cx" => "Christmas Island",
+ "cc" => "Cocos (Keeling) Islands",
+ "co" => "Colombia",
+ "km" => "Comoros",
+ "cg" => "Congo",
+ "cd" => "Congo, the Democratic Republic",
+ "ck" => "Cook Islands",
+ "cr" => "Costa Rica",
+ "ci" => "Cote D'ivoire",
+ "hr" => "Croatia",
+ "cu" => "Cuba",
+ "cy" => "Cyprus",
+ "cz" => "Czech Republic",
+ "dk" => "Denmark",
+ "dj" => "Djibouti",
+ "dm" => "Dominica",
+ "do" => "Dominican Republic",
+ "ec" => "Ecuador",
+ "eg" => "Egypt",
+ "sv" => "El Salvador",
+ "gq" => "Equatorial Guinea",
+ "er" => "Eritrea",
+ "ee" => "Estonia",
+ "et" => "Ethiopia",
+ "fk" => "Falkland Islands (Malvinas)",
+ "fo" => "Faroe Islands",
+ "fj" => "Fiji",
+ "fi" => "Finland",
+ "fr" => "France",
+ "gf" => "French Guiana",
+ "pf" => "French Polynesia",
+ "tf" => "French Southern Territories",
+ "ga" => "Gabon",
+ "gm" => "Gambia",
+ "ge" => "Georgia",
+ "de" => "Germany",
+ "gh" => "Ghana",
+ "gi" => "Gibraltar",
+ "gr" => "Greece",
+ "gl" => "Greenland",
+ "gd" => "Grenada",
+ "gp" => "Guadeloupe",
+ "gu" => "Guam",
+ "gt" => "Guatemala",
+ "gn" => "Guinea",
+ "gw" => "Guinea-Bissau",
+ "gy" => "Guyana",
+ "ht" => "Haiti",
+ "hm" => "Heard Island and Mcdonald Islands",
+ "va" => "Holy See (Vatican City State)",
+ "hn" => "Honduras",
+ "hk" => "Hong Kong",
+ "hu" => "Hungary",
+ "is" => "Iceland",
+ "in" => "India",
+ "id" => "Indonesia",
+ "ir" => "Iran, Islamic Republic",
+ "iq" => "Iraq",
+ "ie" => "Ireland",
+ "il" => "Israel",
+ "it" => "Italy",
+ "jm" => "Jamaica",
+ "jp" => "Japan",
+ "jo" => "Jordan",
+ "kz" => "Kazakhstan",
+ "ke" => "Kenya",
+ "ki" => "Kiribati",
+ "kp" => "Korea, Democratic People's Republic",
+ "kr" => "Korea, Republic",
+ "kw" => "Kuwait",
+ "kg" => "Kyrgyzstan",
+ "la" => "Lao People's Democratic Republic",
+ "lv" => "Latvia",
+ "lb" => "Lebanon",
+ "ls" => "Lesotho",
+ "lr" => "Liberia",
+ "ly" => "Libyan Arab Jamahiriya",
+ "li" => "Liechtenstein",
+ "lt" => "Lithuania",
+ "lu" => "Luxembourg",
+ "mo" => "Macao",
+ "mk" => "Macedonia, the Former Yugoslav Republic",
+ "mg" => "Madagascar",
+ "mw" => "Malawi",
+ "my" => "Malaysia",
+ "mv" => "Maldives",
+ "ml" => "Mali",
+ "mt" => "Malta",
+ "mh" => "Marshall Islands",
+ "mq" => "Martinique",
+ "mr" => "Mauritania",
+ "mu" => "Mauritius",
+ "yt" => "Mayotte",
+ "mx" => "Mexico",
+ "fm" => "Micronesia, Federated States",
+ "md" => "Moldova, Republic",
+ "mc" => "Monaco",
+ "mn" => "Mongolia",
+ "ms" => "Montserrat",
+ "ma" => "Morocco",
+ "mz" => "Mozambique",
+ "mm" => "Myanmar",
+ "na" => "Namibia",
+ "nr" => "Nauru",
+ "np" => "Nepal",
+ "nl" => "Netherlands",
+ "an" => "Netherlands Antilles",
+ "nc" => "New Caledonia",
+ "nz" => "New Zealand",
+ "ni" => "Nicaragua",
+ "ne" => "Niger",
+ "ng" => "Nigeria",
+ "nu" => "Niue",
+ "nf" => "Norfolk Island",
+ "mp" => "Northern Mariana Islands",
+ "no" => "Norway",
+ "om" => "Oman",
+ "pk" => "Pakistan",
+ "pw" => "Palau",
+ "ps" => "Palestinian Territory, Occupied",
+ "pa" => "Panama",
+ "pg" => "Papua New Guinea",
+ "py" => "Paraguay",
+ "pe" => "Peru",
+ "ph" => "Philippines",
+ "pn" => "Pitcairn",
+ "pl" => "Poland",
+ "pt" => "Portugal",
+ "pr" => "Puerto Rico",
+ "qa" => "Qatar",
+ "re" => "Reunion",
+ "ro" => "Romania",
+ "ru" => "Russian Federation",
+ "rw" => "Rwanda",
+ "sh" => "Saint Helena",
+ "kn" => "Saint Kitts and Nevis",
+ "lc" => "Saint Lucia",
+ "pm" => "Saint Pierre and Miquelon",
+ "vc" => "Saint Vincent and the Grenadines",
+ "ws" => "Samoa",
+ "sm" => "San Marino",
+ "st" => "Sao Tome and Principe",
+ "sa" => "Saudi Arabia",
+ "sn" => "Senegal",
+ "cs" => "Serbia and Montenegro",
+ "sc" => "Seychelles",
+ "sl" => "Sierra Leone",
+ "sg" => "Singapore",
+ "sk" => "Slovakia",
+ "si" => "Slovenia",
+ "sb" => "Solomon Islands",
+ "so" => "Somalia",
+ "za" => "South Africa",
+ "gs" => "South Georgia and the South Sandwich Islands",
+ "es" => "Spain",
+ "lk" => "Sri Lanka",
+ "sd" => "Sudan",
+ "sr" => "Suriname",
+ "sj" => "Svalbard and Jan Mayen",
+ "sz" => "Swaziland",
+ "se" => "Sweden",
+ "ch" => "Switzerland",
+ "sy" => "Syrian Arab Republic",
+ "tw" => "Taiwan, Province of China",
+ "tj" => "Tajikistan",
+ "tz" => "Tanzania, United Republic",
+ "th" => "Thailand",
+ "tl" => "Timor-Leste",
+ "tg" => "Togo",
+ "tk" => "Tokelau",
+ "to" => "Tonga",
+ "tt" => "Trinidad and Tobago",
+ "tn" => "Tunisia",
+ "tr" => "Turkey",
+ "tm" => "Turkmenistan",
+ "tc" => "Turks and Caicos Islands",
+ "tv" => "Tuvalu",
+ "ug" => "Uganda",
+ "ua" => "Ukraine",
+ "ae" => "United Arab Emirates",
+ "uk" => "United Kingdom",
+ "us" => "United States",
+ "um" => "United States Minor Outlying Islands",
+ "uy" => "Uruguay",
+ "uz" => "Uzbekistan",
+ "vu" => "Vanuatu",
+ "ve" => "Venezuela",
+ "vn" => "Viet Nam",
+ "vg" => "Virgin Islands, British",
+ "vi" => "Virgin Islands, U.S.",
+ "wf" => "Wallis and Futuna",
+ "eh" => "Western Sahara",
+ "ye" => "Yemen",
+ "zm" => "Zambia",
+ "zw" => "Zimbabwe"
+ ]
+ ],
+ "nsfw" => [
+ "display" => "NSFW",
+ "option" => [
+ "yes" => "Yes", // safe=active
+ "no" => "No" // safe=off
+ ]
+ ],
+ "lang" => [ // lr=<lang> (prefix lang with "lang_")
+ "display" => "Language",
+ "option" => [
+ "any" => "Any language",
+ "ar" => "Arabic",
+ "bg" => "Bulgarian",
+ "ca" => "Catalan",
+ "cs" => "Czech",
+ "da" => "Danish",
+ "de" => "German",
+ "el" => "Greek",
+ "en" => "English",
+ "es" => "Spanish",
+ "et" => "Estonian",
+ "fi" => "Finnish",
+ "fr" => "French",
+ "hr" => "Croatian",
+ "hu" => "Hungarian",
+ "id" => "Indonesian",
+ "is" => "Icelandic",
+ "it" => "Italian",
+ "iw" => "Hebrew",
+ "ja" => "Japanese",
+ "ko" => "Korean",
+ "lt" => "Lithuanian",
+ "lv" => "Latvian",
+ "nl" => "Dutch",
+ "no" => "Norwegian",
+ "pl" => "Polish",
+ "pt" => "Portuguese",
+ "ro" => "Romanian",
+ "ru" => "Russian",
+ "sk" => "Slovak",
+ "sl" => "Slovenian",
+ "sr" => "Serbian",
+ "sv" => "Swedish",
+ "tr" => "Turkish",
+ "zh-CN" => "Chinese (Simplified)",
+ "zh-TW" => "Chinese (Traditional)"
+ ]
+ ]
+ ];
+
switch($page){
case "web":
- case "videos":
- case "news":
- return [
- "country" => [ // gl=<country>
- "display" => "Country",
- "option" => [
- "any" => "Instance's country",
- "af" => "Afghanistan",
- "al" => "Albania",
- "dz" => "Algeria",
- "as" => "American Samoa",
- "ad" => "Andorra",
- "ao" => "Angola",
- "ai" => "Anguilla",
- "aq" => "Antarctica",
- "ag" => "Antigua and Barbuda",
- "ar" => "Argentina",
- "am" => "Armenia",
- "aw" => "Aruba",
- "au" => "Australia",
- "at" => "Austria",
- "az" => "Azerbaijan",
- "bs" => "Bahamas",
- "bh" => "Bahrain",
- "bd" => "Bangladesh",
- "bb" => "Barbados",
- "by" => "Belarus",
- "be" => "Belgium",
- "bz" => "Belize",
- "bj" => "Benin",
- "bm" => "Bermuda",
- "bt" => "Bhutan",
- "bo" => "Bolivia",
- "ba" => "Bosnia and Herzegovina",
- "bw" => "Botswana",
- "bv" => "Bouvet Island",
- "br" => "Brazil",
- "io" => "British Indian Ocean Territory",
- "bn" => "Brunei Darussalam",
- "bg" => "Bulgaria",
- "bf" => "Burkina Faso",
- "bi" => "Burundi",
- "kh" => "Cambodia",
- "cm" => "Cameroon",
- "ca" => "Canada",
- "cv" => "Cape Verde",
- "ky" => "Cayman Islands",
- "cf" => "Central African Republic",
- "td" => "Chad",
- "cl" => "Chile",
- "cn" => "China",
- "cx" => "Christmas Island",
- "cc" => "Cocos (Keeling) Islands",
- "co" => "Colombia",
- "km" => "Comoros",
- "cg" => "Congo",
- "cd" => "Congo, the Democratic Republic",
- "ck" => "Cook Islands",
- "cr" => "Costa Rica",
- "ci" => "Cote D'ivoire",
- "hr" => "Croatia",
- "cu" => "Cuba",
- "cy" => "Cyprus",
- "cz" => "Czech Republic",
- "dk" => "Denmark",
- "dj" => "Djibouti",
- "dm" => "Dominica",
- "do" => "Dominican Republic",
- "ec" => "Ecuador",
- "eg" => "Egypt",
- "sv" => "El Salvador",
- "gq" => "Equatorial Guinea",
- "er" => "Eritrea",
- "ee" => "Estonia",
- "et" => "Ethiopia",
- "fk" => "Falkland Islands (Malvinas)",
- "fo" => "Faroe Islands",
- "fj" => "Fiji",
- "fi" => "Finland",
- "fr" => "France",
- "gf" => "French Guiana",
- "pf" => "French Polynesia",
- "tf" => "French Southern Territories",
- "ga" => "Gabon",
- "gm" => "Gambia",
- "ge" => "Georgia",
- "de" => "Germany",
- "gh" => "Ghana",
- "gi" => "Gibraltar",
- "gr" => "Greece",
- "gl" => "Greenland",
- "gd" => "Grenada",
- "gp" => "Guadeloupe",
- "gu" => "Guam",
- "gt" => "Guatemala",
- "gn" => "Guinea",
- "gw" => "Guinea-Bissau",
- "gy" => "Guyana",
- "ht" => "Haiti",
- "hm" => "Heard Island and Mcdonald Islands",
- "va" => "Holy See (Vatican City State)",
- "hn" => "Honduras",
- "hk" => "Hong Kong",
- "hu" => "Hungary",
- "is" => "Iceland",
- "in" => "India",
- "id" => "Indonesia",
- "ir" => "Iran, Islamic Republic",
- "iq" => "Iraq",
- "ie" => "Ireland",
- "il" => "Israel",
- "it" => "Italy",
- "jm" => "Jamaica",
- "jp" => "Japan",
- "jo" => "Jordan",
- "kz" => "Kazakhstan",
- "ke" => "Kenya",
- "ki" => "Kiribati",
- "kp" => "Korea, Democratic People's Republic",
- "kr" => "Korea, Republic",
- "kw" => "Kuwait",
- "kg" => "Kyrgyzstan",
- "la" => "Lao People's Democratic Republic",
- "lv" => "Latvia",
- "lb" => "Lebanon",
- "ls" => "Lesotho",
- "lr" => "Liberia",
- "ly" => "Libyan Arab Jamahiriya",
- "li" => "Liechtenstein",
- "lt" => "Lithuania",
- "lu" => "Luxembourg",
- "mo" => "Macao",
- "mk" => "Macedonia, the Former Yugosalv Republic",
- "mg" => "Madagascar",
- "mw" => "Malawi",
- "my" => "Malaysia",
- "mv" => "Maldives",
- "ml" => "Mali",
- "mt" => "Malta",
- "mh" => "Marshall Islands",
- "mq" => "Martinique",
- "mr" => "Mauritania",
- "mu" => "Mauritius",
- "yt" => "Mayotte",
- "mx" => "Mexico",
- "fm" => "Micronesia, Federated States",
- "md" => "Moldova, Republic",
- "mc" => "Monaco",
- "mn" => "Mongolia",
- "ms" => "Montserrat",
- "ma" => "Morocco",
- "mz" => "Mozambique",
- "mm" => "Myanmar",
- "na" => "Namibia",
- "nr" => "Nauru",
- "np" => "Nepal",
- "nl" => "Netherlands",
- "an" => "Netherlands Antilles",
- "nc" => "New Caledonia",
- "nz" => "New Zealand",
- "ni" => "Nicaragua",
- "ne" => "Niger",
- "ng" => "Nigeria",
- "nu" => "Niue",
- "nf" => "Norfolk Island",
- "mp" => "Northern Mariana Islands",
- "no" => "Norway",
- "om" => "Oman",
- "pk" => "Pakistan",
- "pw" => "Palau",
- "ps" => "Palestinian Territory, Occupied",
- "pa" => "Panama",
- "pg" => "Papua New Guinea",
- "py" => "Paraguay",
- "pe" => "Peru",
- "ph" => "Philippines",
- "pn" => "Pitcairn",
- "pl" => "Poland",
- "pt" => "Portugal",
- "pr" => "Puerto Rico",
- "qa" => "Qatar",
- "re" => "Reunion",
- "ro" => "Romania",
- "ru" => "Russian Federation",
- "rw" => "Rwanda",
- "sh" => "Saint Helena",
- "kn" => "Saint Kitts and Nevis",
- "lc" => "Saint Lucia",
- "pm" => "Saint Pierre and Miquelon",
- "vc" => "Saint Vincent and the Grenadines",
- "ws" => "Samoa",
- "sm" => "San Marino",
- "st" => "Sao Tome and Principe",
- "sa" => "Saudi Arabia",
- "sn" => "Senegal",
- "cs" => "Serbia and Montenegro",
- "sc" => "Seychelles",
- "sl" => "Sierra Leone",
- "sg" => "Singapore",
- "sk" => "Slovakia",
- "si" => "Slovenia",
- "sb" => "Solomon Islands",
- "so" => "Somalia",
- "za" => "South Africa",
- "gs" => "South Georgia and the South Sandwich Islands",
- "es" => "Spain",
- "lk" => "Sri Lanka",
- "sd" => "Sudan",
- "sr" => "Suriname",
- "sj" => "Svalbard and Jan Mayen",
- "sz" => "Swaziland",
- "se" => "Sweden",
- "ch" => "Switzerland",
- "sy" => "Syrian Arab Republic",
- "tw" => "Taiwan, Province of China",
- "tj" => "Tajikistan",
- "tz" => "Tanzania, United Republic",
- "th" => "Thailand",
- "tl" => "Timor-Leste",
- "tg" => "Togo",
- "tk" => "Tokelau",
- "to" => "Tonga",
- "tt" => "Trinidad and Tobago",
- "tn" => "Tunisia",
- "tr" => "Turkey",
- "tm" => "Turkmenistan",
- "tc" => "Turks and Caicos Islands",
- "tv" => "Tuvalu",
- "ug" => "Uganda",
- "ua" => "Ukraine",
- "ae" => "United Arab Emirates",
- "uk" => "United Kingdom",
- "us" => "United States",
- "um" => "United States Minor Outlying Islands",
- "uy" => "Uruguay",
- "uz" => "Uzbekistan",
- "vu" => "Vanuatu",
- "ve" => "Venezuela",
- "vn" => "Viet Nam",
- "vg" => "Virgin Islands, British",
- "vi" => "Virgin Islands, U.S.",
- "wf" => "Wallis and Futuna",
- "eh" => "Western Sahara",
- "ye" => "Yemen",
- "zm" => "Zambia",
- "zw" => "Zimbabwe"
- ]
- ],
- "nsfw" => [
- "display" => "NSFW",
- "option" => [
- "yes" => "Yes", // safe=active
- "no" => "No" // safe=off
- ]
- ],
- "lang" => [ // lr=<lang> (prefix lang with "lang_")
- "display" => "Language",
- "option" => [
- "any" => "Any language",
- "ar" => "Arabic",
- "bg" => "Bulgarian",
- "ca" => "Catalan",
- "cs" => "Czech",
- "da" => "Danish",
- "de" => "German",
- "el" => "Greek",
- "en" => "English",
- "es" => "Spanish",
- "et" => "Estonian",
- "fi" => "Finnish",
- "fr" => "French",
- "hr" => "Croatian",
- "hu" => "Hungarian",
- "id" => "Indonesian",
- "is" => "Icelandic",
- "it" => "Italian",
- "iw" => "Hebrew",
- "ja" => "Japanese",
- "ko" => "Korean",
- "lt" => "Lithuanian",
- "lv" => "Latvian",
- "nl" => "Dutch",
- "no" => "Norwegian",
- "pl" => "Polish",
- "pt" => "Portuguese",
- "ro" => "Romanian",
- "ru" => "Russian",
- "sk" => "Slovak",
- "sl" => "Slovenian",
- "sr" => "Serbian",
- "sv" => "Swedish",
- "tr" => "Turkish",
- "zh-CN" => "Chinese (Simplified)",
- "zh-TW" => "Chinese (Traditional)"
+ return array_merge(
+ $base,
+ [
+ "newer" => [ // &sort=review-date:r:20090301:20090430
+ "display" => "Newer than",
+ "option" => "_DATE"
+ ],
+ "older" => [
+ "display" => "Older than",
+ "option" => "_DATE"
]
- ],
- "newer" => [ // &sort=review-date:r:20090301:20090430
- "display" => "Newer than",
- "option" => "_DATE"
- ],
- "older" => [
- "display" => "Older than",
- "option" => "_DATE"
]
- ];
+ );
break;
case "images":
- return [
- "country" => [ // gl=<country>
- "display" => "Country",
- "option" => [
- "any" => "Instance's country",
- "af" => "Afghanistan",
- "al" => "Albania",
- "dz" => "Algeria",
- "as" => "American Samoa",
- "ad" => "Andorra",
- "ao" => "Angola",
- "ai" => "Anguilla",
- "aq" => "Antarctica",
- "ag" => "Antigua and Barbuda",
- "ar" => "Argentina",
- "am" => "Armenia",
- "aw" => "Aruba",
- "au" => "Australia",
- "at" => "Austria",
- "az" => "Azerbaijan",
- "bs" => "Bahamas",
- "bh" => "Bahrain",
- "bd" => "Bangladesh",
- "bb" => "Barbados",
- "by" => "Belarus",
- "be" => "Belgium",
- "bz" => "Belize",
- "bj" => "Benin",
- "bm" => "Bermuda",
- "bt" => "Bhutan",
- "bo" => "Bolivia",
- "ba" => "Bosnia and Herzegovina",
- "bw" => "Botswana",
- "bv" => "Bouvet Island",
- "br" => "Brazil",
- "io" => "British Indian Ocean Territory",
- "bn" => "Brunei Darussalam",
- "bg" => "Bulgaria",
- "bf" => "Burkina Faso",
- "bi" => "Burundi",
- "kh" => "Cambodia",
- "cm" => "Cameroon",
- "ca" => "Canada",
- "cv" => "Cape Verde",
- "ky" => "Cayman Islands",
- "cf" => "Central African Republic",
- "td" => "Chad",
- "cl" => "Chile",
- "cn" => "China",
- "cx" => "Christmas Island",
- "cc" => "Cocos (Keeling) Islands",
- "co" => "Colombia",
- "km" => "Comoros",
- "cg" => "Congo",
- "cd" => "Congo, the Democratic Republic",
- "ck" => "Cook Islands",
- "cr" => "Costa Rica",
- "ci" => "Cote D'ivoire",
- "hr" => "Croatia",
- "cu" => "Cuba",
- "cy" => "Cyprus",
- "cz" => "Czech Republic",
- "dk" => "Denmark",
- "dj" => "Djibouti",
- "dm" => "Dominica",
- "do" => "Dominican Republic",
- "ec" => "Ecuador",
- "eg" => "Egypt",
- "sv" => "El Salvador",
- "gq" => "Equatorial Guinea",
- "er" => "Eritrea",
- "ee" => "Estonia",
- "et" => "Ethiopia",
- "fk" => "Falkland Islands (Malvinas)",
- "fo" => "Faroe Islands",
- "fj" => "Fiji",
- "fi" => "Finland",
- "fr" => "France",
- "gf" => "French Guiana",
- "pf" => "French Polynesia",
- "tf" => "French Southern Territories",
- "ga" => "Gabon",
- "gm" => "Gambia",
- "ge" => "Georgia",
- "de" => "Germany",
- "gh" => "Ghana",
- "gi" => "Gibraltar",
- "gr" => "Greece",
- "gl" => "Greenland",
- "gd" => "Grenada",
- "gp" => "Guadeloupe",
- "gu" => "Guam",
- "gt" => "Guatemala",
- "gn" => "Guinea",
- "gw" => "Guinea-Bissau",
- "gy" => "Guyana",
- "ht" => "Haiti",
- "hm" => "Heard Island and Mcdonald Islands",
- "va" => "Holy See (Vatican City State)",
- "hn" => "Honduras",
- "hk" => "Hong Kong",
- "hu" => "Hungary",
- "is" => "Iceland",
- "in" => "India",
- "id" => "Indonesia",
- "ir" => "Iran, Islamic Republic",
- "iq" => "Iraq",
- "ie" => "Ireland",
- "il" => "Israel",
- "it" => "Italy",
- "jm" => "Jamaica",
- "jp" => "Japan",
- "jo" => "Jordan",
- "kz" => "Kazakhstan",
- "ke" => "Kenya",
- "ki" => "Kiribati",
- "kp" => "Korea, Democratic People's Republic",
- "kr" => "Korea, Republic",
- "kw" => "Kuwait",
- "kg" => "Kyrgyzstan",
- "la" => "Lao People's Democratic Republic",
- "lv" => "Latvia",
- "lb" => "Lebanon",
- "ls" => "Lesotho",
- "lr" => "Liberia",
- "ly" => "Libyan Arab Jamahiriya",
- "li" => "Liechtenstein",
- "lt" => "Lithuania",
- "lu" => "Luxembourg",
- "mo" => "Macao",
- "mk" => "Macedonia, the Former Yugosalv Republic",
- "mg" => "Madagascar",
- "mw" => "Malawi",
- "my" => "Malaysia",
- "mv" => "Maldives",
- "ml" => "Mali",
- "mt" => "Malta",
- "mh" => "Marshall Islands",
- "mq" => "Martinique",
- "mr" => "Mauritania",
- "mu" => "Mauritius",
- "yt" => "Mayotte",
- "mx" => "Mexico",
- "fm" => "Micronesia, Federated States",
- "md" => "Moldova, Republic",
- "mc" => "Monaco",
- "mn" => "Mongolia",
- "ms" => "Montserrat",
- "ma" => "Morocco",
- "mz" => "Mozambique",
- "mm" => "Myanmar",
- "na" => "Namibia",
- "nr" => "Nauru",
- "np" => "Nepal",
- "nl" => "Netherlands",
- "an" => "Netherlands Antilles",
- "nc" => "New Caledonia",
- "nz" => "New Zealand",
- "ni" => "Nicaragua",
- "ne" => "Niger",
- "ng" => "Nigeria",
- "nu" => "Niue",
- "nf" => "Norfolk Island",
- "mp" => "Northern Mariana Islands",
- "no" => "Norway",
- "om" => "Oman",
- "pk" => "Pakistan",
- "pw" => "Palau",
- "ps" => "Palestinian Territory, Occupied",
- "pa" => "Panama",
- "pg" => "Papua New Guinea",
- "py" => "Paraguay",
- "pe" => "Peru",
- "ph" => "Philippines",
- "pn" => "Pitcairn",
- "pl" => "Poland",
- "pt" => "Portugal",
- "pr" => "Puerto Rico",
- "qa" => "Qatar",
- "re" => "Reunion",
- "ro" => "Romania",
- "ru" => "Russian Federation",
- "rw" => "Rwanda",
- "sh" => "Saint Helena",
- "kn" => "Saint Kitts and Nevis",
- "lc" => "Saint Lucia",
- "pm" => "Saint Pierre and Miquelon",
- "vc" => "Saint Vincent and the Grenadines",
- "ws" => "Samoa",
- "sm" => "San Marino",
- "st" => "Sao Tome and Principe",
- "sa" => "Saudi Arabia",
- "sn" => "Senegal",
- "cs" => "Serbia and Montenegro",
- "sc" => "Seychelles",
- "sl" => "Sierra Leone",
- "sg" => "Singapore",
- "sk" => "Slovakia",
- "si" => "Slovenia",
- "sb" => "Solomon Islands",
- "so" => "Somalia",
- "za" => "South Africa",
- "gs" => "South Georgia and the South Sandwich Islands",
- "es" => "Spain",
- "lk" => "Sri Lanka",
- "sd" => "Sudan",
- "sr" => "Suriname",
- "sj" => "Svalbard and Jan Mayen",
- "sz" => "Swaziland",
- "se" => "Sweden",
- "ch" => "Switzerland",
- "sy" => "Syrian Arab Republic",
- "tw" => "Taiwan, Province of China",
- "tj" => "Tajikistan",
- "tz" => "Tanzania, United Republic",
- "th" => "Thailand",
- "tl" => "Timor-Leste",
- "tg" => "Togo",
- "tk" => "Tokelau",
- "to" => "Tonga",
- "tt" => "Trinidad and Tobago",
- "tn" => "Tunisia",
- "tr" => "Turkey",
- "tm" => "Turkmenistan",
- "tc" => "Turks and Caicos Islands",
- "tv" => "Tuvalu",
- "ug" => "Uganda",
- "ua" => "Ukraine",
- "ae" => "United Arab Emirates",
- "uk" => "United Kingdom",
- "us" => "United States",
- "um" => "United States Minor Outlying Islands",
- "uy" => "Uruguay",
- "uz" => "Uzbekistan",
- "vu" => "Vanuatu",
- "ve" => "Venezuela",
- "vn" => "Viet Nam",
- "vg" => "Virgin Islands, British",
- "vi" => "Virgin Islands, U.S.",
- "wf" => "Wallis and Futuna",
- "eh" => "Western Sahara",
- "ye" => "Yemen",
- "zm" => "Zambia",
- "zw" => "Zimbabwe"
- ]
- ],
- "nsfw" => [
- "display" => "NSFW",
- "option" => [
- "yes" => "Yes", // safe=active
- "no" => "No" // safe=off
- ]
- ],
- "lang" => [ // lr=<lang> (prefix lang with "lang_")
- "display" => "Language",
- "option" => [
- "any" => "Any language",
- "ar" => "Arabic",
- "bg" => "Bulgarian",
- "ca" => "Catalan",
- "cs" => "Czech",
- "da" => "Danish",
- "de" => "German",
- "el" => "Greek",
- "en" => "English",
- "es" => "Spanish",
- "et" => "Estonian",
- "fi" => "Finnish",
- "fr" => "French",
- "hr" => "Croatian",
- "hu" => "Hungarian",
- "id" => "Indonesian",
- "is" => "Icelandic",
- "it" => "Italian",
- "iw" => "Hebrew",
- "ja" => "Japanese",
- "ko" => "Korean",
- "lt" => "Lithuanian",
- "lv" => "Latvian",
- "nl" => "Dutch",
- "no" => "Norwegian",
- "pl" => "Polish",
- "pt" => "Portuguese",
- "ro" => "Romanian",
- "ru" => "Russian",
- "sk" => "Slovak",
- "sl" => "Slovenian",
- "sr" => "Serbian",
- "sv" => "Swedish",
- "tr" => "Turkish",
- "zh-CN" => "Chinese (Simplified)",
- "zh-TW" => "Chinese (Traditional)"
- ]
- ],
- "time" => [ // tbs=qrd:<size>
- "display" => "Time posted",
- "option" => [
- "any" => "Any time",
- "d" => "Past 24 hours",
- "w" => "Past week",
- "m" => "Past month",
- "y" => "Past year"
- ]
- ],
- "size" => [
- "display" => "Size",
- "option" => [
- // tbs=isz:<size>
- "any" => "Any size",
- "l" => "Large",
- "m" => "Medium",
- "i" => "Icon",
- // from here
- // tbz:lt,islt:<size>
- "qsvga" => "Larger than 400x300",
- "vga" => "Larger than 640x480",
- "qsvga" => "Larger than 800x600",
- "xga" => "Larger than 1024x768",
- "2mp" => "Larger than 2MP",
- "4mp" => "Larger than 4MP",
- "6mp" => "Larger than 6MP",
- "8mp" => "Larger than 8MP",
- "10mp" => "Larger than 10MP",
- "12mp" => "Larger than 12MP",
- "15mp" => "Larger than 15MP",
- "20mp" => "Larger than 20MP",
- "40mp" => "Larger than 40MP",
- "70mp" => "Larger than 70MP"
- ]
- ],
- "ratio" => [ // tbs=iar:<size>
- "display" => "Aspect ratio",
- "option" => [
- "any" => "Any ratio",
- "t" => "Tall",
- "s" => "Square",
- "w" => "Wide",
- "xw" => "Panoramic"
- ]
- ],
- "color" => [ // tbs=ic:<color>
- "display" => "Color",
- "option" => [
- "any" => "Any color",
- "color" => "Full color",
- "gray" => "Black & white",
- "trans" => "Transparent",
- // from there, its ic:specific,isc:<color>
- "red" => "Red",
- "orange" => "Orange",
- "yellow" => "Yellow",
- "green" => "Green",
- "teal" => "Teal",
- "blue" => "Blue",
- "purple" => "Purple",
- "pink" => "Pink",
- "white" => "White",
- "gray" => "Gray",
- "black" => "Black",
- "brown" => "Brown"
- ]
- ],
- "type" => [ // tbs=itp:<type>
- "display" => "Type",
- "option" => [
- "any" => "Any type",
- "face" => "Faces",
- "clipart" => "Clip Art",
- "lineart" => "Line Drawing",
- "stock" => "Stock",
- "animated" => "Animated"
+ return array_merge(
+ $base,
+ [
+ "time" => [ // tbs=qdr:<period>
+ "display" => "Time posted",
+ "option" => [
+ "any" => "Any time",
+ "d" => "Past 24 hours",
+ "w" => "Past week",
+ "m" => "Past month",
+ "y" => "Past year"
+ ]
+ ],
+ "size" => [
+ "display" => "Size",
+ "option" => [
+ // tbs=isz:<size>
+ "any" => "Any size",
+ "l" => "Large",
+ "m" => "Medium",
+ "i" => "Icon",
+ // from here on the filter is
+ // tbs=isz:lt,islt:<size>
+ "qsvga" => "Larger than 400x300",
+ "vga" => "Larger than 640x480",
+ "svga" => "Larger than 800x600", // was a second "qsvga" key, which replaced the 400x300 label above
+ "xga" => "Larger than 1024x768",
+ "2mp" => "Larger than 2MP",
+ "4mp" => "Larger than 4MP",
+ "6mp" => "Larger than 6MP",
+ "8mp" => "Larger than 8MP",
+ "10mp" => "Larger than 10MP",
+ "12mp" => "Larger than 12MP",
+ "15mp" => "Larger than 15MP",
+ "20mp" => "Larger than 20MP",
+ "40mp" => "Larger than 40MP",
+ "70mp" => "Larger than 70MP"
+ ]
+ ],
+ "ratio" => [ // tbs=iar:<size>
+ "display" => "Aspect ratio",
+ "option" => [
+ "any" => "Any ratio",
+ "t" => "Tall",
+ "s" => "Square",
+ "w" => "Wide",
+ "xw" => "Panoramic"
+ ]
+ ],
+ "color" => [ // tbs=ic:<color>
+ "display" => "Color",
+ "option" => [
+ "any" => "Any color",
+ "color" => "Full color",
+ "gray" => "Black & white",
+ "trans" => "Transparent",
+ // from here on it's tbs=ic:specific,isc:<color>
+ "red" => "Red",
+ "orange" => "Orange",
+ "yellow" => "Yellow",
+ "green" => "Green",
+ "teal" => "Teal",
+ "blue" => "Blue",
+ "purple" => "Purple",
+ "pink" => "Pink",
+ "white" => "White",
+ "gray" => "Gray", // NOTE(review): repeats the "gray" key used for "Black & white"; PHP keeps only this later value, so that label is lost and the two options cannot be told apart
+ "black" => "Black",
+ "brown" => "Brown"
+ ]
+ ],
+ "type" => [ // tbs=itp:<type>
+ "display" => "Type",
+ "option" => [
+ "any" => "Any type",
+ "face" => "Faces",
+ "clipart" => "Clip Art",
+ "lineart" => "Line Drawing",
+ "stock" => "Stock",
+ "animated" => "Animated"
+ ]
+ ],
+ "format" => [ // tbs=ift:<format>
+ "display" => "Format",
+ "option" => [
+ "any" => "Any format",
+ "jpg" => "JPG",
+ "gif" => "GIF",
+ "png" => "PNG",
+ "bmp" => "BMP",
+ "svg" => "SVG",
+ "webp" => "WEBP",
+ "ico" => "ICO",
+ "craw" => "RAW"
+ ]
+ ],
+ "rights" => [ // tbs=il:<rights>
+ "display" => "Usage rights",
+ "option" => [
+ "any" => "Any license",
+ "cl" => "Creative Commons licenses",
+ "ol" => "Commercial & other licenses"
+ ]
]
- ],
- "format" => [ // tbs=ift:<format>
- "display" => "Format",
- "option" => [
- "any" => "Any format",
- "jpg" => "JPG",
- "gif" => "GIF",
- "png" => "PNG",
- "bmp" => "BMP",
- "svg" => "SVG",
- "webp" => "WEBP",
- "ico" => "ICO",
- "craw" => "RAW"
+ ]
+ );
+ break;
+
+ case "videos":
+ return array_merge(
+ $base,
+ [
+ "time" => [
+ "display" => "Time posted",
+ "option" => [ // tbs=qdr
+ "any" => "Any time",
+ "h" => "Past hour",
+ "d" => "Past 24 hours",
+ "w" => "Past week",
+ "m" => "Past month",
+ "y" => "Past year"
+ ]
+ ],
+ "duration" => [
+ "display" => "Duration",
+ "option" => [
+ "any" => "Any duration",
+ "s" => "Short (0-4min)", // tbs=dur:s
+ "m" => "Medium (4-20min)", // tbs=dur:m
+ "l" => "Long (20+ min)" // tbs=dur:l
+ ]
+ ],
+ "quality" => [
+ "display" => "Quality",
+ "option" => [
+ "any" => "Any quality",
+ "h" => "High quality" // tbs=hq:h
+ ]
+ ],
+ "captions" => [
+ "display" => "Captions",
+ "option" => [
+ "any" => "No preference",
+ "yes" => "Closed captioned" // tbs=cc:1
+ ]
]
- ],
- "rights" => [ // tbs=il:<rights>
- "display" => "Usage rights",
- "option" => [
- "any" => "Any license",
- "cl" => "Creative Commons licenses",
- "ol" => "Commercial & other licenses"
+ ]
+ );
+ break;
+
+ case "news":
+ return array_merge(
+ $base,
+ [
+ "time" => [
+ "display" => "Time posted",
+ "option" => [ // tbs=qdr
+ "any" => "Any time",
+ "h" => "Past hour",
+ "d" => "Past 24 hours",
+ "w" => "Past week",
+ "m" => "Past month",
+ "y" => "Past year",
+ "a" => "Archives" // tbs=ar:1
+ ]
+ ],
+ "sort" => [
+ "display" => "Sort",
+ "option" => [
+ "relevance" => "Relevance",
+ "date" => "Date" // sbd:1
+ ]
]
]
- ];
+ );
break;
}
}
@@ -773,58 +556,402 @@ class google{
curl_close($curlproc);
return $data;
}
- /*
+
+
+
public function web($get){
- $search = $get["s"];
- $country = $get["country"];
- $nsfw = $get["nsfw"];
- $lang = $get["lang"];
- $older = $get["older"];
- $newer = $get["newer"];
+ if($get["npt"]){
+
+ [$req, $ip] = $this->backend->get($get["npt"], "web");
+ parse_str(
+ parse_url($req, PHP_URL_QUERY),
+ $search
+ );
+
+ if(isset($search["q"])){
+
+ $search = $search["q"];
+ }else{
+
+ $search = "a"; // lol
+ }
+
+ try{
+ $html =
+ $this->get(
+ $ip,
+ "https://www.google.com" . $req,
+ []
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to get HTML");
+ }
+ }else{
+ $search = $get["s"];
+ $country = $get["country"];
+ $nsfw = $get["nsfw"];
+ $lang = $get["lang"];
+ $older = $get["older"];
+ $newer = $get["newer"];
+ $ip = $this->backend->get_ip();
+
+ $params = [
+ "q" => $search,
+ "num" => 20 // get 20 results
+ ];
+
+ // country
+ if($country != "any"){
+
+ $params["gl"] = $country;
+ }
+
+ // nsfw
+ $params["safe"] = $nsfw == "yes" ? "off" : "active";
+
+ // language
+ if($lang != "any"){
+
+ $params["lr"] = "lang_" . $lang;
+ }
+
+ // Date range filter, sent as &sort=review-date:r:20090301:20090430
+ // A bound that is strictly false is left as false; anything else is
+ // formatted as Ymd (assumes a unix timestamp -- TODO confirm with caller).
+ $older = $older === false ? false : date("Ymd", $older);
+ $newer = $newer === false ? false : date("Ymd", $newer);
+
+ // only "older" given: fill in "newer" with today's date
+ if(
+ $older !== false &&
+ $newer === false
+ ){
+
+ $newer = date("Ymd", time());
+ }
+
+ // At least one bound set: emit the range. If only "newer" is set,
+ // $older is still false and concatenates as an empty string.
+ // NOTE(review): $older is written first and $newer second, so with both
+ // set the range may be descending, unlike the ascending example above
+ // -- confirm the intended order.
+ if(
+ $older !== false ||
+ $newer !== false
+ ){
+
+ $params["sort"] = "review-date:r:" . $older . ":" . $newer;
+ }
+
+ try{
+ $html =
+ $this->get(
+ $ip,
+ "https://www.google.com/search",
+ $params
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to get HTML");
+ }
+ }
- $params = [
- "num" => 20 // get 20 results
- ];
+ return $this->parsepage($html, "web", $search, $ip);
+ }
+
+
+
+ public function video($get){
- // country
- if($country != "any"){
+ if($get["npt"]){
+
+ [$req, $ip] = $this->backend->get($get["npt"], "videos");
+ parse_str(
+ parse_url($req, PHP_URL_QUERY),
+ $search
+ );
+
+ if(isset($search["q"])){
+
+ $search = $search["q"];
+ }else{
+
+ $search = "a"; // lol
+ }
+
+ try{
+
+ $html =
+ $this->get(
+ $ip,
+ "https://www.google.com" . $req,
+ []
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to get HTML");
+ }
+
+ }else{
+ $search = $get["s"];
+ $country = $get["country"];
+ $nsfw = $get["nsfw"];
+ $lang = $get["lang"];
+ $time = $get["time"];
+ $duration = $get["duration"];
+ $quality = $get["quality"];
+ $captions = $get["captions"];
+ $ip = $this->backend->get_ip();
+
+ $params = [
+ "q" => $search,
+ "tbm" => "vid",
+ "num" => "20"
+ ];
+
+ // country
+ if($country != "any"){
+
+ $params["gl"] = $country;
+ }
+
+ // nsfw
+ $params["safe"] = $nsfw == "yes" ? "off" : "active";
+
+ // language
+ if($lang != "any"){
+
+ $params["lr"] = "lang_" . $lang;
+ }
+
+ $tbs = [];
+
+ // time
+ if($time != "any"){
+
+ $tbs[] = "qdr:" . $time;
+ }
+
+ // duration
+ if($duration != "any"){
+
+ $tbs[] = "dur:" . $duration;
+ }
- $params["gl"] = $country;
+ // quality
+ if($quality != "any"){
+
+ $tbs[] = "hq:" . $quality;
+ }
+
+ // captions
+ // The only non-"any" option key is "yes", but the filter value Google
+ // expects is tbs=cc:1, so send the literal flag instead of the key
+ // (previously this produced "cc:yes").
+ if($captions != "any"){
+
+ $tbs[] = "cc:1";
+ }
+
+ // append tbs
+ if(count($tbs) !== 0){
+
+ $params["tbs"] =
+ implode(",", $tbs);
+ }
+
+ try{
+ $html =
+ $this->get(
+ $ip,
+ "https://www.google.com/search",
+ $params
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to get HTML");
+ }
}
- // nsfw
- $params["safe"] = $nsfw == "yes" ? "off" : "active";
+ $json = $this->parsepage($html, "videos", $search, $ip);
+ $out = [
+ "status" => "ok",
+ "npt" => $json["npt"],
+ "video" => [],
+ "author" => [],
+ "livestream" => [],
+ "playlist" => [],
+ "reel" => []
+ ];
- // language
- if($lang != "any"){
+ foreach($json["web"] as $item){
- $params["lr"] = "lang_" . $lang;
+ $out["video"][] = [
+ "title" => $item["title"],
+ "description" => $item["description"],
+ "author" => [
+ "name" => null,
+ "url" => null,
+ "avatar" => null
+ ],
+ "date" => isset($item["table"]["Posted"]) ? strtotime($item["table"]["Posted"]) : null,
+ "duration" => isset($item["table"]["Duration"]) ? $this->hms2int($item["table"]["Duration"]) : null,
+ "views" => null,
+ "thumb" =>
+ $item["thumb"]["url"] === null ?
+ [
+ "url" => null,
+ "ratio" => null
+ ] :
+ [
+ "url" => $item["thumb"]["url"],
+ "ratio" => "16:9"
+ ],
+ "url" => $item["url"]
+ ];
}
- // &sort=review-date:r:20090301:20090430
- $older = $older === false ? false : date("Ymd", $older);
- $newer = $newer === false ? false : date("Ymd", $newer);
+ return $out;
+ }
+
+
+
+ public function news($get){
- if(
- $older !== false &&
- $newer === false
- ){
+ if($get["npt"]){
+
+ [$req, $ip] = $this->backend->get($get["npt"], "news");
+ parse_str(
+ parse_url($req, PHP_URL_QUERY),
+ $search
+ );
+
+ if(isset($search["q"])){
+
+ $search = $search["q"];
+ }else{
+
+ $search = "a"; // lol
+ }
+
+ try{
+
+ $html =
+ $this->get(
+ $ip,
+ "https://www.google.com" . $req,
+ []
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to get HTML");
+ }
+
+ }else{
+ $search = $get["s"];
+ $country = $get["country"];
+ $nsfw = $get["nsfw"];
+ $lang = $get["lang"];
+ $time = $get["time"];
+ $sort = $get["sort"];
+ $ip = $this->backend->get_ip();
- $newer = date("Ymd", time());
+ $params = [
+ "q" => $search,
+ "tbm" => "nws",
+ "num" => "20"
+ ];
+
+ // country
+ if($country != "any"){
+
+ $params["gl"] = $country;
+ }
+
+ // nsfw
+ $params["safe"] = $nsfw == "yes" ? "off" : "active";
+
+ // language
+ if($lang != "any"){
+
+ $params["lr"] = "lang_" . $lang;
+ }
+
+ $tbs = [];
+
+ // time
+ if($time != "any"){
+
+ if($time == "a"){
+
+ $tbs[] = "ar:1";
+ }else{
+
+ $tbs[] = "qdr:" . $time;
+ }
+ }
+
+ // relevance
+ if($sort == "date"){
+
+ $tbs[] = "sbd:1";
+ }
+
+ // append tbs
+ if(count($tbs) !== 0){
+
+ $params["tbs"] =
+ implode(",", $tbs);
+ }
+
+ // Fetch the first results page. Wrapped like every other fetch in
+ // web(), video() and the "npt" branch of news(), so a failed request
+ // surfaces the same "Failed to get HTML" exception.
+ try{
+ $html =
+ $this->get(
+ $ip,
+ "https://www.google.com/search",
+ $params
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to get HTML");
+ }
}
- if(
- $older !== false ||
- $newer !== false
- ){
+ $json = $this->parsepage($html, "news", $search, $ip);
+ $out = [
+ "status" => "ok",
+ "npt" => $json["npt"],
+ "news" => []
+ ];
+
+ foreach($json["web"] as $item){
+
+ $description = array_key_first($item["table"]);
+
+ if($description !== null){
+
+ $date = $item["table"][$description];
+ }else{
+
+ $date = null;
+ }
- $params["sort"] = "review-date:r:" . $older . ":" . $newer;
+ // Append one news entry. $date is null when the result has no table
+ // row; strtotime() must not be called with null (deprecated since
+ // PHP 8.1 and returns false), so a missing date is reported as null,
+ // the same as video() does.
+ $out["news"][] = [
+ "title" => $item["title"],
+ "author" => $item["author"],
+ "description" => $description,
+ "date" => $date === null ? null : strtotime($date),
+ "thumb" =>
+ $item["thumb"]["url"] === null ?
+ [
+ "url" => null,
+ "ratio" => null
+ ] :
+ [
+ "url" => $item["thumb"]["url"],
+ "ratio" => "16:9"
+ ],
+ "url" => $item["url"]
+ ];
}
+ return $out;
+ }
+
+
+
+ private function parsepage($html, $pagetype, $search, $ip){
+ /*
$handle = fopen("scraper/google.html", "r");
$html = fread($handle, filesize("scraper/google.html"));
fclose($handle);
+ */
$out = [
"status" => "ok",
@@ -844,6 +971,156 @@ class google{
$this->parsejavascript($html);
+ //
+ // parse accdef's
+ //
+ $has_appended_accdef = false;
+
+ preg_match_all(
+ '/window\.jsl\.dh\(\'(accdef_[0-9]+)\',\'(.*)\'\);/',
+ $html,
+ $accdefs_regex
+ );
+
+ $accdefs = [];
+ for($i=0; $i<count($accdefs_regex[0]); $i++){
+
+ // decode UTF-16 string
+ $answer =
+ $this->fuckhtml
+ ->parseJsString(
+ $accdefs_regex[2][$i]
+ );
+
+ $this->fuckhtml->load($answer);
+
+ // get description
+ $description =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "padding" => "12px 16px 12px",
+ ],
+ self::is_class
+ ),
+ "div"
+ )[1];
+
+ // get date (rare)
+ $date =
+ $this->fuckhtml
+ ->getElementsByTagName("sub");
+
+ if(count($date) !== 0){
+
+ $description =
+ str_replace(
+ $date[0]["outerHTML"],
+ "",
+ $description["innerHTML"]
+ );
+
+ $date =
+ strtotime(
+ $this->fuckhtml
+ ->getTextContent(
+ $date[0]
+ )
+ );
+ }else{
+
+ $date = null;
+ }
+
+ // get information table
+ $table = [];
+
+ $tbody =
+ $this->fuckhtml
+ ->getElementsByTagName("tbody");
+
+ if(count($tbody) !== 0){
+
+ $this->fuckhtml->load($tbody[0]);
+
+ $trs =
+ $this->fuckhtml
+ ->getElementsByTagName("tr");
+
+ foreach($trs as $tr){
+
+ $this->fuckhtml->load($tr);
+
+ $tds =
+ $this->fuckhtml
+ ->getElementsByTagName("td");
+
+ if(count($tds) === 2){
+
+ $table[
+ $this->fuckhtml
+ ->getTextContent(
+ $tds[0]
+ )
+ ] =
+ $this->fuckhtml
+ ->getTextContent(
+ $tds[1]
+ );
+ }
+ }
+
+ // load back what we had
+ $this->fuckhtml->load($answer);
+ }
+
+ // get title & link
+ $a =
+ $this->fuckhtml
+ ->getElementsByTagName("a")[0];
+
+ $this->fuckhtml->load($a);
+
+ $title =
+ $this->fuckhtml
+ ->getElementsByTagName("span");
+
+ if(count($title) === 0){
+
+ continue;
+ }
+
+ $accdefs[] = [
+ "title" =>
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $title[0]
+ )
+ ),
+ "description" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $description
+ ),
+ "url" =>
+ $this->unshiturl(
+ $a["attributes"]["href"]
+ ),
+ "date" => $date,
+ "type" => "web",
+ "thumb" => [
+ "url" => null,
+ "ratio" => null
+ ],
+ "sublink" => [],
+ "table" => $table
+ ];
+ }
+
+ $this->fuckhtml->load($html);
+
$containers =
$this->fuckhtml
->getElementsByClassName(
@@ -863,6 +1140,94 @@ class google{
$this->fuckhtml->load($container);
+ // detect spelling
+ $spelling =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "font-size" => "20px",
+ "line-height" => "26px",
+ "padding-top" => "2px",
+ "margin-bottom" => "1px"
+ ],
+ self::is_class
+ ),
+ "div"
+ );
+
+ if(count($spelling) !== 0){
+
+ $a =
+ $this->fuckhtml
+ ->getElementsByTagName("a");
+
+ if(count($a) !== 0){
+
+ $scripts =
+ $this->fuckhtml
+ ->getElementsByTagName("script");
+
+ foreach($scripts as $script){
+
+ $container["innerHTML"] =
+ str_replace(
+ $script["outerHTML"],
+ "",
+ $container["innerHTML"]
+ );
+ }
+
+ $container["innerHTML"] =
+ $this->fuckhtml
+ ->getTextContent(
+ str_replace(
+ $a[0]["outerHTML"],
+ "",
+ $container["innerHTML"]
+ )
+ );
+
+ if(
+ preg_match(
+ '/^did you mean/i',
+ $container["innerHTML"]
+ )
+ ){
+
+ $out["spelling"] = [
+ "type" => "not_many",
+ "using" => $search,
+ "correction" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $a[0]
+ )
+ ];
+ }
+
+ elseif(
+ preg_match(
+ '/^showing results for/i',
+ $container["innerHTML"]
+ )
+ ){
+
+ $out["spelling"] = [
+ "type" => "including",
+ "using" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $a[0]
+ ),
+ "correction" => $search
+ ];
+ }
+ }
+
+ continue;
+ }
+
$title =
$this->fuckhtml
->getElementsByClassName(
@@ -891,14 +1256,7 @@ class google{
)
),
"description" => null,
- "url" =>
- $this->decodeurl(
- $this->fuckhtml
- ->getElementsByTagName("a")
- [0]
- ["attributes"]
- ["href"]
- ),
+ "url" => null,
"date" => null,
"type" => "web",
"thumb" => [
@@ -909,33 +1267,235 @@ class google{
"table" => []
];
+ // get link
+ $web["url"] =
+ $this->unshiturl(
+ $this->fuckhtml
+ ->getElementsByTagName("a")
+ [0]
+ ["attributes"]
+ ["href"]
+ );
+
+ //
+ // check if link contains a carousel
+ //
+ $carousels = $this->parsecarousels();
+ if(count($carousels) !== 0){
+
+ $first = true;
+ foreach($carousels as $carousel_cat){
+
+ foreach($carousel_cat as $carousel){
+
+ if($first === true){
+
+ $first = false;
+ }elseif($carousel["image"] !== null){
+
+ $out["image"][] = [
+ "title" => $carousel["title"],
+ "source" => [
+ [
+ "url" => $carousel["image"],
+ "width" => null,
+ "height" => null
+ ]
+ ],
+ "url" => $carousel["url"]
+ ];
+ }
+
+ $web["sublink"][] = [
+ "title" => $carousel["title"],
+ "date" => $carousel["date"],
+ "description" => $carousel["description"],
+ "url" => $carousel["url"]
+ ];
+ }
+ }
+
+ if($carousels[0][0]["image"] !== null){
+ $web["thumb"] = [
+ "url" => $carousels[0][0]["image"],
+ "ratio" => "16:9"
+ ];
+ }
+
+ $out["web"][] = $web;
+ continue;
+ }
+
+ //
+ // no carousel entries, parse as normal link
+ //
+ $this->fuckhtml->load($container);
+
+ // parse URL
+ $web["url"] =
+ $this->unshiturl(
+ $this->fuckhtml
+ ->getElementsByTagName("a")
+ [0]
+ ["attributes"]
+ ["href"]
+ );
+
$container = $container["innerHTML"];
- $description_container =
+ $line_detect =
$this->fuckhtml
->getElementsByClassName(
$this->findstyles(
[
- "padding" => "12px 16px 12px"
+ "height" => "1px",
+ "background-color" => "#dadce0",
+ "margin" => "0 16px"
],
self::is_class
),
"div"
- )[1];
+ );
+
+ if(count($line_detect) !== 0){
+
+ // we found a line, this means we're dealing with a
+ // "featured snippet"
+ $featured = true;
+
+ $description_container =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "white-space" => "pre-line",
+ "word-wrap" => "break-word"
+ ],
+ self::is_class
+ ),
+ "div"
+ )[1];
+
+ // get date node for it
+ $date =
+ $this->fuckhtml
+ ->getElementsByTagName("sub");
+
+ if(count($date) !== 0){
+ $web["date"] =
+ strtotime(
+ $this->fuckhtml
+ ->getTextContent(
+ $date[0]
+ )
+ );
+ }
+ }else{
+
+ // we're dealing with a normal link
+ $featured = false;
+
+ $description_container =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "padding" => "12px 16px 12px"
+ ],
+ self::is_class
+ ),
+ "div"
+ )[1];
+ }
+
+ //
+ // Get author if we're parsing news
+ //
+ if($pagetype == "news"){
+
+ $author =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "position" => "absolute",
+ "width" => "100%",
+ "top" => "0",
+ "left" => "0",
+ "padding-top" => "1px",
+ "margin-bottom" => "-1px"
+ ],
+ self::is_class
+ ),
+ "div"
+ );
+
+ if(count($author) !== 0){
+
+ $web["author"] =
+ $this->fuckhtml
+ ->getTextContent(
+ $author[0]
+ );
+ }else{
+
+ $web["author"] = null;
+ }
+ }
$description =
$description_container["innerHTML"];
- // get sublinks
$this->fuckhtml->load($description);
+ //
+ // get thumbnail before we call loadhtml again
+ //
+ $img =
+ $this->fuckhtml
+ ->getElementsByTagName("img");
+
+ if(count($img) !== 0){
+
+ $skip = true;
+
+ if(
+ isset($img[0]["attributes"]["alt"]) &&
+ stripos($img[0]["attributes"]["alt"], "Video for") !== false
+ ){
+
+ // is a video thumbnail
+ $web["thumb"]["ratio"] = "16:9";
+ }else{
+
+ // is a google thumbnail
+ $web["thumb"]["ratio"] = "1:1";
+ }
+
+ $web["thumb"]["url"] =
+ $this->getimage(
+ $img[0]["attributes"]["id"]
+ );
+ }else{
+
+ $skip = false;
+ }
+
+ //
+ // get sublinks
+ //
$links =
$this->fuckhtml
->getElementsByTagName("a");
- $skip = true;
foreach($links as $link){
+ if($skip === true){
+
+ $skip = false;
+ continue;
+ }
+
$description =
str_replace(
$link["outerHTML"],
@@ -943,12 +1503,6 @@ class google{
$description
);
- if($skip){
-
- $skip = false;
- continue;
- }
-
$sublink = [
"title" => null,
"description" => null,
@@ -957,214 +1511,799 @@ class google{
];
$sublink["title"] =
- $this->fuckhtml
- ->getTextContent(
- $link
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $link
+ )
);
$sublink["url"] =
- $this->decodeurl(
+ $this->unshiturl(
$link
["attributes"]
["href"]
);
- $web["sublink"][] = $sublink;
+ if(parse_url($sublink["url"], PHP_URL_HOST) !== null){
+
+ $web["sublink"][] = $sublink;
+ }
}
- // get thumbnail before we call loadhtml again
- $img =
+ //
+ // Parse spans in description
+ //
+ $this->fuckhtml->load($description);
+
+ if($featured === false){
+
+ $levels =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "padding-bottom" => "8px"
+ ],
+ self::is_class
+ ),
+ "div"
+ );
+
+ // oh my god yes, fucking great, sometimes there are NO levels
+ // hahahahahhahahahahahahahahahhahaa
+ if(count($levels) === 0){
+
+ $levels = [$description];
+ }
+
+ foreach($levels as $level){
+
+ $this->fuckhtml->load($level);
+
+ $spans =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "span"
+ );
+
+ $is_rating = -1;
+
+ foreach($spans as $span){
+
+ $innertext =
+ trim(
+ $this->fuckhtml
+ ->getTextContent(
+ $span
+ ),
+ " ·."
+ );
+
+ if($innertext == ""){ continue; }
+
+ if(
+ strtolower($innertext)
+ == "rating"
+ ){
+
+ $is_rating = 0;
+
+ // clean up before we go
+ $description =
+ str_replace(
+ $span["outerHTML"],
+ "",
+ $description
+ );
+ continue;
+ }
+
+ //
+ // Parse rating object
+ //
+ if($is_rating >= 0){
+
+ // clean up description
+ $description =
+ str_replace(
+ $span["outerHTML"],
+ "",
+ $description
+ );
+
+ if($span["level"] !== 1){ continue; }
+ $is_rating++;
+
+ // 10/10 (123)
+ if($is_rating === 1){
+
+ $innertext = explode(" ", $innertext, 2);
+
+ $web["table"]["Rating"] = $innertext[0];
+
+ if(count($innertext) === 2){
+ $web["table"]["Hits"] =
+ trim(
+ str_replace(
+ [
+ "(",
+ ")"
+ ],
+ "",
+ $innertext[1]
+ )
+ );
+
+ if($web["table"]["Hits"] == ""){
+
+ unset($web["table"]["Hits"]);
+ }
+ }
+ continue;
+ }
+
+ // US$4.99
+ // MYR 50.00
+ // $38.34
+ // JP¥6,480
+ // Reviewed by your mom
+ if($is_rating === 2){
+
+ if(
+ preg_match(
+ '/^Review by (.+)/',
+ $innertext,
+ $match
+ )
+ ){
+
+ $web["table"]["Author"] = $match[1];
+ continue;
+ }
+
+ $web["table"]["Price"] = $innertext;
+ continue;
+ }
+
+ // Android / In stock
+ if($is_rating === 3){
+
+ $web["table"]["Support"] = $innertext;
+ continue;
+ }
+
+ // ignore the rest
+ continue;
+ }
+
+ //
+ // Parse standalone text
+ //
+
+ // If we reach this point:
+ // 1. Ratings have been parsed
+ // 2. We're parsing a WEB link, not some shitty piece of shit
+
+ // check for date
+ // if span has no text before it, assume it's a date
+ $desc_split =
+ explode(
+ $span["outerHTML"],
+ $description,
+ 2
+ );
+
+ if(
+ $this->fuckhtml
+ ->getTextContent(
+ $desc_split[0]
+ ) == ""
+ ){
+
+ // has no text before
+ $date = strtotime($innertext);
+ if($date){
+
+ $web["date"] = $date;
+ }
+
+ // cleanup
+ $description =
+ str_replace(
+ $span["outerHTML"],
+ "",
+ $description
+ );
+
+ continue;
+ }
+
+ // Ready to parse table
+ if(count($desc_split) === 2){
+ $this->fuckhtml->load($desc_split[1]);
+
+ $web["table"][
+ $this->fuckhtml
+ ->getTextContent(
+ trim($desc_split[0], ": ")
+ )
+ ] = $innertext;
+
+ // cleanup
+ $description =
+ str_replace(
+ $desc_split[0] . $span["outerHTML"],
+ "",
+ $description
+ );
+ }
+ }
+ }
+ }
+
+ $web["description"] =
+ trim(
+ $this->fuckhtml
+ ->getTextContent(
+ $description
+ ),
+ " ·."
+ );
+
+ if($web["description"] == ""){
+
+ $web["description"] = null;
+ }
+
+ $out["web"][] = $web;
+
+ continue;
+ }
+
+ //
+ // Detect wikipedia shit
+ //
+ $wiki_title =
+ $this->fuckhtml
+ ->getElementsByTagName("h3");
+
+ if(count($wiki_title) !== 0){
+
+ $description_after = [];
+ $description = [];
+ $table = [];
+ $sublink = [];
+
+ $as =
$this->fuckhtml
- ->getElementsByTagName("img");
+ ->getElementsByTagName("a");
- if(count($img) !== 0){
+ foreach($as as $a){
if(
- isset($img[0]["attributes"]["alt"]) &&
- stripos($img[0]["attributes"]["alt"], "Video for") !== false
+ isset($a["attributes"]["href"]) &&
+ parse_url($a["attributes"]["href"], PHP_URL_HOST) == "maps.google.com"
){
- // is a video thumbnail
- $web["thumb"]["ratio"] = "16:9";
- }else{
-
- // is a google thumbnail
- $web["thumb"]["ratio"] = "1:1";
+ // detected maps embed, ignore
+ continue 2;
}
+ }
+
+ // get carousels and remove them from container for image grepper
+ $carousels = $this->parsecarousels($container["innerHTML"]);
+ $this->fuckhtml->load($container);
+
+ // add images to image tab, if applicable
+ //
+ // foreach iterates over a copy of $carousels, so unsetting an entry
+ // below neither shrinks the loop bound (the old count()-based for loop
+ // skipped trailing carousels after every unset) nor trips over
+ // non-sequential keys
+ foreach($carousels as $i => $carousel_items){
- $web["thumb"]["url"] =
- $this->getimage(
- $img[0]["attributes"]["id"]
- );
+ foreach($carousel_items as $item){
+
+ if(
+ $item["url"] !== null &&
+ $item["ref"] !== null &&
+ $item["image"] !== null &&
+ $item["title"] !== null
+ ){
+
+ $out["image"][] = [
+ "title" => $item["title"],
+ "source" => [
+ [
+ "url" => $item["url"],
+ "width" => $item["image_width"],
+ "height" => $item["image_height"]
+ ],
+ [
+ "url" => $item["image"],
+ "width" => $item["thumb_width"],
+ "height" => $item["thumb_height"]
+ ]
+ ],
+ "url" => $item["ref"]
+ ];
+
+ // carousel holds real images: drop it so it is not
+ // re-interpreted as a "title + carousel" section later
+ unset($carousels[$i]);
+ }
+ }
}
- // get table elements
- $this->fuckhtml->load($description);
+ $carousels = array_values($carousels);
- $levels =
+ // interpret remaining carousels as title + carousel
+ $titles =
$this->fuckhtml
->getElementsByClassName(
$this->findstyles(
[
- "padding-bottom" => "8px"
+ "font-weight" => "700",
+ "letter-spacing" => "0.75px",
+ "text-transform" => "uppercase"
],
self::is_class
- ),
- "div"
+ )
);
- $additional_info = [];
- foreach($levels as $level){
+ for($i=0; $i<count($titles); $i++){
- $this->fuckhtml->load($level);
+ if(!isset($carousels[$i])){
+
+ break;
+ }
- $spans =
- $this->fuckhtml
- ->getElementsByTagName(
- "span"
- );
+ $description_after[] = [
+ "type" => "title",
+ "value" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $titles[$i]
+ )
+ ];
+
+ foreach($carousels[$i] as $carousel){
+
+ $description_after[] = [
+ "type" => "link",
+ "url" => "web?s=" . urlencode($carousel["description"]) . "&scraper=google",
+ "value" => $carousel["description"]
+ ];
+
+ if($carousel["subtext"] !== null){
+
+ $description_after[] = [
+ "type" => "quote",
+ "value" => $carousel["subtext"]
+ ];
+ }
+
+ $description_after[] = [
+ "type" => "image",
+ "url" => $carousel["image"]
+ ];
+ }
+ }
+
+ $categories =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "padding" => "12px 16px 12px"
+ ],
+ self::is_class
+ )
+ );
+
+ $image =
+ $this->fuckhtml
+ ->getElementsByTagName("img");
+
+ if(count($image) !== 0){
- $is_rating = -2;
+ $image = $this->getimage($image[0]["attributes"]["id"]);
+ }else{
- foreach($spans as $span){
+ $image = null;
+ }
+
+ $url = null;
+
+ for($i=0; $i<count($categories); $i++){
+
+ $this->fuckhtml->load($categories[$i]);
+
+ if($i === 0){
+ // first node. this should be the header with the small
+ // information snippet
+
+ $url =
+ $this->fuckhtml
+ ->getElementsByTagName("a");
- // clean up description
- $description =
+ if(count($url) !== 0){
+
+ $url =
+ $this->unshiturl(
+ $url[0]["attributes"]["href"]
+ );
+
+ if(parse_url($url, PHP_URL_HOST) == "encrypted-tbn0.gstatic.com"){
+
+ $image = $url;
+ $url = null;
+ }
+ }else{
+
+ $url = null;
+ }
+
+ $categories[$i]["innerHTML"] =
str_replace(
- $span["outerHTML"],
+ $wiki_title[0]["outerHTML"],
"",
- $description
+ $categories[$i]["innerHTML"]
);
- $innertext =
+ $subtext =
$this->fuckhtml
->getTextContent(
- $span
+ $categories[$i]["innerHTML"]
);
- if($innertext == ""){ continue; }
+ if(strlen($subtext) !== 0){
+
+ $description[] = [
+ "type" => "quote",
+ "value" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $categories[$i]["innerHTML"]
+ )
+ ];
+ }
+
+ // detect audio file
+ $audio =
+ $this->fuckhtml
+ ->getElementsByTagName("audio");
- if(
- strtolower($innertext)
- == "rating"
- ){
+ if(count($audio) !== 0){
- $is_rating = -1;
- continue;
+ $description[] = [
+ "type" => "audio",
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $audio[0]["attributes"]["src"]
+ )
+ ];
}
+ }else{
- //
- // Parse rating object
- //
+ // check for separator elements IN THERE
+ $separators =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "white-space" => "pre-line",
+ "word-wrap" => "break-word"
+ ],
+ self::is_class
+ ),
+ "div"
+ );
- if($is_rating >= -1){
+ // detect container type
+ foreach($separators as $separator){
- if($span["level"] !== 1){ continue; }
+ $this->fuckhtml->load($separator);
- $is_rating++;
+ // ignore wrong levels
+ if($separator["level"] !== 2){
+
+ continue;
+ }
+
+ //
+ // Detect word definition
+ //
+ $wordwraps =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "padding-bottom" => "12px"
+ ],
+ self::is_class
+ ),
+ "div"
+ );
- // 10/10 (123)
- if($is_rating === 0){
+ if(count($wordwraps) !== 0){
+
+ foreach($wordwraps as $word){
+
+ $this->fuckhtml->load($word);
+
+ // detect title
+ $span =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "span"
+ );
+
+ if(
+ count($span) === 1 &&
+ $this->fuckhtml
+ ->getTextContent(
+ str_replace(
+ $span[0]["outerHTML"],
+ "",
+ $word["innerHTML"]
+ )
+ ) == ""
+ ){
+
+ $description[] = [
+ "type" => "title",
+ "value" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $span[0]
+ )
+ ];
+ continue;
+ }
+
+ // detect list element
+ $lists =
+ $this->fuckhtml
+ ->getElementsByTagName("ol");
+
+ if(count($lists) !== 0){
+ foreach($lists as $list){
+
+ $this->fuckhtml->load($list);
+
+ $items =
+ $this->fuckhtml
+ ->getElementsByTagName("li");
+
+ $w = 0;
+ foreach($items as $item){
+
+ $w++;
+ $this->fuckhtml->load($item);
+
+ // get subnodes
+ $subnodes =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "white-space" => "pre-line",
+ "word-wrap" => "break-word"
+ ],
+ self::is_class
+ ),
+ "div"
+ );
+
+ foreach($subnodes as $subnode){
+
+ $this->fuckhtml->load($subnode);
+
+ $spans =
+ $this->fuckhtml
+ ->getElementsByTagName("span");
+
+ if(count($spans) !== 0){
+
+ // append quote
+ $description[] = [
+ "type" => "quote",
+ "value" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $subnode
+ )
+ ];
+ }else{
+
+ // append text
+ $description[] = [
+ "type" => "text",
+ "value" =>
+ $w . ". " .
+ $this->fuckhtml
+ ->getTextContent(
+ $subnode
+ )
+ ];
+ }
+ }
+ }
+ }
+ }else{
+
+ // parse without list
+ // get subnodes
+ $subnodes =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "white-space" => "pre-line",
+ "word-wrap" => "break-word"
+ ],
+ self::is_class
+ ),
+ "div"
+ );
+
+ foreach($subnodes as $subnode){
+
+ $this->fuckhtml->load($subnode);
+
+ $spans =
+ $this->fuckhtml
+ ->getElementsByTagName("span");
+
+ if(count($spans) !== 0){
+
+ // append quote
+ $description[] = [
+ "type" => "quote",
+ "value" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $subnode
+ )
+ ];
+ }else{
+
+ // append text
+ $description[] = [
+ "type" => "text",
+ "value" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $subnode
+ )
+ ];
+ }
+ }
+ }
+ }
+ }else{
+
+ //
+ // Parse table
+ //
+ $spans =
+ $this->fuckhtml
+ ->getElementsByTagName("span");
+
+ foreach($spans as $span){
+
+ if(!isset($span["attributes"]["class"])){
+
+ // found table
+ $row =
+ explode(
+ ":",
+ $this->fuckhtml
+ ->getTextContent(
+ $separator
+ ),
+ 2
+ );
+
+ if(count($row) === 2){
+
+ $table[rtrim($row[0])] =
+ ltrim($row[1]);
+
+ }
+ continue 2;
+ }
+ }
- $innertext = explode(" ", $innertext, 2);
+ //
+ // Parse normal description
+ //
+ $links_rem =
+ $this->fuckhtml
+ ->getElementsByTagName("a");
- $web["table"]["Rating"] = $innertext[0];
- $web["table"]["Hits"] =
- trim(
+ foreach($links_rem as $rem){
+
+ $separator["innerHTML"] =
str_replace(
- [
- "(",
- ")"
- ],
+ $rem["outerHTML"],
"",
- $innertext[1]
+ $separator["innerHTML"]
+ );
+ }
+
+ $description[] = [
+ "type" => "text",
+ "value" =>
+ rtrim(
+ $this->fuckhtml
+ ->getTextContent(
+ $separator
+ ),
+ " .,"
)
- );
- continue;
+ ];
}
+ }
+ }
- // US$4.99
- // MYR 50.00
- // $38.34
- // JP¥6,480
- if($is_rating === 2){
+ // detect huge buttons
+ $buttons =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "display" => "table-cell",
+ "vertical-align" => "middle",
+ "height" => "52px",
+ "text-align" => "center"
+ ],
+ self::is_class
+ ),
+ "a"
+ );
+
+ if(count($buttons) !== 0){
- $web["table"]["Price"] = $innertext;
- continue;
- }
+ foreach($buttons as $button){
- // Android / In stock
- if($is_rating === 4){
+ if(isset($button["attributes"]["href"])){
- $web["table"]["Support"] = $innertext;
- continue;
+ $sublink[
+ $this->fuckhtml
+ ->getTextContent(
+ $button
+ )
+ ] =
+ $this->unshiturl(
+ $button["attributes"]["href"]
+ );
}
-
- // ignore the rest
- continue;
}
-
- //
- // Parse standalone text
- //
- $additional_info[] = $innertext;
}
}
- for($i=0; $i<count($additional_info); $i++){
-
- // @TODO
- // generate better node names
- $web["table"]["Info node #$i"] = $additional_info[$i];
- }
-
- $this->fuckhtml->load($description);
-
- // get date node
- $span =
- $this->fuckhtml
- ->getElementsByTagName(
- "span"
- );
-
- if(count($span) !== 0){
-
- $description =
- str_replace(
- $span[0]["outerHTML"],
- "",
- $description
- );
-
- $span =
- strtotime(
- $this->fuckhtml
- ->getTextContent(
- $span[0]
- )
- );
-
- if($span){
-
- $web["date"] = $span;
- }
- }
+ // append description_after (contains carousel info)
+ $description = array_merge(
+ $description,
+ $description_after
+ );
- $web["description"] =
- trim(
+ $out["answer"][] = [
+ "title" =>
$this->fuckhtml
->getTextContent(
- $description
+ $wiki_title[0]
),
- " ·."
- );
-
- $out["web"][] = $web;
+ "description" => $description,
+ "url" => $url,
+ "thumb" => $image,
+ "table" => $table,
+ "sublink" => $sublink
+ ];
continue;
}
- // check for container title header
+ //
+ // Detect related searches containers
+ //
$container_title =
$this->fuckhtml
->getElementsByClassName(
@@ -1183,6 +2322,21 @@ class google{
if(count($container_title) !== 0){
+ // get carousel entries
+ $carousels = $this->parsecarousels($container["innerHTML"]);
+ $this->fuckhtml->load($container);
+
+ foreach($carousels as $carousel){
+
+ foreach($carousel as $item){
+
+ if($item["url"] !== null){
+
+ $out["related"][] = $item["url"];
+ }
+ }
+ }
+
$container_title =
strtolower(
$this->fuckhtml
@@ -1191,158 +2345,300 @@ class google{
)
);
- if(
- $container_title == "related searches" ||
- $container_title == "people also search for"
- ){
+ switch($container_title){
- //
- // Parse related searches
- //
- $as =
- $this->fuckhtml
- ->getElementsByTagName("a");
-
- foreach($as as $a){
+ case "related searches":
+ case "people also search for":
+ //
+ // Parse related searches
+ //
+ $as =
+ $this->fuckhtml
+ ->getElementsByTagName("a");
- $out["related"][] =
+ foreach($as as $a){
+
+ $out["related"][] =
+ $this->fuckhtml
+ ->getTextContent($a);
+ }
+ break;
+
+ case "people also ask":
+ // get related queries
+ $divs =
$this->fuckhtml
- ->getTextContent($a);
- }
+ ->getElementsByTagName("div");
+
+ foreach($divs as $div){
+
+ // add accdef's here
+ if($has_appended_accdef === false){
+
+ $out["web"] = array_merge($out["web"], $accdefs);
+ $has_appended_accdef = true;
+ }
+
+ // add accdef's questions
+ if(isset($div["attributes"]["role"])){
+
+ $out["related"][] =
+ $this->fuckhtml
+ ->getTextContent($div);
+
+ continue;
+ }
+ }
+ break;
}
continue;
}
//
- // Parse image carousel
+ // Parse news
//
- $title_container =
+ $title =
$this->fuckhtml
->getElementsByClassName(
$this->findstyles(
[
- "padding" => "12px 16px 12px"
+ "font-size" => "16px",
+ "line-height" => "20px",
+ "font-weight" => "400"
],
self::is_class
),
"div"
);
- if(count($title_container) !== 0){
+ if(count($title) !== 0){
- $title_container =
+ $carousels = $this->parsecarousels();
+ $this->fuckhtml->load($container);
+
+ if(count($carousels) === 0){
+
+ // no carousels found
+ continue;
+ }
+
+ $title =
strtolower(
$this->fuckhtml
->getTextContent(
- $title_container[0]
+ $title[0]
)
);
- if($title_container == "imagesview all"){
-
- //
- // Image carousel
- //
- $pcitem =
- $this->fuckhtml
- ->getElementsByClassName(
- "pcitem",
- "div"
- );
+ if(
+ preg_match(
+ '/^latest from|^top stories/',
+ $title
+ )
+ ){
- foreach($pcitem as $item){
-
- $this->fuckhtml->load($item);
-
- $link =
- $this->fuckhtml
- ->getElementsByTagName(
- "a"
- )[0];
-
- parse_str(
- parse_url(
- $this->fuckhtml
- ->getTextContent(
- $link
- ["attributes"]
- ["href"]
- ),
- PHP_URL_QUERY
- ),
- $link
- );
+ // Found news article
+ foreach($carousels[0] as $carousel){
- if(isset($link["tbm"])){
+ if($carousel["image"] !== null){
- continue;
+ $thumb = [
+ "url" => $carousel["image"],
+ "ratio" => "16:9"
+ ];
+ }else{
+
+ $thumb = [
+ "url" => null,
+ "ratio" => null
+ ];
}
- $image =
- $this->fuckhtml
- ->getElementsByTagName("img")[0];
-
- $title =
- $this->fuckhtml
- ->getTextContent(
- $image
- ["attributes"]
- ["alt"]
- );
-
- $image =
- $this->getimage(
- $image
- ["attributes"]
- ["id"]
- );
+ $out["news"][] = [
+ "title" => $carousel["title"],
+ "description" => $carousel["description"],
+ "date" => $carousel["date"],
+ "thumb" => $thumb,
+ "url" => $carousel["url"]
+ ];
+ }
+ }
+
+ elseif(
+ $title == "images"
+ ){
+
+ foreach($carousels as $carousel){
- $out["image"][] = [
- "title" => $title,
- "source" => [
- [
- "url" => $link["imgurl"],
- "width" => (int)$link["w"],
- "height" => (int)$link["h"]
+ foreach($carousel as $item){
+
+ $out["image"][] = [
+ "title" => $item["title"],
+ "source" => [
+ [
+ "url" => $item["url"],
+ "width" => $item["image_width"],
+ "height" => $item["image_height"]
+ ],
+ [
+ "url" => $item["image"],
+ "width" => $item["thumb_width"],
+ "height" => $item["thumb_height"]
+ ]
],
- [
- "url" => $image,
- "width" => (int)$link["tbnw"],
- "height" => (int)$link["tbnh"]
- ]
- ],
- "url" => $link["imgrefurl"]
- ];
+ "url" => $item["ref"]
+ ];
+ }
}
}
+
+ continue;
}
//
- // Get next page
+ // Detect nodes with only text + links
//
+
+ // ignore elements with <style> tags
+ $style =
+ $this->fuckhtml
+ ->getElementsByTagName("style");
+
+ if(count($style) !== 0){
+
+ continue;
+ }
+
$as =
$this->fuckhtml
->getElementsByTagName("a");
+ $description = [];
+
foreach($as as $a){
+ //
+ // Detect next page
+ //
if(
isset($a["attributes"]["aria-label"]) &&
strtolower($a["attributes"]["aria-label"]) == "next page"
){
$out["npt"] =
+ $this->backend->store(
+ $this->fuckhtml
+ ->getTextContent(
+ $a["attributes"]["href"]
+ ),
+ $pagetype,
+ $ip
+ );
+ continue 2;
+ }
+
+ //
+ // Parse as text node
+ //
+ $container["innerHTML"] =
+ explode(
+ $a["outerHTML"],
+ $container["innerHTML"],
+ 2
+ );
+
+ $before =
+ $this->fuckhtml
+ ->getTextContent(
+ $container["innerHTML"][0],
+ false,
+ false
+ );
+
+ // set after
+ if(count($container["innerHTML"]) === 2){
+
+ $container["innerHTML"] =
+ $container["innerHTML"][1];
+ }else{
+
+ $container["innerHTML"] = "";
+ }
+
+ if($before != ""){
+
+ $description[] = [
+ "type" => "text",
+ "value" => $before
+ ];
+ }
+
+ // add link
+ $description[] = [
+ "type" => "link",
+ "url" =>
+ $this->unshiturl(
+ $a["attributes"]
+ ["href"]
+ ),
+ "value" =>
$this->fuckhtml
->getTextContent(
- $a["attributes"]["href"]
- );
+ $a
+ )
+ ];
+ }
+
+ if($container["innerHTML"] != ""){
+
+ $description[] = [
+ "type" => "text",
+ "value" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $container["innerHTML"]
+ )
+ ];
+ }
+
+ $out["answer"][] = [
+ "title" => "Notice",
+ "description" => $description,
+ "url" => null,
+ "thumb" => null,
+ "table" => [],
+ "sublink" => []
+ ];
+ }
+
+ //
+ // remove duplicate web links cause instant answers
+ // sometimes contains duplicates
+ //
+ $c = count($out["web"]);
+ $links = [];
+
+ for($i=0; $i<$c; $i++){
+
+ foreach($links as $link){
+
+ if($out["web"][$i]["url"] == $link){
+
+ unset($out["web"][$i]);
+ continue 2;
}
}
+
+ $links[] = $out["web"][$i]["url"];
}
+ $out["web"] = array_values($out["web"]);
+
return $out;
- }*/
+ }
+
+
public function image($get){
@@ -1691,7 +2987,7 @@ class google{
array_merge(
$this->computedstyle,
$this->parsestyles($style["innerHTML"])
- );
+ );
}
// get images in javascript var
@@ -1723,18 +3019,24 @@ class google{
}
preg_match_all(
- '/var s=\'(data:image[^\']+)\';var i=\[\'([^\']+)\'];/',
+ '/var s=\'(data:image[^\']+)\';var i=\[(\'[^\;]*\')];/',
$script["innerHTML"],
$image_grep
);
if(count($image_grep[0]) !== 0){
- $this->js_image[trim($image_grep[2][0])] =
+ $items = explode(",", $image_grep[2][0]);
+ $value =
$this->fuckhtml
->getTextContent(
$image_grep[1][0]
);
+
+ foreach($items as $item){
+
+ $this->js_image[trim($item, "' ")] = $value;
+ }
}
// even more javascript crap
@@ -1748,7 +3050,7 @@ class google{
for($i=0; $i<count($ask_grep[0]); $i++){
$this->ask[trim($ask_grep[1][$i])] =
- stripcslashes(
+ $this->fuckhtml->parseJsString(
$ask_grep[2][$i]
);
}
@@ -1831,43 +3133,321 @@ class google{
+ //
+ // Resolve an <img> element id to the image URL/data captured from the
+ // page's javascript ($this->js_image).
+ //
+ // Returns the decoded string (protocol-relative "//..." values get an
+ // "https:" prefix), or null when the decoded value is rejected by the
+ // check below. When $id is not present in $this->js_image no return
+ // statement is reached, which PHP also evaluates to null.
+ //
private function getimage($id){
- if(
- isset($this->js_image[$id]) &&
- $this->js_image[$id] != ""
- ){
+ if(isset($this->js_image[$id])){
+
+ // stored value is a javascript string literal, decode it first
+ $return = $this->fuckhtml->parseJsString($this->js_image[$id]);
- if(stripos($this->js_image[$id], "data:image") !== false){
+ // NOTE(review): both operands compare against the empty string;
+ // presumably one of them was meant to be a placeholder image
+ // value -- confirm against upstream before simplifying
+ if(
+ $return != "" &&
+ $return != ""
+ ){
- return
- explode(
- "\\x3d",
- $this->js_image[$id],
- 2
- )[0];
+ // protocol-relative URL ("//host/path"): force https
+ if(
+ preg_match(
+ '/^\/\//',
+ $return
+ )
+ ){
+
+ return 'https:' . $return;
+ }
+
+ return $return;
}
- return $this->js_image[$id];
+ return null;
}
+ }
+
+ //
+ // Parse every carousel found in the node currently loaded in
+ // $this->fuckhtml.
+ //
+ // $item_to_remove (by reference, optional): HTML string from which the
+ // outerHTML of each carousel is stripped; pass nothing/false to skip.
+ //
+ // Returns a sequentially indexed list of carousels. Each carousel is a
+ // non-empty list of items with the keys: url, ref, image, thumb_width,
+ // thumb_height, image_width, image_height, title, description, subtext
+ // and date. Carousels that yield no item are omitted.
+ //
+ // This method loads carousel and item nodes into $this->fuckhtml, so
+ // the caller has to load() its own node again afterwards.
+ //
+ private function parsecarousels(&$item_to_remove = false){
- return null;
+ $carousels =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "padding" => "16px",
+ "position" => "relative"
+ ],
+ self::is_class
+ )
+ );
+
+ $return = [];
+
+ for($i=0; $i<count($carousels); $i++){
+
+ if(!isset($carousels[$i]["outerHTML"])){
+
+ continue;
+ }
+
+ $this->fuckhtml->load($carousels[$i]);
+
+ if($item_to_remove !== false){
+
+ $item_to_remove =
+ str_replace(
+ $carousels[$i]["outerHTML"],
+ "",
+ $item_to_remove
+ );
+ }
+
+ $pcitems =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "pcitem",
+ "div"
+ );
+
+ foreach($pcitems as $pcitem){
+
+ $this->fuckhtml->load($pcitem);
+
+ $out = [
+ "url" => null,
+ "ref" => null,
+ "image" => null,
+ "thumb_width" => null,
+ "thumb_height" => null,
+ "image_width" => null,
+ "image_height" => null,
+ "title" => null,
+ "description" => null,
+ "subtext" => null,
+ "date" => null
+ ];
+
+ $url =
+ $this->unshiturl(
+ $this->fuckhtml
+ ->getElementsByTagName("a")
+ [0]
+ ["attributes"]
+ ["href"],
+ true
+ );
+
+ // set ref
+ $out["ref"] = $url["ref"];
+
+ // set url
+ $out["url"] = $url["url"];
+
+ // set sizes
+ $out["thumb_width"] = $url["thumb_width"];
+ $out["thumb_height"] = $url["thumb_height"];
+ $out["image_width"] = $url["image_width"];
+ $out["image_height"] = $url["image_height"];
+
+ // get image
+ $out["image"] =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "img"
+ );
+
+ if(count($out["image"]) !== 0){
+
+ // get title from image
+ if(isset($out["image"][0]["attributes"]["alt"])){
+
+ $out["title"] =
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $out["image"][0]["attributes"]["alt"]
+ )
+ );
+ }
+
+ // get image url
+ if(isset($out["image"][0]["attributes"]["id"])){
+
+ $out["image"] = $this->getimage($out["image"][0]["attributes"]["id"]);
+ }
+
+ elseif(isset($out["image"][0]["attributes"]["data-ll"])){
+
+ $out["image"] =
+ $this->fuckhtml
+ ->getTextContent(
+ $out["image"][0]["attributes"]["data-ll"]
+ );
+ }else{
+
+ // failed to get image information
+ $out["image"] = null;
+ }
+
+ if($out["image"] == ''){
+
+ // found arrow image base64, skip entry
+ continue;
+ }
+ }else{
+
+ // Could not find any image in node
+ $out["image"] = null;
+ }
+
+ // get title from spans
+ $title =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "color" => "#1967d2"
+ ],
+ self::is_class
+ ),
+ "span"
+ );
+
+ if(count($title) !== 0){
+
+ $out["title"] =
+ $this->fuckhtml
+ ->getTextContent(
+ $title[0]
+ );
+ }
+
+ // get textnodes
+ $textnodes =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->findstyles(
+ [
+ "white-space" => "pre-line",
+ "word-wrap" => "break-word"
+ ],
+ self::is_class
+ )
+ );
+
+ $subtext = null;
+
+ if(count($textnodes) !== 0){
+
+ // get date
+ $date =
+ $this->fuckhtml
+ ->getTextContent(
+ $textnodes[count($textnodes) - 1],
+ true
+ );
+
+ // the last text node is only the title again: no date.
+ // compare against the title string; $title is the array
+ // of matched <span> nodes and never equals a string
+ if(str_replace("\n", " ", $date) == $out["title"]){
+
+ $date = null;
+ }else{
+
+ if(strpos($date, "\n") !== false){
+
+ $date = explode("\n", $date);
+ $date = $date[count($date) - 1];
+ }
+ elseif(strpos($date, "•") !== false){
+
+ $date = explode("•", $date);
+ $date = ltrim($date[count($date) - 1]);
+ }else{
+
+ $date = null;
+ }
+ }
+
+ if($date !== null){
+
+ $date = strtotime($date);
+ }
+
+ // get description
+ $description =
+ $this->fuckhtml
+ ->getTextContent(
+ $textnodes[0]
+ );
+
+ if($out["title"] === null){
+
+ if($date === null){
+
+ $out["title"] = $description;
+ $description = null;
+ }else{
+
+ $out["title"] = parse_url($out["url"], PHP_URL_HOST);
+ }
+ }
+
+ if(isset($textnodes[1])){
+
+ $out["subtext"] =
+ $this->fuckhtml
+ ->getTextContent(
+ $textnodes[1]
+ );
+ }
+
+ }else{
+
+ $date = null;
+ $description = null;
+ }
+
+ $out["date"] = $date;
+ $out["description"] = $this->titledots($description);
+
+ if($out["url"] === null){
+
+ $out["url"] = $out["title"];
+ }
+
+ if($out["title"] == $out["description"]){
+
+ $out["description"] = null;
+ }
+
+ $return[$i][] = $out;
+ }
+ }
+
+ // skipped or empty carousels leave gaps in the keys; reindex so
+ // callers can rely on $carousels[0] and count()-based loops
+ return array_values($return);
}
- private function decodeurl($url){
+ private function unshiturl($url, $return_size = false){
- preg_match(
- '/^\/url\?q=([^&]+)|^\/interstitial\?url=([^&]+)/',
+ // get parameters from URL
+ $url =
$this->fuckhtml
- ->getTextContent($url),
- $match
- );
+ ->getTextContent($url);
- if(count($match) === 0){
+ $newurl = parse_url($url, PHP_URL_QUERY);
+
+ if($newurl == ""){
- return null;
+ // probably telephone number
+ return $url;
}
- $url = empty($match[1]) ? urldecode($match[2]) : urldecode($match[1]);
+ $url = $newurl;
+ unset($newurl);
+ parse_str($url, $query);
+
+ if(isset($query["imgurl"])){
+
+ $url = $query["imgurl"];
+ }
+ elseif(isset($query["q"])){
+
+ $url = $query["q"];
+ }
+
+ // rewrite URLs to remove extra tracking parameters
$domain = parse_url($url, PHP_URL_HOST);
if(
@@ -1889,7 +3469,7 @@ class google{
);
}
- if(
+ elseif(
preg_match(
'/imdb\.com$|youtube\.[^.]+$/',
$domain
@@ -1909,6 +3489,101 @@ class google{
}
+ elseif(
+ preg_match(
+ '/play\.google\.[^.]+$/',
+ $domain
+ )
+ ){
+
+ // remove referrers from play.google.com
+ // the target URL's parameters go into $params: $query (parsed from
+ // the google link itself) is still read by the $return_size block below
+ $oldquery = parse_url($url, PHP_URL_QUERY);
+ if($oldquery !== null){
+
+ parse_str($oldquery, $params);
+
+ // unset() on a missing key is a no-op, no isset() needed
+ unset(
+ $params["referrer"],
+ $params["hl"],
+ $params["gl"]
+ );
+
+ $url =
+ str_replace(
+ $oldquery,
+ http_build_query($params),
+ $url
+ );
+ }
+ }
+
+ elseif(
+ preg_match(
+ '/twitter\.com$/',
+ $domain
+ )
+ ){
+
+ // remove more referrers from twitter.com
+ $oldquery = parse_url($url, PHP_URL_QUERY);
+ if($oldquery !== null){
+
+ parse_str($oldquery, $params);
+ unset($params["ref_src"]);
+
+ $newquery = http_build_query($params);
+
+ if($newquery != ""){
+
+ // the "?" separator is replaced together with the old query,
+ // so put it back in front of whatever parameters remain
+ $newquery = "?" . $newquery;
+ }
+
+ $url =
+ str_replace(
+ '?' . $oldquery,
+ $newquery,
+ $url
+ );
+ }
+ }
+
+ elseif(
+ preg_match(
+ '/maps\.google\.[^.]+/',
+ $domain
+ )
+ ){
+
+ if(stripos($url, "maps?") !== false){
+
+ //https://maps.google.com/maps?daddr=Johnny,+603+Rue+St+Georges,+Saint-J%C3%A9r%C3%B4me,+Quebec+J7Z+5B7
+ $mapsquery = parse_url($url, PHP_URL_QUERY);
+
+ if($mapsquery !== null){
+
+ // keep $query untouched here too, see play.google branch
+ parse_str($mapsquery, $params);
+
+ // daddr[]=... would parse to an array, which urlencode() rejects
+ if(
+ isset($params["daddr"]) &&
+ is_string($params["daddr"])
+ ){
+
+ // only keep the destination address
+ $url =
+ "https://maps.google.com/maps?daddr=" .
+ urlencode($params["daddr"]);
+ }
+ }
+ }
+ }
+
+ if($return_size){
+
+ return [
+ "url" => $url,
+ "ref" => isset($query["imgrefurl"]) ? $query["imgrefurl"] : null,
+ "thumb_width" => isset($query["tbnw"]) ? (int)$query["tbnw"] : null,
+ "thumb_height" => isset($query["tbnh"]) ? (int)$query["tbnh"] : null,
+ "image_width" => isset($query["w"]) ? (int)$query["w"] : null,
+ "image_height" => isset($query["h"]) ? (int)$query["h"] : null
+ ];
+ }
+
return $url;
}
@@ -1924,6 +3599,6 @@ class google{
private function titledots($title){
- return rtrim($title, ".… \t\n\r\0\x0B");
+ // strip trailing dots and whitespace from a title or description.
+ // callers pass null when there is no description; the cast keeps the
+ // "" result while avoiding PHP 8.1's null-argument deprecation.
+ // the mask is applied per byte, so keep it ASCII-only
+ return rtrim((string)$title, ". \t\n\r\0\x0B");
}
}