From 2519666e1c831f85836f9ce6d975d3ceb975c09d Mon Sep 17 00:00:00 2001 From: lolcat Date: Mon, 27 Nov 2023 01:01:56 -0500 Subject: google web, videos and news, various other fixes --- scraper/google.php | 3709 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 2692 insertions(+), 1017 deletions(-) (limited to 'scraper/google.php') diff --git a/scraper/google.php b/scraper/google.php index 055d12a..bf2b0e4 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -16,713 +16,496 @@ class google{ public function getfilters($page){ + $base = [ + "country" => [ // gl= + "display" => "Country", + "option" => [ + "any" => "Instance's country", + "af" => "Afghanistan", + "al" => "Albania", + "dz" => "Algeria", + "as" => "American Samoa", + "ad" => "Andorra", + "ao" => "Angola", + "ai" => "Anguilla", + "aq" => "Antarctica", + "ag" => "Antigua and Barbuda", + "ar" => "Argentina", + "am" => "Armenia", + "aw" => "Aruba", + "au" => "Australia", + "at" => "Austria", + "az" => "Azerbaijan", + "bs" => "Bahamas", + "bh" => "Bahrain", + "bd" => "Bangladesh", + "bb" => "Barbados", + "by" => "Belarus", + "be" => "Belgium", + "bz" => "Belize", + "bj" => "Benin", + "bm" => "Bermuda", + "bt" => "Bhutan", + "bo" => "Bolivia", + "ba" => "Bosnia and Herzegovina", + "bw" => "Botswana", + "bv" => "Bouvet Island", + "br" => "Brazil", + "io" => "British Indian Ocean Territory", + "bn" => "Brunei Darussalam", + "bg" => "Bulgaria", + "bf" => "Burkina Faso", + "bi" => "Burundi", + "kh" => "Cambodia", + "cm" => "Cameroon", + "ca" => "Canada", + "cv" => "Cape Verde", + "ky" => "Cayman Islands", + "cf" => "Central African Republic", + "td" => "Chad", + "cl" => "Chile", + "cn" => "China", + "cx" => "Christmas Island", + "cc" => "Cocos (Keeling) Islands", + "co" => "Colombia", + "km" => "Comoros", + "cg" => "Congo", + "cd" => "Congo, the Democratic Republic", + "ck" => "Cook Islands", + "cr" => "Costa Rica", + "ci" => "Cote D'ivoire", + "hr" => "Croatia", + "cu" => "Cuba", + "cy" => "Cyprus", + "cz" => "Czech Republic", + "dk" => "Denmark", + "dj" => "Djibouti", + "dm" => "Dominica", + "do" => "Dominican Republic", + "ec" => "Ecuador", + "eg" => "Egypt", + "sv" => "El Salvador", + "gq" => "Equatorial Guinea", + "er" => "Eritrea", + "ee" => "Estonia", + "et" => "Ethiopia", + "fk" => "Falkland Islands (Malvinas)", + "fo" => "Faroe Islands", + "fj" => "Fiji", + "fi" => "Finland", + "fr" => "France", + "gf" => "French Guiana", + "pf" => "French Polynesia", + "tf" => "French Southern Territories", + "ga" => "Gabon", + "gm" => "Gambia", + "ge" => "Georgia", + "de" => "Germany", + "gh" => "Ghana", + "gi" => "Gibraltar", + "gr" => "Greece", + "gl" => "Greenland", + "gd" => "Grenada", + "gp" => "Guadeloupe", + "gu" => "Guam", + "gt" => "Guatemala", + "gn" => "Guinea", + "gw" => "Guinea-Bissau", + "gy" => "Guyana", + "ht" => "Haiti", + "hm" => "Heard Island and Mcdonald Islands", + "va" => "Holy See (Vatican City State)", + "hn" => "Honduras", + "hk" => "Hong Kong", + "hu" => "Hungary", + "is" => "Iceland", + "in" => "India", + "id" => "Indonesia", + "ir" => "Iran, Islamic Republic", + "iq" => "Iraq", + "ie" => "Ireland", + "il" => "Israel", + "it" => "Italy", + "jm" => "Jamaica", + "jp" => "Japan", + "jo" => "Jordan", + "kz" => "Kazakhstan", + "ke" => "Kenya", + "ki" => "Kiribati", + "kp" => "Korea, Democratic People's Republic", + "kr" => "Korea, Republic", + "kw" => "Kuwait", + "kg" => "Kyrgyzstan", + "la" => "Lao People's Democratic Republic", + "lv" => "Latvia", + "lb" => "Lebanon", + "ls" => "Lesotho", + "lr" => "Liberia", + "ly" => "Libyan Arab Jamahiriya", + "li" => "Liechtenstein", + "lt" => "Lithuania", + "lu" => "Luxembourg", + "mo" => "Macao", + "mk" => "Macedonia, the Former Yugosalv Republic", + "mg" => "Madagascar", + "mw" => "Malawi", + "my" => "Malaysia", + "mv" => "Maldives", + "ml" => "Mali", + "mt" => "Malta", + "mh" => "Marshall Islands", + "mq" => "Martinique", + "mr" => "Mauritania", + "mu" => "Mauritius", + "yt" => "Mayotte", + "mx" => "Mexico", + "fm" => "Micronesia, Federated States", + "md" => "Moldova, Republic", + "mc" => "Monaco", + "mn" => "Mongolia", + "ms" => "Montserrat", + "ma" => "Morocco", + "mz" => "Mozambique", + "mm" => "Myanmar", + "na" => "Namibia", + "nr" => "Nauru", + "np" => "Nepal", + "nl" => "Netherlands", + "an" => "Netherlands Antilles", + "nc" => "New Caledonia", + "nz" => "New Zealand", + "ni" => "Nicaragua", + "ne" => "Niger", + "ng" => "Nigeria", + "nu" => "Niue", + "nf" => "Norfolk Island", + "mp" => "Northern Mariana Islands", + "no" => "Norway", + "om" => "Oman", + "pk" => "Pakistan", + "pw" => "Palau", + "ps" => "Palestinian Territory, Occupied", + "pa" => "Panama", + "pg" => "Papua New Guinea", + "py" => "Paraguay", + "pe" => "Peru", + "ph" => "Philippines", + "pn" => "Pitcairn", + "pl" => "Poland", + "pt" => "Portugal", + "pr" => "Puerto Rico", + "qa" => "Qatar", + "re" => "Reunion", + "ro" => "Romania", + "ru" => "Russian Federation", + "rw" => "Rwanda", + "sh" => "Saint Helena", + "kn" => "Saint Kitts and Nevis", + "lc" => "Saint Lucia", + "pm" => "Saint Pierre and Miquelon", + "vc" => "Saint Vincent and the Grenadines", + "ws" => "Samoa", + "sm" => "San Marino", + "st" => "Sao Tome and Principe", + "sa" => "Saudi Arabia", + "sn" => "Senegal", + "cs" => "Serbia and Montenegro", + "sc" => "Seychelles", + "sl" => "Sierra Leone", + "sg" => "Singapore", + "sk" => "Slovakia", + "si" => "Slovenia", + "sb" => "Solomon Islands", + "so" => "Somalia", + "za" => "South Africa", + "gs" => "South Georgia and the South Sandwich Islands", + "es" => "Spain", + "lk" => "Sri Lanka", + "sd" => "Sudan", + "sr" => "Suriname", + "sj" => "Svalbard and Jan Mayen", + "sz" => "Swaziland", + "se" => "Sweden", + "ch" => "Switzerland", + "sy" => "Syrian Arab Republic", + "tw" => "Taiwan, Province of China", + "tj" => "Tajikistan", + "tz" => "Tanzania, United Republic", + "th" => "Thailand", + "tl" => "Timor-Leste", + "tg" => "Togo", + "tk" => "Tokelau", + "to" => "Tonga", + "tt" => "Trinidad and Tobago", + "tn" => "Tunisia", + "tr" => "Turkey", + "tm" => "Turkmenistan", + "tc" => "Turks and Caicos Islands", + "tv" => "Tuvalu", + "ug" => "Uganda", + "ua" => "Ukraine", + "ae" => "United Arab Emirates", + "uk" => "United Kingdom", + "us" => "United States", + "um" => "United States Minor Outlying Islands", + "uy" => "Uruguay", + "uz" => "Uzbekistan", + "vu" => "Vanuatu", + "ve" => "Venezuela", + "vn" => "Viet Nam", + "vg" => "Virgin Islands, British", + "vi" => "Virgin Islands, U.S.", + "wf" => "Wallis and Futuna", + "eh" => "Western Sahara", + "ye" => "Yemen", + "zm" => "Zambia", + "zw" => "Zimbabwe" + ] + ], + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", // safe=active + "no" => "No" // safe=off + ] + ], + "lang" => [ // lr= (prefix lang with "lang_") + "display" => "Language", + "option" => [ + "any" => "Any language", + "ar" => "Arabic", + "bg" => "Bulgarian", + "ca" => "Catalan", + "cs" => "Czech", + "da" => "Danish", + "de" => "German", + "el" => "Greek", + "en" => "English", + "es" => "Spanish", + "et" => "Estonian", + "fi" => "Finnish", + "fr" => "French", + "hr" => "Croatian", + "hu" => "Hungarian", + "id" => "Indonesian", + "is" => "Icelandic", + "it" => "Italian", + "iw" => "Hebrew", + "ja" => "Japanese", + "ko" => "Korean", + "lt" => "Lithuanian", + "lv" => "Latvian", + "nl" => "Dutch", + "no" => "Norwegian", + "pl" => "Polish", + "pt" => "Portuguese", + "ro" => "Romanian", + "ru" => "Russian", + "sk" => "Slovak", + "sl" => "Slovenian", + "sr" => "Serbian", + "sv" => "Swedish", + "tr" => "Turkish", + "zh-CN" => "Chinese (Simplified)", + "zh-TW" => "Chinese (Traditional)" + ] + ] + ]; + switch($page){ case "web": - case "videos": - case "news": - return [ - "country" => [ // gl= - "display" => "Country", - "option" => [ - "any" => "Instance's country", - "af" => "Afghanistan", - "al" => "Albania", - "dz" => "Algeria", - "as" => "American Samoa", - "ad" => "Andorra", - "ao" => "Angola", - "ai" => "Anguilla", - "aq" => "Antarctica", - "ag" => "Antigua and Barbuda", - "ar" => "Argentina", - "am" => "Armenia", - "aw" => "Aruba", - "au" => "Australia", - "at" => "Austria", - "az" => "Azerbaijan", - "bs" => "Bahamas", - "bh" => "Bahrain", - "bd" => "Bangladesh", - "bb" => "Barbados", - "by" => "Belarus", - "be" => "Belgium", - "bz" => "Belize", - "bj" => "Benin", - "bm" => "Bermuda", - "bt" => "Bhutan", - "bo" => "Bolivia", - "ba" => "Bosnia and Herzegovina", - "bw" => "Botswana", - "bv" => "Bouvet Island", - "br" => "Brazil", - "io" => "British Indian Ocean Territory", - "bn" => "Brunei Darussalam", - "bg" => "Bulgaria", - "bf" => "Burkina Faso", - "bi" => "Burundi", - "kh" => "Cambodia", - "cm" => "Cameroon", - "ca" => "Canada", - "cv" => "Cape Verde", - "ky" => "Cayman Islands", - "cf" => "Central African Republic", - "td" => "Chad", - "cl" => "Chile", - "cn" => "China", - "cx" => "Christmas Island", - "cc" => "Cocos (Keeling) Islands", - "co" => "Colombia", - "km" => "Comoros", - "cg" => "Congo", - "cd" => "Congo, the Democratic Republic", - "ck" => "Cook Islands", - "cr" => "Costa Rica", - "ci" => "Cote D'ivoire", - "hr" => "Croatia", - "cu" => "Cuba", - "cy" => "Cyprus", - "cz" => "Czech Republic", - "dk" => "Denmark", - "dj" => "Djibouti", - "dm" => "Dominica", - "do" => "Dominican Republic", - "ec" => "Ecuador", - "eg" => "Egypt", - "sv" => "El Salvador", - "gq" => "Equatorial Guinea", - "er" => "Eritrea", - "ee" => "Estonia", - "et" => "Ethiopia", - "fk" => "Falkland Islands (Malvinas)", - "fo" => "Faroe Islands", - "fj" => "Fiji", - "fi" => "Finland", - "fr" => "France", - "gf" => "French Guiana", - "pf" => "French Polynesia", - "tf" => "French Southern Territories", - "ga" => "Gabon", - "gm" => "Gambia", - "ge" => "Georgia", - "de" => "Germany", - "gh" => "Ghana", - "gi" => "Gibraltar", - "gr" => "Greece", - "gl" => "Greenland", - "gd" => "Grenada", - "gp" => "Guadeloupe", - "gu" => "Guam", - "gt" => "Guatemala", - "gn" => "Guinea", - "gw" => "Guinea-Bissau", - "gy" => "Guyana", - "ht" => "Haiti", - "hm" => "Heard Island and Mcdonald Islands", - "va" => "Holy See (Vatican City State)", - "hn" => "Honduras", - "hk" => "Hong Kong", - "hu" => "Hungary", - "is" => "Iceland", - "in" => "India", - "id" => "Indonesia", - "ir" => "Iran, Islamic Republic", - "iq" => "Iraq", - "ie" => "Ireland", - "il" => "Israel", - "it" => "Italy", - "jm" => "Jamaica", - "jp" => "Japan", - "jo" => "Jordan", - "kz" => "Kazakhstan", - "ke" => "Kenya", - "ki" => "Kiribati", - "kp" => "Korea, Democratic People's Republic", - "kr" => "Korea, Republic", - "kw" => "Kuwait", - "kg" => "Kyrgyzstan", - "la" => "Lao People's Democratic Republic", - "lv" => "Latvia", - "lb" => "Lebanon", - "ls" => "Lesotho", - "lr" => "Liberia", - "ly" => "Libyan Arab Jamahiriya", - "li" => "Liechtenstein", - "lt" => "Lithuania", - "lu" => "Luxembourg", - "mo" => "Macao", - "mk" => "Macedonia, the Former Yugosalv Republic", - "mg" => "Madagascar", - "mw" => "Malawi", - "my" => "Malaysia", - "mv" => "Maldives", - "ml" => "Mali", - "mt" => "Malta", - "mh" => "Marshall Islands", - "mq" => "Martinique", - "mr" => "Mauritania", - "mu" => "Mauritius", - "yt" => "Mayotte", - "mx" => "Mexico", - "fm" => "Micronesia, Federated States", - "md" => "Moldova, Republic", - "mc" => "Monaco", - "mn" => "Mongolia", - "ms" => "Montserrat", - "ma" => "Morocco", - "mz" => "Mozambique", - "mm" => "Myanmar", - "na" => "Namibia", - "nr" => "Nauru", - "np" => "Nepal", - "nl" => "Netherlands", - "an" => "Netherlands Antilles", - "nc" => "New Caledonia", - "nz" => "New Zealand", - "ni" => "Nicaragua", - "ne" => "Niger", - "ng" => "Nigeria", - "nu" => "Niue", - "nf" => "Norfolk Island", - "mp" => "Northern Mariana Islands", - "no" => "Norway", - "om" => "Oman", - "pk" => "Pakistan", - "pw" => "Palau", - "ps" => "Palestinian Territory, Occupied", - "pa" => "Panama", - "pg" => "Papua New Guinea", - "py" => "Paraguay", - "pe" => "Peru", - "ph" => "Philippines", - "pn" => "Pitcairn", - "pl" => "Poland", - "pt" => "Portugal", - "pr" => "Puerto Rico", - "qa" => "Qatar", - "re" => "Reunion", - "ro" => "Romania", - "ru" => "Russian Federation", - "rw" => "Rwanda", - "sh" => "Saint Helena", - "kn" => "Saint Kitts and Nevis", - "lc" => "Saint Lucia", - "pm" => "Saint Pierre and Miquelon", - "vc" => "Saint Vincent and the Grenadines", - "ws" => "Samoa", - "sm" => "San Marino", - "st" => "Sao Tome and Principe", - "sa" => "Saudi Arabia", - "sn" => "Senegal", - "cs" => "Serbia and Montenegro", - "sc" => "Seychelles", - "sl" => "Sierra Leone", - "sg" => "Singapore", - "sk" => "Slovakia", - "si" => "Slovenia", - "sb" => "Solomon Islands", - "so" => "Somalia", - "za" => "South Africa", - "gs" => "South Georgia and the South Sandwich Islands", - "es" => "Spain", - "lk" => "Sri Lanka", - "sd" => "Sudan", - "sr" => "Suriname", - "sj" => "Svalbard and Jan Mayen", - "sz" => "Swaziland", - "se" => "Sweden", - "ch" => "Switzerland", - "sy" => "Syrian Arab Republic", - "tw" => "Taiwan, Province of China", - "tj" => "Tajikistan", - "tz" => "Tanzania, United Republic", - "th" => "Thailand", - "tl" => "Timor-Leste", - "tg" => "Togo", - "tk" => "Tokelau", - "to" => "Tonga", - "tt" => "Trinidad and Tobago", - "tn" => "Tunisia", - "tr" => "Turkey", - "tm" => "Turkmenistan", - "tc" => "Turks and Caicos Islands", - "tv" => "Tuvalu", - "ug" => "Uganda", - "ua" => "Ukraine", - "ae" => "United Arab Emirates", - "uk" => "United Kingdom", - "us" => "United States", - "um" => "United States Minor Outlying Islands", - "uy" => "Uruguay", - "uz" => "Uzbekistan", - "vu" => "Vanuatu", - "ve" => "Venezuela", - "vn" => "Viet Nam", - "vg" => "Virgin Islands, British", - "vi" => "Virgin Islands, U.S.", - "wf" => "Wallis and Futuna", - "eh" => "Western Sahara", - "ye" => "Yemen", - "zm" => "Zambia", - "zw" => "Zimbabwe" - ] - ], - "nsfw" => [ - "display" => "NSFW", - "option" => [ - "yes" => "Yes", // safe=active - "no" => "No" // safe=off - ] - ], - "lang" => [ // lr= (prefix lang with "lang_") - "display" => "Language", - "option" => [ - "any" => "Any language", - "ar" => "Arabic", - "bg" => "Bulgarian", - "ca" => "Catalan", - "cs" => "Czech", - "da" => "Danish", - "de" => "German", - "el" => "Greek", - "en" => "English", - "es" => "Spanish", - "et" => "Estonian", - "fi" => "Finnish", - "fr" => "French", - "hr" => "Croatian", - "hu" => "Hungarian", - "id" => "Indonesian", - "is" => "Icelandic", - "it" => "Italian", - "iw" => "Hebrew", - "ja" => "Japanese", - "ko" => "Korean", - "lt" => "Lithuanian", - "lv" => "Latvian", - "nl" => "Dutch", - "no" => "Norwegian", - "pl" => "Polish", - "pt" => "Portuguese", - "ro" => "Romanian", - "ru" => "Russian", - "sk" => "Slovak", - "sl" => "Slovenian", - "sr" => "Serbian", - "sv" => "Swedish", - "tr" => "Turkish", - "zh-CN" => "Chinese (Simplified)", - "zh-TW" => "Chinese (Traditional)" + return array_merge( + $base, + [ + "newer" => [ // &sort=review-date:r:20090301:20090430 + "display" => "Newer than", + "option" => "_DATE" + ], + "older" => [ + "display" => "Older than", + "option" => "_DATE" ] - ], - "newer" => [ // &sort=review-date:r:20090301:20090430 - "display" => "Newer than", - "option" => "_DATE" - ], - "older" => [ - "display" => "Older than", - "option" => "_DATE" ] - ]; + ); break; case "images": - return [ - "country" => [ // gl= - "display" => "Country", - "option" => [ - "any" => "Instance's country", - "af" => "Afghanistan", - "al" => "Albania", - "dz" => "Algeria", - "as" => "American Samoa", - "ad" => "Andorra", - "ao" => "Angola", - "ai" => "Anguilla", - "aq" => "Antarctica", - "ag" => "Antigua and Barbuda", - "ar" => "Argentina", - "am" => "Armenia", - "aw" => "Aruba", - "au" => "Australia", - "at" => "Austria", - "az" => "Azerbaijan", - "bs" => "Bahamas", - "bh" => "Bahrain", - "bd" => "Bangladesh", - "bb" => "Barbados", - "by" => "Belarus", - "be" => "Belgium", - "bz" => "Belize", - "bj" => "Benin", - "bm" => "Bermuda", - "bt" => "Bhutan", - "bo" => "Bolivia", - "ba" => "Bosnia and Herzegovina", - "bw" => "Botswana", - "bv" => "Bouvet Island", - "br" => "Brazil", - "io" => "British Indian Ocean Territory", - "bn" => "Brunei Darussalam", - "bg" => "Bulgaria", - "bf" => "Burkina Faso", - "bi" => "Burundi", - "kh" => "Cambodia", - "cm" => "Cameroon", - "ca" => "Canada", - "cv" => "Cape Verde", - "ky" => "Cayman Islands", - "cf" => "Central African Republic", - "td" => "Chad", - "cl" => "Chile", - "cn" => "China", - "cx" => "Christmas Island", - "cc" => "Cocos (Keeling) Islands", - "co" => "Colombia", - "km" => "Comoros", - "cg" => "Congo", - "cd" => "Congo, the Democratic Republic", - "ck" => "Cook Islands", - "cr" => "Costa Rica", - "ci" => "Cote D'ivoire", - "hr" => "Croatia", - "cu" => "Cuba", - "cy" => "Cyprus", - "cz" => "Czech Republic", - "dk" => "Denmark", - "dj" => "Djibouti", - "dm" => "Dominica", - "do" => "Dominican Republic", - "ec" => "Ecuador", - "eg" => "Egypt", - "sv" => "El Salvador", - "gq" => "Equatorial Guinea", - "er" => "Eritrea", - "ee" => "Estonia", - "et" => "Ethiopia", - "fk" => "Falkland Islands (Malvinas)", - "fo" => "Faroe Islands", - "fj" => "Fiji", - "fi" => "Finland", - "fr" => "France", - "gf" => "French Guiana", - "pf" => "French Polynesia", - "tf" => "French Southern Territories", - "ga" => "Gabon", - "gm" => "Gambia", - "ge" => "Georgia", - "de" => "Germany", - "gh" => "Ghana", - "gi" => "Gibraltar", - "gr" => "Greece", - "gl" => "Greenland", - "gd" => "Grenada", - "gp" => "Guadeloupe", - "gu" => "Guam", - "gt" => "Guatemala", - "gn" => "Guinea", - "gw" => "Guinea-Bissau", - "gy" => "Guyana", - "ht" => "Haiti", - "hm" => "Heard Island and Mcdonald Islands", - "va" => "Holy See (Vatican City State)", - "hn" => "Honduras", - "hk" => "Hong Kong", - "hu" => "Hungary", - "is" => "Iceland", - "in" => "India", - "id" => "Indonesia", - "ir" => "Iran, Islamic Republic", - "iq" => "Iraq", - "ie" => "Ireland", - "il" => "Israel", - "it" => "Italy", - "jm" => "Jamaica", - "jp" => "Japan", - "jo" => "Jordan", - "kz" => "Kazakhstan", - "ke" => "Kenya", - "ki" => "Kiribati", - "kp" => "Korea, Democratic People's Republic", - "kr" => "Korea, Republic", - "kw" => "Kuwait", - "kg" => "Kyrgyzstan", - "la" => "Lao People's Democratic Republic", - "lv" => "Latvia", - "lb" => "Lebanon", - "ls" => "Lesotho", - "lr" => "Liberia", - "ly" => "Libyan Arab Jamahiriya", - "li" => "Liechtenstein", - "lt" => "Lithuania", - "lu" => "Luxembourg", - "mo" => "Macao", - "mk" => "Macedonia, the Former Yugosalv Republic", - "mg" => "Madagascar", - "mw" => "Malawi", - "my" => "Malaysia", - "mv" => "Maldives", - "ml" => "Mali", - "mt" => "Malta", - "mh" => "Marshall Islands", - "mq" => "Martinique", - "mr" => "Mauritania", - "mu" => "Mauritius", - "yt" => "Mayotte", - "mx" => "Mexico", - "fm" => "Micronesia, Federated States", - "md" => "Moldova, Republic", - "mc" => "Monaco", - "mn" => "Mongolia", - "ms" => "Montserrat", - "ma" => "Morocco", - "mz" => "Mozambique", - "mm" => "Myanmar", - "na" => "Namibia", - "nr" => "Nauru", - "np" => "Nepal", - "nl" => "Netherlands", - "an" => "Netherlands Antilles", - "nc" => "New Caledonia", - "nz" => "New Zealand", - "ni" => "Nicaragua", - "ne" => "Niger", - "ng" => "Nigeria", - "nu" => "Niue", - "nf" => "Norfolk Island", - "mp" => "Northern Mariana Islands", - "no" => "Norway", - "om" => "Oman", - "pk" => "Pakistan", - "pw" => "Palau", - "ps" => "Palestinian Territory, Occupied", - "pa" => "Panama", - "pg" => "Papua New Guinea", - "py" => "Paraguay", - "pe" => "Peru", - "ph" => "Philippines", - "pn" => "Pitcairn", - "pl" => "Poland", - "pt" => "Portugal", - "pr" => "Puerto Rico", - "qa" => "Qatar", - "re" => "Reunion", - "ro" => "Romania", - "ru" => "Russian Federation", - "rw" => "Rwanda", - "sh" => "Saint Helena", - "kn" => "Saint Kitts and Nevis", - "lc" => "Saint Lucia", - "pm" => "Saint Pierre and Miquelon", - "vc" => "Saint Vincent and the Grenadines", - "ws" => "Samoa", - "sm" => "San Marino", - "st" => "Sao Tome and Principe", - "sa" => "Saudi Arabia", - "sn" => "Senegal", - "cs" => "Serbia and Montenegro", - "sc" => "Seychelles", - "sl" => "Sierra Leone", - "sg" => "Singapore", - "sk" => "Slovakia", - "si" => "Slovenia", - "sb" => "Solomon Islands", - "so" => "Somalia", - "za" => "South Africa", - "gs" => "South Georgia and the South Sandwich Islands", - "es" => "Spain", - "lk" => "Sri Lanka", - "sd" => "Sudan", - "sr" => "Suriname", - "sj" => "Svalbard and Jan Mayen", - "sz" => "Swaziland", - "se" => "Sweden", - "ch" => "Switzerland", - "sy" => "Syrian Arab Republic", - "tw" => "Taiwan, Province of China", - "tj" => "Tajikistan", - "tz" => "Tanzania, United Republic", - "th" => "Thailand", - "tl" => "Timor-Leste", - "tg" => "Togo", - "tk" => "Tokelau", - "to" => "Tonga", - "tt" => "Trinidad and Tobago", - "tn" => "Tunisia", - "tr" => "Turkey", - "tm" => "Turkmenistan", - "tc" => "Turks and Caicos Islands", - "tv" => "Tuvalu", - "ug" => "Uganda", - "ua" => "Ukraine", - "ae" => "United Arab Emirates", - "uk" => "United Kingdom", - "us" => "United States", - "um" => "United States Minor Outlying Islands", - "uy" => "Uruguay", - "uz" => "Uzbekistan", - "vu" => "Vanuatu", - "ve" => "Venezuela", - "vn" => "Viet Nam", - "vg" => "Virgin Islands, British", - "vi" => "Virgin Islands, U.S.", - "wf" => "Wallis and Futuna", - "eh" => "Western Sahara", - "ye" => "Yemen", - "zm" => "Zambia", - "zw" => "Zimbabwe" - ] - ], - "nsfw" => [ - "display" => "NSFW", - "option" => [ - "yes" => "Yes", // safe=active - "no" => "No" // safe=off - ] - ], - "lang" => [ // lr= (prefix lang with "lang_") - "display" => "Language", - "option" => [ - "any" => "Any language", - "ar" => "Arabic", - "bg" => "Bulgarian", - "ca" => "Catalan", - "cs" => "Czech", - "da" => "Danish", - "de" => "German", - "el" => "Greek", - "en" => "English", - "es" => "Spanish", - "et" => "Estonian", - "fi" => "Finnish", - "fr" => "French", - "hr" => "Croatian", - "hu" => "Hungarian", - "id" => "Indonesian", - "is" => "Icelandic", - "it" => "Italian", - "iw" => "Hebrew", - "ja" => "Japanese", - "ko" => "Korean", - "lt" => "Lithuanian", - "lv" => "Latvian", - "nl" => "Dutch", - "no" => "Norwegian", - "pl" => "Polish", - "pt" => "Portuguese", - "ro" => "Romanian", - "ru" => "Russian", - "sk" => "Slovak", - "sl" => "Slovenian", - "sr" => "Serbian", - "sv" => "Swedish", - "tr" => "Turkish", - "zh-CN" => "Chinese (Simplified)", - "zh-TW" => "Chinese (Traditional)" - ] - ], - "time" => [ // tbs=qrd: - "display" => "Time posted", - "option" => [ - "any" => "Any time", - "d" => "Past 24 hours", - "w" => "Past week", - "m" => "Past month", - "y" => "Past year" - ] - ], - "size" => [ - "display" => "Size", - "option" => [ - // tbs=isz: - "any" => "Any size", - "l" => "Large", - "m" => "Medium", - "i" => "Icon", - // from here - // tbz:lt,islt: - "qsvga" => "Larger than 400x300", - "vga" => "Larger than 640x480", - "qsvga" => "Larger than 800x600", - "xga" => "Larger than 1024x768", - "2mp" => "Larger than 2MP", - "4mp" => "Larger than 4MP", - "6mp" => "Larger than 6MP", - "8mp" => "Larger than 8MP", - "10mp" => "Larger than 10MP", - "12mp" => "Larger than 12MP", - "15mp" => "Larger than 15MP", - "20mp" => "Larger than 20MP", - "40mp" => "Larger than 40MP", - "70mp" => "Larger than 70MP" - ] - ], - "ratio" => [ // tbs=iar: - "display" => "Aspect ratio", - "option" => [ - "any" => "Any ratio", - "t" => "Tall", - "s" => "Square", - "w" => "Wide", - "xw" => "Panoramic" - ] - ], - "color" => [ // tbs=ic: - "display" => "Color", - "option" => [ - "any" => "Any color", - "color" => "Full color", - "gray" => "Black & white", - "trans" => "Transparent", - // from there, its ic:specific,isc: - "red" => "Red", - "orange" => "Orange", - "yellow" => "Yellow", - "green" => "Green", - "teal" => "Teal", - "blue" => "Blue", - "purple" => "Purple", - "pink" => "Pink", - "white" => "White", - "gray" => "Gray", - "black" => "Black", - "brown" => "Brown" - ] - ], - "type" => [ // tbs=itp: - "display" => "Type", - "option" => [ - "any" => "Any type", - "face" => "Faces", - "clipart" => "Clip Art", - "lineart" => "Line Drawing", - "stock" => "Stock", - "animated" => "Animated" + return array_merge( + $base, + [ + "time" => [ // tbs=qrd: + "display" => "Time posted", + "option" => [ + "any" => "Any time", + "d" => "Past 24 hours", + "w" => "Past week", + "m" => "Past month", + "y" => "Past year" + ] + ], + "size" => [ + "display" => "Size", + "option" => [ + // tbs=isz: + "any" => "Any size", + "l" => "Large", + "m" => "Medium", + "i" => "Icon", + // from here + // tbz:lt,islt: + "qsvga" => "Larger than 400x300", + "vga" => "Larger than 640x480", + "qsvga" => "Larger than 800x600", + "xga" => "Larger than 1024x768", + "2mp" => "Larger than 2MP", + "4mp" => "Larger than 4MP", + "6mp" => "Larger than 6MP", + "8mp" => "Larger than 8MP", + "10mp" => "Larger than 10MP", + "12mp" => "Larger than 12MP", + "15mp" => "Larger than 15MP", + "20mp" => "Larger than 20MP", + "40mp" => "Larger than 40MP", + "70mp" => "Larger than 70MP" + ] + ], + "ratio" => [ // tbs=iar: + "display" => "Aspect ratio", + "option" => [ + "any" => "Any ratio", + "t" => "Tall", + "s" => "Square", + "w" => "Wide", + "xw" => "Panoramic" + ] + ], + "color" => [ // tbs=ic: + "display" => "Color", + "option" => [ + "any" => "Any color", + "color" => "Full color", + "gray" => "Black & white", + "trans" => "Transparent", + // from there, its ic:specific,isc: + "red" => "Red", + "orange" => "Orange", + "yellow" => "Yellow", + "green" => "Green", + "teal" => "Teal", + "blue" => "Blue", + "purple" => "Purple", + "pink" => "Pink", + "white" => "White", + "gray" => "Gray", + "black" => "Black", + "brown" => "Brown" + ] + ], + "type" => [ // tbs=itp: + "display" => "Type", + "option" => [ + "any" => "Any type", + "face" => "Faces", + "clipart" => "Clip Art", + "lineart" => "Line Drawing", + "stock" => "Stock", + "animated" => "Animated" + ] + ], + "format" => [ // tbs=ift: + "display" => "Format", + "option" => [ + "any" => "Any format", + "jpg" => "JPG", + "gif" => "GIF", + "png" => "PNG", + "bmp" => "BMP", + "svg" => "SVG", + "webp" => "WEBP", + "ico" => "ICO", + "craw" => "RAW" + ] + ], + "rights" => [ // tbs=il: + "display" => "Usage rights", + "option" => [ + "any" => "Any license", + "cl" => "Creative Commons licenses", + "ol" => "Commercial & other licenses" + ] ] - ], - "format" => [ // tbs=ift: - "display" => "Format", - "option" => [ - "any" => "Any format", - "jpg" => "JPG", - "gif" => "GIF", - "png" => "PNG", - "bmp" => "BMP", - "svg" => "SVG", - "webp" => "WEBP", - "ico" => "ICO", - "craw" => "RAW" + ] + ); + break; + + case "videos": + return array_merge( + $base, + [ + "time" => [ + "display" => "Time posted", + "option" => [ // tbs=qdr + "any" => "Any time", + "h" => "Past hour", + "d" => "Past 24 hours", + "w" => "Past week", + "m" => "Past month", + "y" => "Past year" + ] + ], + "duration" => [ + "display" => "Duration", + "option" => [ + "any" => "Any duration", + "s" => "Short (0-4min)", // tbs=dur:s + "m" => "Medium (4-20min)", // tbs=dur:m + "l" => "Long (20+ min)" // tbs=dur:l + ] + ], + "quality" => [ + "display" => "Quality", + "option" => [ + "any" => "Any quality", + "h" => "High quality" // tbs=hq:h + ] + ], + "captions" => [ + "display" => "Captions", + "option" => [ + "any" => "No preference", + "yes" => "Closed captioned" // tbs=cc:1 + ] ] - ], - "rights" => [ // tbs=il: - "display" => "Usage rights", - "option" => [ - "any" => "Any license", - "cl" => "Creative Commons licenses", - "ol" => "Commercial & other licenses" + ] + ); + break; + + case "news": + return array_merge( + $base, + [ + "time" => [ + "display" => "Time posted", + "option" => [ // tbs=qdr + "any" => "Any time", + "h" => "Past hour", + "d" => "Past 24 hours", + "w" => "Past week", + "m" => "Past month", + "y" => "Past year", + "a" => "Archives" // tbs=ar:1 + ] + ], + "sort" => [ + "display" => "Sort", + "option" => [ + "relevance" => "Relevance", + "date" => "Date" // sbd:1 + ] ] ] - ]; + ); break; } } @@ -773,58 +556,402 @@ class google{ curl_close($curlproc); return $data; } - /* + + + public function web($get){ - $search = $get["s"]; - $country = $get["country"]; - $nsfw = $get["nsfw"]; - $lang = $get["lang"]; - $older = $get["older"]; - $newer = $get["newer"]; + if($get["npt"]){ + + [$req, $ip] = $this->backend->get($get["npt"], "web"); + parse_str( + parse_url($req, PHP_URL_QUERY), + $search + ); + + if(isset($search["q"])){ + + $search = $search["q"]; + }else{ + + $search = "a"; // lol + } + + try{ + $html = + $this->get( + $ip, + "https://www.google.com" . $req, + [] + ); + }catch(Exception $error){ + + throw new Exception("Failed to get HTML"); + } + }else{ + $search = $get["s"]; + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $lang = $get["lang"]; + $older = $get["older"]; + $newer = $get["newer"]; + $ip = $this->backend->get_ip(); + + $params = [ + "q" => $search, + "num" => 20 // get 20 results + ]; + + // country + if($country != "any"){ + + $params["gl"] = $country; + } + + // nsfw + $params["safe"] = $nsfw == "yes" ? "off" : "active"; + + // language + if($lang != "any"){ + + $params["lr"] = "lang_" . $lang; + } + + // &sort=review-date:r:20090301:20090430 + $older = $older === false ? false : date("Ymd", $older); + $newer = $newer === false ? false : date("Ymd", $newer); + + if( + $older !== false && + $newer === false + ){ + + $newer = date("Ymd", time()); + } + + if( + $older !== false || + $newer !== false + ){ + + $params["sort"] = "review-date:r:" . $older . ":" . $newer; + } + + try{ + $html = + $this->get( + $ip, + "https://www.google.com/search", + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to get HTML"); + } + } - $params = [ - "num" => 20 // get 20 results - ]; + return $this->parsepage($html, "web", $search, $ip); + } + + + + public function video($get){ - // country - if($country != "any"){ + if($get["npt"]){ + + [$req, $ip] = $this->backend->get($get["npt"], "videos"); + parse_str( + parse_url($req, PHP_URL_QUERY), + $search + ); + + if(isset($search["q"])){ + + $search = $search["q"]; + }else{ + + $search = "a"; // lol + } + + try{ + + $html = + $this->get( + $ip, + "https://www.google.com" . $req, + [] + ); + }catch(Exception $error){ + + throw new Exception("Failed to get HTML"); + } + + }else{ + $search = $get["s"]; + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $lang = $get["lang"]; + $time = $get["time"]; + $duration = $get["duration"]; + $quality = $get["quality"]; + $captions = $get["captions"]; + $ip = $this->backend->get_ip(); + + $params = [ + "q" => $search, + "tbm" => "vid", + "num" => "20" + ]; + + // country + if($country != "any"){ + + $params["gl"] = $country; + } + + // nsfw + $params["safe"] = $nsfw == "yes" ? "off" : "active"; + + // language + if($lang != "any"){ + + $params["lr"] = "lang_" . $lang; + } + + $tbs = []; + + // time + if($time != "any"){ + + $tbs[] = "qdr:" . $time; + } + + // duration + if($duration != "any"){ + + $tbs[] = "dur:" . $duration; + } + + // quality + if($quality != "any"){ + + $tbs[] = "hq:" . $quality; + } + + // captions + if($captions != "any"){ + + $tbs[] = "cc:" . $captions; + } + + // append tbs + if(count($tbs) !== 0){ + + $params["tbs"] = + implode(",", $tbs); + } - $params["gl"] = $country; + try{ + $html = + $this->get( + $ip, + "https://www.google.com/search", + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to get HTML"); + } } - // nsfw - $params["safe"] = $nsfw == "yes" ? "off" : "active"; + $json = $this->parsepage($html, "videos", $search, $ip); + $out = [ + "status" => "ok", + "npt" => $json["npt"], + "video" => [], + "author" => [], + "livestream" => [], + "playlist" => [], + "reel" => [] + ]; - // language - if($lang != "any"){ + foreach($json["web"] as $item){ - $params["lr"] = "lang_" . $lang; + $out["video"][] = [ + "title" => $item["title"], + "description" => $item["description"], + "author" => [ + "name" => null, + "url" => null, + "avatar" => null + ], + "date" => isset($item["table"]["Posted"]) ? strtotime($item["table"]["Posted"]) : null, + "duration" => isset($item["table"]["Duration"]) ? $this->hms2int($item["table"]["Duration"]) : null, + "views" => null, + "thumb" => + $item["thumb"]["url"] === null ? + [ + "url" => null, + "ratio" => null + ] : + [ + "url" => $item["thumb"]["url"], + "ratio" => "16:9" + ], + "url" => $item["url"] + ]; } - // &sort=review-date:r:20090301:20090430 - $older = $older === false ? false : date("Ymd", $older); - $newer = $newer === false ? false : date("Ymd", $newer); + return $out; + } + + + + public function news($get){ - if( - $older !== false && - $newer === false - ){ + if($get["npt"]){ + + [$req, $ip] = $this->backend->get($get["npt"], "news"); + parse_str( + parse_url($req, PHP_URL_QUERY), + $search + ); - $newer = date("Ymd", time()); + if(isset($search["q"])){ + + $search = $search["q"]; + }else{ + + $search = "a"; // lol + } + + try{ + + $html = + $this->get( + $ip, + "https://www.google.com" . $req, + [] + ); + }catch(Exception $error){ + + throw new Exception("Failed to get HTML"); + } + + }else{ + $search = $get["s"]; + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $lang = $get["lang"]; + $time = $get["time"]; + $sort = $get["sort"]; + $ip = $this->backend->get_ip(); + + $params = [ + "q" => $search, + "tbm" => "nws", + "num" => "20" + ]; + + // country + if($country != "any"){ + + $params["gl"] = $country; + } + + // nsfw + $params["safe"] = $nsfw == "yes" ? "off" : "active"; + + // language + if($lang != "any"){ + + $params["lr"] = "lang_" . $lang; + } + + $tbs = []; + + // time + if($time != "any"){ + + if($time == "a"){ + + $tbs[] = "ar:1"; + }else{ + + $tbs[] = "qdr:" . $time; + } + } + + // relevance + if($sort == "date"){ + + $tbs[] = "sbd:1"; + } + + // append tbs + if(count($tbs) !== 0){ + + $params["tbs"] = + implode(",", $tbs); + } + + $html = + $this->get( + $ip, + "https://www.google.com/search", + $params + ); } - if( - $older !== false || - $newer !== false - ){ + $json = $this->parsepage($html, "news", $search, $ip); + $out = [ + "status" => "ok", + "npt" => $json["npt"], + "news" => [] + ]; + + foreach($json["web"] as $item){ + + $description = array_key_first($item["table"]); - $params["sort"] = "review-date:r:" . $older . ":" . $newer; + if($description !== null){ + + $date = $item["table"][$description]; + }else{ + + $date = null; + } + + $out["news"][] = [ + "title" => $item["title"], + "author" => $item["author"], + "description" => $description, + "date" => strtotime($date), + "thumb" => + $item["thumb"]["url"] === null ? + [ + "url" => null, + "ratio" => null + ] : + [ + "url" => $item["thumb"]["url"], + "ratio" => "16:9" + ], + "url" => $item["url"] + ]; } + return $out; + } + + + + private function parsepage($html, $pagetype, $search, $ip){ + /* $handle = fopen("scraper/google.html", "r"); $html = fread($handle, filesize("scraper/google.html")); fclose($handle); + */ $out = [ "status" => "ok", @@ -844,6 +971,156 @@ class google{ $this->parsejavascript($html); + // + // parse accdef's + // + $has_appended_accdef = false; + + preg_match_all( + '/window\.jsl\.dh\(\'(accdef_[0-9]+)\',\'(.*)\'\);/', + $html, + $accdefs_regex + ); + + $accdefs = []; + for($i=0; $ifuckhtml + ->parseJsString( + $accdefs_regex[2][$i] + ); + + $this->fuckhtml->load($answer); + + // get description + $description = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "padding" => "12px 16px 12px", + ], + self::is_class + ), + "div" + )[1]; + + // get date (rare) + $date = + $this->fuckhtml + ->getElementsByTagName("sub"); + + if(count($date) !== 0){ + + $description = + str_replace( + $date[0]["outerHTML"], + "", + $description["innerHTML"] + ); + + $date = + strtotime( + $this->fuckhtml + ->getTextContent( + $date[0] + ) + ); + }else{ + + $date = null; + } + + // get information table + $table = []; + + $tbody = + $this->fuckhtml + ->getElementsByTagName("tbody"); + + if(count($tbody) !== 0){ + + $this->fuckhtml->load($tbody[0]); + + $trs = + $this->fuckhtml + ->getElementsByTagName("tr"); + + foreach($trs as $tr){ + + $this->fuckhtml->load($tr); + + $tds = + $this->fuckhtml + ->getElementsByTagName("td"); + + if(count($tds) === 2){ + + $table[ + $this->fuckhtml + ->getTextContent( + $tds[0] + ) + ] = + $this->fuckhtml + ->getTextContent( + $tds[1] + ); + } + } + + // load back what we had + $this->fuckhtml->load($answer); + } + + // get title & link + $a = + $this->fuckhtml + ->getElementsByTagName("a")[0]; + + $this->fuckhtml->load($a); + + $title = + $this->fuckhtml + ->getElementsByTagName("span"); + + if(count($title) === 0){ + + continue; + } + + $accdefs[] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0] + ) + ), + "description" => + $this->fuckhtml + ->getTextContent( + $description + ), + "url" => + $this->unshiturl( + $a["attributes"]["href"] + ), + "date" => $date, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => $table + ]; + } + + $this->fuckhtml->load($html); + $containers = $this->fuckhtml ->getElementsByClassName( @@ -863,6 +1140,94 @@ class google{ $this->fuckhtml->load($container); + // detect spelling + $spelling = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "font-size" => "20px", + "line-height" => "26px", + "padding-top" => "2px", + "margin-bottom" => "1px" + ], + self::is_class + ), + "div" + ); + + if(count($spelling) !== 0){ + + $a = + $this->fuckhtml + ->getElementsByTagName("a"); + + if(count($a) !== 0){ + + $scripts = + $this->fuckhtml + ->getElementsByTagName("script"); + + foreach($scripts as $script){ + + $container["innerHTML"] = + str_replace( + $script["outerHTML"], + "", + $container["innerHTML"] + ); + } + + $container["innerHTML"] = + $this->fuckhtml + ->getTextContent( + str_replace( + $a[0]["outerHTML"], + "", + $container["innerHTML"] + ) + ); + + if( + preg_match( + '/^did you mean/i', + $container["innerHTML"] + ) + ){ + + $out["spelling"] = [ + "type" => "not_many", + "using" => $search, + "correction" => + $this->fuckhtml + ->getTextContent( + $a[0] + ) + ]; + } + + elseif( + preg_match( + '/^showing results for/i', + $container["innerHTML"] + ) + ){ + + $out["spelling"] = [ + "type" => "including", + "using" => + $this->fuckhtml + ->getTextContent( + $a[0] + ), + "correction" => $search + ]; + } + } + + continue; + } + $title = $this->fuckhtml ->getElementsByClassName( @@ -891,14 +1256,7 @@ class google{ ) ), "description" => null, - "url" => - $this->decodeurl( - $this->fuckhtml - ->getElementsByTagName("a") - [0] - ["attributes"] - ["href"] - ), + "url" => null, "date" => null, "type" => "web", "thumb" => [ @@ -909,33 +1267,235 @@ class google{ "table" => [] ]; + // get link + $web["url"] = + $this->unshiturl( + $this->fuckhtml + ->getElementsByTagName("a") + [0] + ["attributes"] + ["href"] + ); + + // + // check if link contains a carousel + // + $carousels = $this->parsecarousels(); + if(count($carousels) !== 0){ + + $first = true; + foreach($carousels as $carousel_cat){ + + foreach($carousel_cat as $carousel){ + + if($first === true){ + + $first = false; + }elseif($carousel["image"] !== null){ + + $out["image"][] = [ + "title" => $carousel["title"], + "source" => [ + [ + "url" => $carousel["image"], + "width" => null, + "height" => null + ] + ], + "url" => $carousel["url"] + ]; + } + + $web["sublink"][] = [ + "title" => $carousel["title"], + "date" => $carousel["date"], + "description" => $carousel["description"], + "url" => $carousel["url"] + ]; + } + } + + if($carousels[0][0]["image"] !== null){ + $web["thumb"] = [ + "url" => $carousels[0][0]["image"], + "ratio" => "16:9" + ]; + } + + $out["web"][] = $web; + continue; + } + + // + // no carousel entries, parse as normal link + // + $this->fuckhtml->load($container); + + // parse URL + $web["url"] = + $this->unshiturl( + $this->fuckhtml + ->getElementsByTagName("a") + [0] + ["attributes"] + ["href"] + ); + $container = $container["innerHTML"]; - $description_container = + $line_detect = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ - "padding" => "12px 16px 12px" + "height" => "1px", + "background-color" => "#dadce0", + "margin" => "0 16px" ], self::is_class ), "div" - )[1]; + ); + + if(count($line_detect) !== 0){ + + // we found a line, this means we're dealing with a + // "featured snippet" + $featured = true; + + $description_container = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "white-space" => "pre-line", + "word-wrap" => "break-word" + ], + self::is_class + ), + "div" + )[1]; + + // get date node for it + $date = + $this->fuckhtml + ->getElementsByTagName("sub"); + + if(count($date) !== 0){ + $web["date"] = + strtotime( + $this->fuckhtml + ->getTextContent( + $date[0] + ) + ); + } + }else{ + + // we're dealing with a normal link + $featured = false; + + $description_container = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "padding" => "12px 16px 12px" + ], + self::is_class + ), + "div" + )[1]; + } + + // + // Get author if we're parsing news + // + if($pagetype == "news"){ + + $author = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "position" => "absolute", + "width" => "100%", + "top" => "0", + "left" => "0", + "padding-top" => "1px", + "margin-bottom" => "-1px" + ], + self::is_class + ), + "div" + ); + + if(count($author) !== 0){ + + $web["author"] = + $this->fuckhtml + ->getTextContent( + $author[0] + ); + }else{ + + $web["author"] = null; + } + } $description = $description_container["innerHTML"]; - // get sublinks $this->fuckhtml->load($description); + // + // get thumbnail before we call loadhtml again + // + $img = + $this->fuckhtml + ->getElementsByTagName("img"); + + if(count($img) !== 0){ + + $skip = true; + + if( + isset($img[0]["attributes"]["alt"]) && + stripos($img[0]["attributes"]["alt"], "Video for") !== false + ){ + + // is a video thumbnail + $web["thumb"]["ratio"] = "16:9"; + }else{ + + // is a google thumbnail + $web["thumb"]["ratio"] = "1:1"; + } + + $web["thumb"]["url"] = + $this->getimage( + $img[0]["attributes"]["id"] + ); + }else{ + + $skip = false; + } + + // + // get sublinks + // $links = $this->fuckhtml ->getElementsByTagName("a"); - $skip = true; foreach($links as $link){ + if($skip === true){ + + $skip = false; + continue; + } + $description = str_replace( $link["outerHTML"], @@ -943,12 +1503,6 @@ class google{ $description ); - if($skip){ - - $skip = false; - continue; - } - $sublink = [ "title" => null, "description" => null, @@ -957,214 +1511,799 @@ class google{ ]; $sublink["title"] = - $this->fuckhtml - ->getTextContent( - $link + $this->titledots( + $this->fuckhtml + ->getTextContent( + $link + ) ); $sublink["url"] = - $this->decodeurl( + $this->unshiturl( $link ["attributes"] ["href"] ); - $web["sublink"][] = $sublink; + if(parse_url($sublink["url"], PHP_URL_HOST) !== null){ + + $web["sublink"][] = $sublink; + } } - // get thumbnail before we call loadhtml again - $img = + // + // Parse spans in description + // + $this->fuckhtml->load($description); + + if($featured === false){ + + $levels = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "padding-bottom" => "8px" + ], + self::is_class + ), + "div" + ); + + // oh my god yes, fucking great, sometimes there are NO levels + // hahahahahhahahahahahahahahahhahaa + if(count($levels) === 0){ + + $levels = [$description]; + } + + foreach($levels as $level){ + + $this->fuckhtml->load($level); + + $spans = + $this->fuckhtml + ->getElementsByTagName( + "span" + ); + + $is_rating = -1; + + foreach($spans as $span){ + + $innertext = + trim( + $this->fuckhtml + ->getTextContent( + $span + ), + " ·." + ); + + if($innertext == ""){ continue; } + + if( + strtolower($innertext) + == "rating" + ){ + + $is_rating = 0; + + // clean up before we go + $description = + str_replace( + $span["outerHTML"], + "", + $description + ); + continue; + } + + // + // Parse rating object + // + if($is_rating >= 0){ + + // clean up description + $description = + str_replace( + $span["outerHTML"], + "", + $description + ); + + if($span["level"] !== 1){ continue; } + $is_rating++; + + // 10/10 (123) + if($is_rating === 1){ + + $innertext = explode(" ", $innertext, 2); + + $web["table"]["Rating"] = $innertext[0]; + + if(count($innertext) === 2){ + $web["table"]["Hits"] = + trim( + str_replace( + [ + "(", + ")" + ], + "", + $innertext[1] + ) + ); + + if($web["table"]["Hits"] == ""){ + + unset($web["table"]["Hits"]); + } + } + continue; + } + + // US$4.99 + // MYR 50.00 + // $38.34 + // JP¥6,480 + // Reviewed by your mom + if($is_rating === 2){ + + if( + preg_match( + '/^Review by (.+)/', + $innertext, + $match + ) + ){ + + $web["table"]["Author"] = $match[1]; + continue; + } + + $web["table"]["Price"] = $innertext; + continue; + } + + // Android / In stock + if($is_rating === 3){ + + $web["table"]["Support"] = $innertext; + continue; + } + + // ignore the rest + continue; + } + + // + // Parse standalone text + // + + // If we reach this point: + // 1. Ratings have been parsed + // 2. We're parsing a WEB link, not some shitty piece of shit + + // check for date + // if span has no text before it, assume it's a date + $desc_split = + explode( + $span["outerHTML"], + $description, + 2 + ); + + if( + $this->fuckhtml + ->getTextContent( + $desc_split[0] + ) == "" + ){ + + // has no text before + $date = strtotime($innertext); + if($date){ + + $web["date"] = $date; + } + + // cleanup + $description = + str_replace( + $span["outerHTML"], + "", + $description + ); + + continue; + } + + // Ready to parse table + if(count($desc_split) === 2){ + $this->fuckhtml->load($desc_split[1]); + + $web["table"][ + $this->fuckhtml + ->getTextContent( + trim($desc_split[0], ": ") + ) + ] = $innertext; + + // cleanup + $description = + str_replace( + $desc_split[0] . $span["outerHTML"], + "", + $description + ); + } + } + } + } + + $web["description"] = + trim( + $this->fuckhtml + ->getTextContent( + $description + ), + " ·." + ); + + if($web["description"] == ""){ + + $web["description"] = null; + } + + $out["web"][] = $web; + + continue; + } + + // + // Detect wikipedia shit + // + $wiki_title = + $this->fuckhtml + ->getElementsByTagName("h3"); + + if(count($wiki_title) !== 0){ + + $description_after = []; + $description = []; + $table = []; + $sublink = []; + + $as = $this->fuckhtml - ->getElementsByTagName("img"); + ->getElementsByTagName("a"); + + foreach($as as $a){ + + if( + isset($a["attributes"]["href"]) && + parse_url($a["attributes"]["href"], PHP_URL_HOST) == "maps.google.com" + ){ + + // detected maps embed, ignore + continue 2; + } + } + + // get carousels and remove them from container for image grepper + $carousels = $this->parsecarousels($container["innerHTML"]); + $this->fuckhtml->load($container); + + // add images to image tab, if applicable + for($i=0; $i $item["title"], + "source" => [ + [ + "url" => $item["url"], + "width" => $item["image_width"], + "height" => $item["image_height"] + ], + [ + "url" => $item["image"], + "width" => $item["thumb_width"], + "height" => $item["thumb_height"] + ] + ], + "url" => $item["ref"] + ]; + + unset($carousels[$i]); + } + } + } + + $carousels = array_values($carousels); + + // interpret remaining carousels as title + carousel + $titles = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "font-weight" => "700", + "letter-spacing" => "0.75px", + "text-transform" => "uppercase" + ], + self::is_class + ) + ); - if(count($img) !== 0){ + for($i=0; $igetimage( - $img[0]["attributes"]["id"] - ); + $description_after[] = [ + "type" => "title", + "value" => + $this->fuckhtml + ->getTextContent( + $titles[$i] + ) + ]; + + foreach($carousels[$i] as $carousel){ + + $description_after[] = [ + "type" => "link", + "url" => "web?s=" . urlencode($carousel["description"]) . "&scraper=google", + "value" => $carousel["description"] + ]; + + if($carousel["subtext"] !== null){ + + $description_after[] = [ + "type" => "quote", + "value" => $carousel["subtext"] + ]; + } + + $description_after[] = [ + "type" => "image", + "url" => $carousel["image"] + ]; + } } - // get table elements - $this->fuckhtml->load($description); - - $levels = + $categories = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ - "padding-bottom" => "8px" + "padding" => "12px 16px 12px" ], self::is_class - ), - "div" + ) ); - $additional_info = []; - foreach($levels as $level){ + $image = + $this->fuckhtml + ->getElementsByTagName("img"); + + if(count($image) !== 0){ - $this->fuckhtml->load($level); + $image = $this->getimage($image[0]["attributes"]["id"]); + }else{ - $spans = - $this->fuckhtml - ->getElementsByTagName( - "span" - ); + $image = null; + } + + $url = null; + + for($i=0; $ifuckhtml->load($categories[$i]); - foreach($spans as $span){ + if($i === 0){ + // first node. this should be the header with the small + // information snippet + + $url = + $this->fuckhtml + ->getElementsByTagName("a"); + + if(count($url) !== 0){ + + $url = + $this->unshiturl( + $url[0]["attributes"]["href"] + ); + + if(parse_url($url, PHP_URL_HOST) == "encrypted-tbn0.gstatic.com"){ + + $image = $url; + $url = null; + } + }else{ + + $url = null; + } - // clean up description - $description = + $categories[$i]["innerHTML"] = str_replace( - $span["outerHTML"], + $wiki_title[0]["outerHTML"], "", - $description + $categories[$i]["innerHTML"] ); - $innertext = + $subtext = $this->fuckhtml ->getTextContent( - $span + $categories[$i]["innerHTML"] ); - if($innertext == ""){ continue; } + if(strlen($subtext) !== 0){ + + $description[] = [ + "type" => "quote", + "value" => + $this->fuckhtml + ->getTextContent( + $categories[$i]["innerHTML"] + ) + ]; + } - if( - strtolower($innertext) - == "rating" - ){ + // detect audio file + $audio = + $this->fuckhtml + ->getElementsByTagName("audio"); + + if(count($audio) !== 0){ - $is_rating = -1; - continue; + $description[] = [ + "type" => "audio", + "url" => + $this->fuckhtml + ->getTextContent( + $audio[0]["attributes"]["src"] + ) + ]; } + }else{ - // - // Parse rating object - // + // check for separator elements IN THERE + $separators = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "white-space" => "pre-line", + "word-wrap" => "break-word" + ], + self::is_class + ), + "div" + ); - if($is_rating >= -1){ + // detect container type + foreach($separators as $separator){ - if($span["level"] !== 1){ continue; } + $this->fuckhtml->load($separator); - $is_rating++; + // ignore wrong levels + if($separator["level"] !== 2){ + + continue; + } + + // + // Detect word definition + // + $wordwraps = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "padding-bottom" => "12px" + ], + self::is_class + ), + "div" + ); - // 10/10 (123) - if($is_rating === 0){ + if(count($wordwraps) !== 0){ + + foreach($wordwraps as $word){ + + $this->fuckhtml->load($word); + + // detect title + $span = + $this->fuckhtml + ->getElementsByTagName( + "span" + ); + + if( + count($span) === 1 && + $this->fuckhtml + ->getTextContent( + str_replace( + $span[0]["outerHTML"], + "", + $word["innerHTML"] + ) + ) == "" + ){ + + $description[] = [ + "type" => "title", + "value" => + $this->fuckhtml + ->getTextContent( + $span[0] + ) + ]; + continue; + } + + // detect list element + $lists = + $this->fuckhtml + ->getElementsByTagName("ol"); + + if(count($lists) !== 0){ + foreach($lists as $list){ + + $this->fuckhtml->load($list); + + $items = + $this->fuckhtml + ->getElementsByTagName("li"); + + $w = 0; + foreach($items as $item){ + + $w++; + $this->fuckhtml->load($item); + + // get subnodes + $subnodes = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "white-space" => "pre-line", + "word-wrap" => "break-word" + ], + self::is_class + ), + "div" + ); + + foreach($subnodes as $subnode){ + + $this->fuckhtml->load($subnode); + + $spans = + $this->fuckhtml + ->getElementsByTagName("span"); + + if(count($spans) !== 0){ + + // append quote + $description[] = [ + "type" => "quote", + "value" => + $this->fuckhtml + ->getTextContent( + $subnode + ) + ]; + }else{ + + // append text + $description[] = [ + "type" => "text", + "value" => + $w . ". " . + $this->fuckhtml + ->getTextContent( + $subnode + ) + ]; + } + } + } + } + }else{ + + // parse without list + // get subnodes + $subnodes = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "white-space" => "pre-line", + "word-wrap" => "break-word" + ], + self::is_class + ), + "div" + ); + + foreach($subnodes as $subnode){ + + $this->fuckhtml->load($subnode); + + $spans = + $this->fuckhtml + ->getElementsByTagName("span"); + + if(count($spans) !== 0){ + + // append quote + $description[] = [ + "type" => "quote", + "value" => + $this->fuckhtml + ->getTextContent( + $subnode + ) + ]; + }else{ + + // append text + $description[] = [ + "type" => "text", + "value" => + $this->fuckhtml + ->getTextContent( + $subnode + ) + ]; + } + } + } + } + }else{ + + // + // Parse table + // + $spans = + $this->fuckhtml + ->getElementsByTagName("span"); - $innertext = explode(" ", $innertext, 2); + foreach($spans as $span){ + + if(!isset($span["attributes"]["class"])){ + + // found table + $row = + explode( + ":", + $this->fuckhtml + ->getTextContent( + $separator + ), + 2 + ); + + if(count($row) === 2){ + + $table[rtrim($row[0])] = + ltrim($row[1]); + + } + continue 2; + } + } - $web["table"]["Rating"] = $innertext[0]; - $web["table"]["Hits"] = - trim( + // + // Parse normal description + // + $links_rem = + $this->fuckhtml + ->getElementsByTagName("a"); + + foreach($links_rem as $rem){ + + $separator["innerHTML"] = str_replace( - [ - "(", - ")" - ], + $rem["outerHTML"], "", - $innertext[1] + $separator["innerHTML"] + ); + } + + $description[] = [ + "type" => "text", + "value" => + rtrim( + $this->fuckhtml + ->getTextContent( + $separator + ), + " .," ) - ); - continue; + ]; } + } + } - // US$4.99 - // MYR 50.00 - // $38.34 - // JP¥6,480 - if($is_rating === 2){ + // detect huge buttons + $buttons = + $this->fuckhtml + ->getElementsByClassName( + $this->findstyles( + [ + "display" => "table-cell", + "vertical-align" => "middle", + "height" => "52px", + "text-align" => "center" + ], + self::is_class + ), + "a" + ); + + if(count($buttons) !== 0){ - $web["table"]["Price"] = $innertext; - continue; - } + foreach($buttons as $button){ - // Android / In stock - if($is_rating === 4){ + if(isset($button["attributes"]["href"])){ - $web["table"]["Support"] = $innertext; - continue; + $sublink[ + $this->fuckhtml + ->getTextContent( + $button + ) + ] = + $this->unshiturl( + $button["attributes"]["href"] + ); } - - // ignore the rest - continue; } - - // - // Parse standalone text - // - $additional_info[] = $innertext; } } - for($i=0; $ifuckhtml->load($description); - - // get date node - $span = - $this->fuckhtml - ->getElementsByTagName( - "span" - ); - - if(count($span) !== 0){ - - $description = - str_replace( - $span[0]["outerHTML"], - "", - $description - ); - - $span = - strtotime( - $this->fuckhtml - ->getTextContent( - $span[0] - ) - ); - - if($span){ - - $web["date"] = $span; - } - } + // append description_after (contains carousel info) + $description = array_merge( + $description, + $description_after + ); - $web["description"] = - trim( + $out["answer"][] = [ + "title" => $this->fuckhtml ->getTextContent( - $description + $wiki_title[0] ), - " ·." - ); - - $out["web"][] = $web; + "description" => $description, + "url" => $url, + "thumb" => $image, + "table" => $table, + "sublink" => $sublink + ]; continue; } - // check for container title header + // + // Detect related searches containers + // $container_title = $this->fuckhtml ->getElementsByClassName( @@ -1183,6 +2322,21 @@ class google{ if(count($container_title) !== 0){ + // get carousel entries + $carousels = $this->parsecarousels($container["innerHTML"]); + $this->fuckhtml->load($container); + + foreach($carousels as $carousel){ + + foreach($carousel as $item){ + + if($item["url"] !== null){ + + $out["related"][] = $item["url"]; + } + } + } + $container_title = strtolower( $this->fuckhtml @@ -1191,158 +2345,300 @@ class google{ ) ); - if( - $container_title == "related searches" || - $container_title == "people also search for" - ){ + switch($container_title){ - // - // Parse related searches - // - $as = - $this->fuckhtml - ->getElementsByTagName("a"); - - foreach($as as $a){ + case "related searches": + case "people also search for": + // + // Parse related searches + // + $as = + $this->fuckhtml + ->getElementsByTagName("a"); - $out["related"][] = + foreach($as as $a){ + + $out["related"][] = + $this->fuckhtml + ->getTextContent($a); + } + break; + + case "people also ask": + // get related queries + $divs = $this->fuckhtml - ->getTextContent($a); - } + ->getElementsByTagName("div"); + + foreach($divs as $div){ + + // add accdef's here + if($has_appended_accdef === false){ + + $out["web"] = array_merge($out["web"], $accdefs); + $has_appended_accdef = true; + } + + // add accdef's questions + if(isset($div["attributes"]["role"])){ + + $out["related"][] = + $this->fuckhtml + ->getTextContent($div); + + continue; + } + } + break; } continue; } // - // Parse image carousel + // Parse news // - $title_container = + $title = $this->fuckhtml ->getElementsByClassName( $this->findstyles( [ - "padding" => "12px 16px 12px" + "font-size" => "16px", + "line-height" => "20px", + "font-weight" => "400" ], self::is_class ), "div" ); - if(count($title_container) !== 0){ + if(count($title) !== 0){ + + $carousels = $this->parsecarousels(); + $this->fuckhtml->load($container); - $title_container = + if(count($carousels) === 0){ + + // no carousels found + continue; + } + + $title = strtolower( $this->fuckhtml ->getTextContent( - $title_container[0] + $title[0] ) ); - if($title_container == "imagesview all"){ - - // - // Image carousel - // - $pcitem = - $this->fuckhtml - ->getElementsByClassName( - "pcitem", - "div" - ); + if( + preg_match( + '/^latest from|^top stories/', + $title + ) + ){ - foreach($pcitem as $item){ - - $this->fuckhtml->load($item); + // Found news article + foreach($carousels[0] as $carousel){ - $link = - $this->fuckhtml - ->getElementsByTagName( - "a" - )[0]; - - parse_str( - parse_url( - $this->fuckhtml - ->getTextContent( - $link - ["attributes"] - ["href"] - ), - PHP_URL_QUERY - ), - $link - ); - - if(isset($link["tbm"])){ + if($carousel["image"] !== null){ - continue; + $thumb = [ + "url" => $carousel["image"], + "ratio" => "16:9" + ]; + }else{ + + $thumb = [ + "url" => null, + "ratio" => null + ]; } - $image = - $this->fuckhtml - ->getElementsByTagName("img")[0]; - - $title = - $this->fuckhtml - ->getTextContent( - $image - ["attributes"] - ["alt"] - ); - - $image = - $this->getimage( - $image - ["attributes"] - ["id"] - ); + $out["news"][] = [ + "title" => $carousel["title"], + "description" => $carousel["description"], + "date" => $carousel["date"], + "thumb" => $thumb, + "url" => $carousel["url"] + ]; + } + } + + elseif( + $title == "images" + ){ + + foreach($carousels as $carousel){ - $out["image"][] = [ - "title" => $title, - "source" => [ - [ - "url" => $link["imgurl"], - "width" => (int)$link["w"], - "height" => (int)$link["h"] + foreach($carousel as $item){ + + $out["image"][] = [ + "title" => $item["title"], + "source" => [ + [ + "url" => $item["url"], + "width" => $item["image_width"], + "height" => $item["image_height"] + ], + [ + "url" => $item["image"], + "width" => $item["thumb_width"], + "height" => $item["thumb_height"] + ] ], - [ - "url" => $image, - "width" => (int)$link["tbnw"], - "height" => (int)$link["tbnh"] - ] - ], - "url" => $link["imgrefurl"] - ]; + "url" => $item["ref"] + ]; + } } } + + continue; + } + + // + // Detect nodes with only text + links + // + + // ignore elements with