From e252bf4fce70c56b33294fdd095b0dc08ce8b95f Mon Sep 17 00:00:00 2001 From: lolcat Date: Sat, 29 Jun 2024 12:55:28 -0400 Subject: soundcloud fix, for good this time --- scraper/sc.php | 103 +++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 81 insertions(+), 22 deletions(-) diff --git a/scraper/sc.php b/scraper/sc.php index 86ea979..c435c78 100644 --- a/scraper/sc.php +++ b/scraper/sc.php @@ -6,6 +6,9 @@ class sc{ include "lib/backend.php"; $this->backend = new backend("sc"); + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); } public function getfilters($page){ @@ -25,7 +28,7 @@ class sc{ ]; } - private function get($proxy, $url, $get = []){ + private function get($proxy, $url, $get = [], $web_req = false){ $curlproc = curl_init(); @@ -37,19 +40,42 @@ class sc{ curl_setopt($curlproc, CURLOPT_URL, $url); curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding - curl_setopt($curlproc, CURLOPT_HTTPHEADER, - ["User-Agent: " . config::USER_AGENT, - "Accept: application/json, text/javascript, */*; q=0.01", - "Accept-Language: en-US,en;q=0.5", - "Accept-Encoding: gzip", - "Referer: https://soundcloud.com/", - "Origin: https://soundcloud.com", - "DNT: 1", - "Connection: keep-alive", - "Sec-Fetch-Dest: empty", - "Sec-Fetch-Mode: cors", - "Sec-Fetch-Site: same-site"] - ); + + // use http2 + curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); + + if($web_req === false){ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "Referer: https://soundcloud.com/", + "Origin: https://soundcloud.com", + "DNT: 1", + "Connection: keep-alive", + "Sec-Fetch-Dest: empty", + "Sec-Fetch-Mode: cors", + "Sec-Fetch-Site: same-site"] + ); + }else{ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: cross-site", + "Priority: u=1", + "TE: trailers"] + ); + } curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); @@ -396,13 +422,47 @@ class sc{ $token = apcu_fetch("sc_token"); - if($token === false){ + if($token !== false){ + + return $token; + } + + // search through all javascript components on the main page + try{ + $html = + $this->get( + $proxy, + "https://soundcloud.com", + [] + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch front page"); + } + + $this->fuckhtml->load($html); + + $scripts = + $this->fuckhtml + ->getElementsByTagName( + "script" + ); + + foreach($scripts as $script){ + + if( + !isset($script["attributes"]["src"]) || + strpos($script["attributes"]["src"], "sndcdn.com") === false + ){ + + continue; + } try{ $js = $this->get( $proxy, - "https://a-v2.sndcdn.com/assets/0-a901c1e0.js", + $script["attributes"]["src"], [] ); }catch(Exception $error){ @@ -416,16 +476,15 @@ class sc{ $token ); - if(!isset($token[1])){ + if(isset($token[1])){ - throw new Exception("Failed to get search token"); + apcu_store("sc_token", $token[1]); + return $token[1]; + break; } - - apcu_store("sc_token", $token[1]); - return $token[1]; } - return $token; + throw new Exception("Did not find a Soundcloud token in the Javascript blobs"); } private function limitstrlen($text){ -- cgit v1.2.3 From 8161f8e7b8b236892a2254b8876fe4363b33050b Mon Sep 17 00:00:00 2001 From: lolcat Date: Sat, 29 Jun 2024 17:58:05 -0400 Subject: forgor to set headers --- scraper/sc.php | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scraper/sc.php b/scraper/sc.php index c435c78..011293a 100644 --- a/scraper/sc.php +++ b/scraper/sc.php @@ -57,7 +57,8 @@ class sc{ "Connection: keep-alive", "Sec-Fetch-Dest: empty", "Sec-Fetch-Mode: cors", - "Sec-Fetch-Site: same-site"] + "Sec-Fetch-Site: same-site", + "Priority: u=1"] ); }else{ @@ -433,7 +434,8 @@ class sc{ $this->get( $proxy, "https://soundcloud.com", - [] + [], + true ); }catch(Exception $error){ -- cgit v1.2.3 From 6df9d17ada64663c8bfb80baafc42dc951951218 Mon Sep 17 00:00:00 2001 From: lolcat Date: Sat, 29 Jun 2024 18:03:10 -0400 Subject: fixed soundcloud crash --- scraper/sc.php | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scraper/sc.php b/scraper/sc.php index 011293a..7083c42 100644 --- a/scraper/sc.php +++ b/scraper/sc.php @@ -327,9 +327,12 @@ class sc{ $description[] = $song["title"]; } - if(count($description) != 0){ + if(count($description) !== 0){ $description = trim($count . " songs. " . implode(", ", $description)); + }else{ + + $description = ""; } if( -- cgit v1.2.3 From 7a91eb78396b312f85526fd8ea509de9d1cb4d17 Mon Sep 17 00:00:00 2001 From: lolcat Date: Sat, 29 Jun 2024 19:02:33 -0400 Subject: fixed google formatting changes --- scraper/google.php | 176 +++++++++++++++++++++++++++++------------------------ 1 file changed, 97 insertions(+), 79 deletions(-) diff --git a/scraper/google.php b/scraper/google.php index 68fc22c..b623872 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -703,6 +703,43 @@ class google{ } // reset + $this->fuckhtml->load($result_div); + }else{ + + // get the "Did you mean?" prompt + $taw = + $this->fuckhtml + ->getElementById( + "taw" + ); + + if($taw){ + + $this->fuckhtml->load($taw); + + $as = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($as) !== 0){ + + $text = + $this->fuckhtml + ->getTextContent( + $as[0] + ); + + // @TODO implement did_you_mean + $out["spelling"] = [ + "type" => "including", + "using" => $search, + "correction" => $text + ]; + } + } + $this->fuckhtml->load($result_div); } @@ -895,36 +932,10 @@ class google{ // get "Related Searches" and "People also search for" // $relateds = - array_merge( - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "align-items" => "center", - "background-color" => "#28292a", - "border-radius" => "100px", - "box-sizing" => "border-box", - "display" => "flex", - "max-height" => "none", - "min-height" => "48px", - "padding-left" => "17px", - "padding-right" => "17px", - "position" => "relative" - ] - ) . " " . - $this->getstyle( - [ - "margin-left" => "8px", - "margin-right" => "8px" - ] - ), - "a" - ), - $this->fuckhtml - ->getElementsByClassName( - "wyccme", - "div" - ) + $this->fuckhtml + ->getElementsByClassName( + "wyccme", + "div" ); foreach($relateds as $related){ @@ -1354,7 +1365,7 @@ class google{ "font-size" => "12px", "line-height" => "1.34", "display" => "inline-block", - "font-family" => "Google Sans,arial,sans-serif", + "font-family" => "google sans,arial,sans-serif", "padding-right" => "0", "white-space" => "nowrap" ] @@ -1415,12 +1426,9 @@ class google{ ->getElementsByClassName( $this->getstyle( [ - "border-radius" => "10px", - "font-family" => "arial,sans-serif-medium,sans-serif", - "font-size" => "12px", - "line-height" => "16px", - "padding-block" => "2px", - "padding-inline" => "8px" + "background-color" => "rgba(0,0,0,0.6)", + "color" => "#fff", + "fill" => "#fff" ] ), "div" @@ -1433,14 +1441,6 @@ class google{ ->getTextContent( $duration[0] ); - - // remove duration from description - $description[0]["innerHTML"] = - str_replace( - $duration[0]["outerHTML"], - "", - $description[0]["innerHTML"] - ); } $web["description"] = @@ -1979,7 +1979,7 @@ class google{ "font-size" => "12px", "line-height" => "1.34", "display" => "inline-block", - "font-family" => "Google Sans,arial,sans-serif", + "font-family" => "google sans,arial,sans-serif", "padding-right" => "0", "white-space" => "nowrap" ] @@ -2211,7 +2211,7 @@ class google{ ->getElementsByClassName( $this->getstyle( [ - "font-family" => "Google Sans,arial,sans-serif", + "font-family" => "google sans,arial,sans-serif", "font-size" => "28px", "line-height" => "36px" ] @@ -2801,7 +2801,22 @@ class google{ } } - // get thumbnail + // get heading element + $heading = + $this->fuckhtml + ->getElementsByAttributeValue( + "role", + "heading", + "div" + ); + + if(count($heading) === 0){ + + // no heading, fuck this. + continue; + } + + // get thumbnail before loading heading object $image = $this->fuckhtml ->getElementsByAttributeName( @@ -2823,35 +2838,6 @@ class google{ ]; } - // get title - $title = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "font-family" => "arial,sans-serif", - "font-size" => "16px", - "font-weight" => "400", - "line-height" => "24px" - ] - ), - "div" - ); - - if(count($title) === 0){ - - // ?? no title - continue; - } - - $title = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $title[0] - ) - ); - // get duration $duration_div = $this->fuckhtml @@ -2908,6 +2894,38 @@ class google{ } } + // load heading + $this->fuckhtml->load($heading[0]); + + // get title + $title = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "font-family" => "arial,sans-serif", + "font-size" => "16px", + "font-weight" => "400", + "line-height" => "24px" + ] + ), + "div" + ); + + if(count($title) === 0){ + + // ?? no title + continue; + } + + $title = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0] + ) + ); + // get date $date_div = $this->fuckhtml @@ -3940,7 +3958,7 @@ class google{ for($k=0; $kstyles[":root"] as $key => $value){ - $this->css_colors[$value] = $key; + $this->css_colors[$value] = strtolower($key); } } } -- cgit v1.2.3 From 03ccd75f4b47fb6772199b5e534c7211e5e53bd0 Mon Sep 17 00:00:00 2001 From: lolcat Date: Sat, 29 Jun 2024 20:51:19 -0400 Subject: fixed google description bug --- scraper/google.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scraper/google.php b/scraper/google.php index b623872..3d99c05 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -1412,7 +1412,7 @@ class google{ "line-height" => "22px", "overflow" => "hidden", "word-break" => "break-word", - "color" => "#bdc1c6" + "color" => "#4d5156" ] ), "div" @@ -4224,7 +4224,7 @@ class google{ throw new Exception("Failed to get HTML"); } - //$html = file_get_contents("scraper/google-video.html"); + //$html = file_get_contents("scraper/google.html"); $response = $this->parsepage($html, "videos", $search, $proxy, $params); $out = [ -- cgit v1.2.3 From 9c00182b2eeb20000855396c44aab42227a51af6 Mon Sep 17 00:00:00 2001 From: lolcat Date: Wed, 3 Jul 2024 12:05:54 -0400 Subject: yep fix --- scraper/yep.php | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scraper/yep.php b/scraper/yep.php index c8f82f9..4a5d411 100644 --- a/scraper/yep.php +++ b/scraper/yep.php @@ -252,21 +252,23 @@ class yep{ curl_setopt($curlproc, CURLOPT_URL, $url); + // use http2 + curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, ["User-Agent: " . config::USER_AGENT, "Accept: */*", "Accept-Language: en-US,en;q=0.5", "Accept-Encoding: gzip, deflate, br, zstd", - "Connection: keep-alive", - "DNT: 1", - "Priority: u=1", - "Origin: https://yep.com", "Referer: https://yep.com/", + "Origin: https://yep.com", + "DNT: 1", "Connection: keep-alive", "Sec-Fetch-Dest: empty", "Sec-Fetch-Mode: cors", "Sec-Fetch-Site: same-site", + "Priority: u=4", "TE: trailers"] ); -- cgit v1.2.3 From feb0a6dfc3e8557dd0e87c5b21625baf335dba7b Mon Sep 17 00:00:00 2001 From: lolcat Date: Wed, 3 Jul 2024 12:17:09 -0400 Subject: yep ssl fix --- scraper/yep.php | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/scraper/yep.php b/scraper/yep.php index 4a5d411..3d99181 100644 --- a/scraper/yep.php +++ b/scraper/yep.php @@ -255,6 +255,15 @@ class yep{ // use http2 curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); + // set ciphers + // aes_128_gcm_sha_256,chacha20_poly1305_sha_256,aes_256_gcm_sha_384,ecdhe_ecdsa_aes_128_gcm_sha_256,ecdhe_rsa_aes_128_gcm_sha_256,ecdhe_ecdsa_chacha20_poly1305_sha_256,ecdhe_rsa_chacha20_poly1305_sha_256,ecdhe_ecdsa_aes_256_gcm_sha_384,ecdhe_rsa_aes_256_gcm_sha_384,ecdhe_ecdsa_aes_256_sha,ecdhe_ecdsa_aes_128_sha,ecdhe_rsa_aes_128_sha,ecdhe_rsa_aes_256_sha,rsa_aes_128_gcm_sha_256,rsa_aes_256_gcm_sha_384,rsa_aes_128_sha,rsa_aes_256_sha + /* + curl_setopt( + $curlproc, + CURLOPT_SSL_CIPHER_LIST, + "aes_128_gcm_sha_256,chacha20_poly1305_sha_256,aes_256_gcm_sha_384,ecdhe_ecdsa_aes_128_gcm_sha_256,ecdhe_rsa_aes_128_gcm_sha_256,ecdhe_ecdsa_chacha20_poly1305_sha_256,ecdhe_rsa_chacha20_poly1305_sha_256,ecdhe_ecdsa_aes_256_gcm_sha_384,ecdhe_rsa_aes_256_gcm_sha_384,ecdhe_ecdsa_aes_256_sha,ecdhe_ecdsa_aes_128_sha,ecdhe_rsa_aes_128_sha,ecdhe_rsa_aes_256_sha,rsa_aes_128_gcm_sha_256,rsa_aes_256_gcm_sha_384,rsa_aes_128_sha,rsa_aes_256_sha" + );*/ + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, ["User-Agent: " . config::USER_AGENT, -- cgit v1.2.3 From 53d40c6e4e86fdd8ed86d5fb043b6c93d9c7b3ea Mon Sep 17 00:00:00 2001 From: lolcat Date: Wed, 3 Jul 2024 12:18:01 -0400 Subject: yep ssl fix 2 fucknfwuefhuiew --- scraper/yep.php | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scraper/yep.php b/scraper/yep.php index 3d99181..bfe347f 100644 --- a/scraper/yep.php +++ b/scraper/yep.php @@ -256,13 +256,11 @@ class yep{ curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); // set ciphers - // aes_128_gcm_sha_256,chacha20_poly1305_sha_256,aes_256_gcm_sha_384,ecdhe_ecdsa_aes_128_gcm_sha_256,ecdhe_rsa_aes_128_gcm_sha_256,ecdhe_ecdsa_chacha20_poly1305_sha_256,ecdhe_rsa_chacha20_poly1305_sha_256,ecdhe_ecdsa_aes_256_gcm_sha_384,ecdhe_rsa_aes_256_gcm_sha_384,ecdhe_ecdsa_aes_256_sha,ecdhe_ecdsa_aes_128_sha,ecdhe_rsa_aes_128_sha,ecdhe_rsa_aes_256_sha,rsa_aes_128_gcm_sha_256,rsa_aes_256_gcm_sha_384,rsa_aes_128_sha,rsa_aes_256_sha - /* curl_setopt( $curlproc, CURLOPT_SSL_CIPHER_LIST, "aes_128_gcm_sha_256,chacha20_poly1305_sha_256,aes_256_gcm_sha_384,ecdhe_ecdsa_aes_128_gcm_sha_256,ecdhe_rsa_aes_128_gcm_sha_256,ecdhe_ecdsa_chacha20_poly1305_sha_256,ecdhe_rsa_chacha20_poly1305_sha_256,ecdhe_ecdsa_aes_256_gcm_sha_384,ecdhe_rsa_aes_256_gcm_sha_384,ecdhe_ecdsa_aes_256_sha,ecdhe_ecdsa_aes_128_sha,ecdhe_rsa_aes_128_sha,ecdhe_rsa_aes_256_sha,rsa_aes_128_gcm_sha_256,rsa_aes_256_gcm_sha_384,rsa_aes_128_sha,rsa_aes_256_sha" - );*/ + ); curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, -- cgit v1.2.3