summary refs log tree commit diff
path: root/scraper
diff options
context:
space:
mode:
author gesang <gesang@itinerariummentis.org> 2024-07-04 01:08:21 +0000
committer gesang <gesang@itinerariummentis.org> 2024-07-04 01:08:21 +0000
commit 241fa10dcd49581e3a5bc8fe3f0a3629d1b7b5bf (patch)
tree 68a6ccd0e2ef81bc262c9f5aa9ae0053703dfb67 /scraper
parent d13e228d72478745635e7a3e766cd16cc42f2e96 (diff)
parent 53d40c6e4e86fdd8ed86d5fb043b6c93d9c7b3ea (diff)
Merge branch 'master' of https://git.lolcat.ca/lolcat/4get
Diffstat (limited to 'scraper')
-rw-r--r-- scraper/google.php 180
-rw-r--r-- scraper/sc.php 110
-rw-r--r-- scraper/yep.php 17
3 files changed, 199 insertions, 108 deletions
diff --git a/scraper/google.php b/scraper/google.php
index 68fc22c..3d99c05 100644
--- a/scraper/google.php
+++ b/scraper/google.php
@@ -704,6 +704,43 @@ class google{
// reset
$this->fuckhtml->load($result_div);
+ }else{
+
+ // get the "Did you mean?" prompt
+ $taw =
+ $this->fuckhtml
+ ->getElementById(
+ "taw"
+ );
+
+ if($taw){
+
+ $this->fuckhtml->load($taw);
+
+ $as =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ if(count($as) !== 0){
+
+ $text =
+ $this->fuckhtml
+ ->getTextContent(
+ $as[0]
+ );
+
+ // @TODO implement did_you_mean
+ $out["spelling"] = [
+ "type" => "including",
+ "using" => $search,
+ "correction" => $text
+ ];
+ }
+ }
+
+ $this->fuckhtml->load($result_div);
}
//
@@ -895,36 +932,10 @@ class google{
// get "Related Searches" and "People also search for"
//
$relateds =
- array_merge(
- $this->fuckhtml
- ->getElementsByClassName(
- $this->getstyle(
- [
- "align-items" => "center",
- "background-color" => "#28292a",
- "border-radius" => "100px",
- "box-sizing" => "border-box",
- "display" => "flex",
- "max-height" => "none",
- "min-height" => "48px",
- "padding-left" => "17px",
- "padding-right" => "17px",
- "position" => "relative"
- ]
- ) . " " .
- $this->getstyle(
- [
- "margin-left" => "8px",
- "margin-right" => "8px"
- ]
- ),
- "a"
- ),
- $this->fuckhtml
- ->getElementsByClassName(
- "wyccme",
- "div"
- )
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "wyccme",
+ "div"
);
foreach($relateds as $related){
@@ -1354,7 +1365,7 @@ class google{
"font-size" => "12px",
"line-height" => "1.34",
"display" => "inline-block",
- "font-family" => "Google Sans,arial,sans-serif",
+ "font-family" => "google sans,arial,sans-serif",
"padding-right" => "0",
"white-space" => "nowrap"
]
@@ -1401,7 +1412,7 @@ class google{
"line-height" => "22px",
"overflow" => "hidden",
"word-break" => "break-word",
- "color" => "#bdc1c6"
+ "color" => "#4d5156"
]
),
"div"
@@ -1415,12 +1426,9 @@ class google{
->getElementsByClassName(
$this->getstyle(
[
- "border-radius" => "10px",
- "font-family" => "arial,sans-serif-medium,sans-serif",
- "font-size" => "12px",
- "line-height" => "16px",
- "padding-block" => "2px",
- "padding-inline" => "8px"
+ "background-color" => "rgba(0,0,0,0.6)",
+ "color" => "#fff",
+ "fill" => "#fff"
]
),
"div"
@@ -1433,14 +1441,6 @@ class google{
->getTextContent(
$duration[0]
);
-
- // remove duration from description
- $description[0]["innerHTML"] =
- str_replace(
- $duration[0]["outerHTML"],
- "",
- $description[0]["innerHTML"]
- );
}
$web["description"] =
@@ -1979,7 +1979,7 @@ class google{
"font-size" => "12px",
"line-height" => "1.34",
"display" => "inline-block",
- "font-family" => "Google Sans,arial,sans-serif",
+ "font-family" => "google sans,arial,sans-serif",
"padding-right" => "0",
"white-space" => "nowrap"
]
@@ -2211,7 +2211,7 @@ class google{
->getElementsByClassName(
$this->getstyle(
[
- "font-family" => "Google Sans,arial,sans-serif",
+ "font-family" => "google sans,arial,sans-serif",
"font-size" => "28px",
"line-height" => "36px"
]
@@ -2801,7 +2801,22 @@ class google{
}
}
- // get thumbnail
+ // get heading element
+ $heading =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "role",
+ "heading",
+ "div"
+ );
+
+ if(count($heading) === 0){
+
+ // no heading, fuck this.
+ continue;
+ }
+
+ // get thumbnail before loading heading object
$image =
$this->fuckhtml
->getElementsByAttributeName(
@@ -2823,35 +2838,6 @@ class google{
];
}
- // get title
- $title =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->getstyle(
- [
- "font-family" => "arial,sans-serif",
- "font-size" => "16px",
- "font-weight" => "400",
- "line-height" => "24px"
- ]
- ),
- "div"
- );
-
- if(count($title) === 0){
-
- // ?? no title
- continue;
- }
-
- $title =
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $title[0]
- )
- );
-
// get duration
$duration_div =
$this->fuckhtml
@@ -2908,6 +2894,38 @@ class google{
}
}
+ // load heading
+ $this->fuckhtml->load($heading[0]);
+
+ // get title
+ $title =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "font-family" => "arial,sans-serif",
+ "font-size" => "16px",
+ "font-weight" => "400",
+ "line-height" => "24px"
+ ]
+ ),
+ "div"
+ );
+
+ if(count($title) === 0){
+
+ // ?? no title
+ continue;
+ }
+
+ $title =
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $title[0]
+ )
+ );
+
// get date
$date_div =
$this->fuckhtml
@@ -3940,7 +3958,7 @@ class google{
for($k=0; $k<count($values_regex[1]); $k++){
$values[trim($values_regex[1][$k])] =
- trim($values_regex[2][$k]);
+ strtolower(trim($values_regex[2][$k]));
}
$names = explode(",", $matches[1][$i]);
@@ -3971,7 +3989,7 @@ class google{
foreach($this->styles[":root"] as $key => $value){
- $this->css_colors[$value] = $key;
+ $this->css_colors[$value] = strtolower($key);
}
}
}
@@ -4206,7 +4224,7 @@ class google{
throw new Exception("Failed to get HTML");
}
- //$html = file_get_contents("scraper/google-video.html");
+ //$html = file_get_contents("scraper/google.html");
$response = $this->parsepage($html, "videos", $search, $proxy, $params);
$out = [
diff --git a/scraper/sc.php b/scraper/sc.php
index 86ea979..7083c42 100644
--- a/scraper/sc.php
+++ b/scraper/sc.php
@@ -6,6 +6,9 @@ class sc{
include "lib/backend.php";
$this->backend = new backend("sc");
+
+ include "lib/fuckhtml.php";
+ $this->fuckhtml = new fuckhtml();
}
public function getfilters($page){
@@ -25,7 +28,7 @@ class sc{
];
}
- private function get($proxy, $url, $get = []){
+ private function get($proxy, $url, $get = [], $web_req = false){
$curlproc = curl_init();
@@ -37,19 +40,43 @@ class sc{
curl_setopt($curlproc, CURLOPT_URL, $url);
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
- curl_setopt($curlproc, CURLOPT_HTTPHEADER,
- ["User-Agent: " . config::USER_AGENT,
- "Accept: application/json, text/javascript, */*; q=0.01",
- "Accept-Language: en-US,en;q=0.5",
- "Accept-Encoding: gzip",
- "Referer: https://soundcloud.com/",
- "Origin: https://soundcloud.com",
- "DNT: 1",
- "Connection: keep-alive",
- "Sec-Fetch-Dest: empty",
- "Sec-Fetch-Mode: cors",
- "Sec-Fetch-Site: same-site"]
- );
+
+ // use http2
+ curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+
+ if($web_req === false){
+
+ curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+ ["User-Agent: " . config::USER_AGENT,
+ "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+ "Accept-Language: en-US,en;q=0.5",
+ "Accept-Encoding: gzip",
+ "Referer: https://soundcloud.com/",
+ "Origin: https://soundcloud.com",
+ "DNT: 1",
+ "Connection: keep-alive",
+ "Sec-Fetch-Dest: empty",
+ "Sec-Fetch-Mode: cors",
+ "Sec-Fetch-Site: same-site",
+ "Priority: u=1"]
+ );
+ }else{
+
+ curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+ ["User-Agent: " . config::USER_AGENT,
+ "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+ "Accept-Language: en-US,en;q=0.5",
+ "Accept-Encoding: gzip",
+ "DNT: 1",
+ "Connection: keep-alive",
+ "Upgrade-Insecure-Requests: 1",
+ "Sec-Fetch-Dest: document",
+ "Sec-Fetch-Mode: navigate",
+ "Sec-Fetch-Site: cross-site",
+ "Priority: u=1",
+ "TE: trailers"]
+ );
+ }
curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
@@ -300,9 +327,12 @@ class sc{
$description[] = $song["title"];
}
- if(count($description) != 0){
+ if(count($description) !== 0){
$description = trim($count . " songs. " . implode(", ", $description));
+ }else{
+
+ $description = "";
}
if(
@@ -396,13 +426,48 @@ class sc{
$token = apcu_fetch("sc_token");
- if($token === false){
+ if($token !== false){
+
+ return $token;
+ }
+
+ // search through all javascript components on the main page
+ try{
+ $html =
+ $this->get(
+ $proxy,
+ "https://soundcloud.com",
+ [],
+ true
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch front page");
+ }
+
+ $this->fuckhtml->load($html);
+
+ $scripts =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "script"
+ );
+
+ foreach($scripts as $script){
+
+ if(
+ !isset($script["attributes"]["src"]) ||
+ strpos($script["attributes"]["src"], "sndcdn.com") === false
+ ){
+
+ continue;
+ }
try{
$js =
$this->get(
$proxy,
- "https://a-v2.sndcdn.com/assets/0-a901c1e0.js",
+ $script["attributes"]["src"],
[]
);
}catch(Exception $error){
@@ -416,16 +481,15 @@ class sc{
$token
);
- if(!isset($token[1])){
+ if(isset($token[1])){
- throw new Exception("Failed to get search token");
+ apcu_store("sc_token", $token[1]);
+ return $token[1];
+ break;
}
-
- apcu_store("sc_token", $token[1]);
- return $token[1];
}
- return $token;
+ throw new Exception("Did not find a Soundcloud token in the Javascript blobs");
}
private function limitstrlen($text){
diff --git a/scraper/yep.php b/scraper/yep.php
index c8f82f9..bfe347f 100644
--- a/scraper/yep.php
+++ b/scraper/yep.php
@@ -252,21 +252,30 @@ class yep{
curl_setopt($curlproc, CURLOPT_URL, $url);
+ // use http2
+ curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+
+ // set ciphers
+ curl_setopt(
+ $curlproc,
+ CURLOPT_SSL_CIPHER_LIST,
+ "aes_128_gcm_sha_256,chacha20_poly1305_sha_256,aes_256_gcm_sha_384,ecdhe_ecdsa_aes_128_gcm_sha_256,ecdhe_rsa_aes_128_gcm_sha_256,ecdhe_ecdsa_chacha20_poly1305_sha_256,ecdhe_rsa_chacha20_poly1305_sha_256,ecdhe_ecdsa_aes_256_gcm_sha_384,ecdhe_rsa_aes_256_gcm_sha_384,ecdhe_ecdsa_aes_256_sha,ecdhe_ecdsa_aes_128_sha,ecdhe_rsa_aes_128_sha,ecdhe_rsa_aes_256_sha,rsa_aes_128_gcm_sha_256,rsa_aes_256_gcm_sha_384,rsa_aes_128_sha,rsa_aes_256_sha"
+ );
+
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
["User-Agent: " . config::USER_AGENT,
"Accept: */*",
"Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip, deflate, br, zstd",
- "Connection: keep-alive",
- "DNT: 1",
- "Priority: u=1",
- "Origin: https://yep.com",
"Referer: https://yep.com/",
+ "Origin: https://yep.com",
+ "DNT: 1",
"Connection: keep-alive",
"Sec-Fetch-Dest: empty",
"Sec-Fetch-Mode: cors",
"Sec-Fetch-Site: same-site",
+ "Priority: u=4",
"TE: trailers"]
);