From 2913c58cecc26b699ec44c19f105f036f3dcb6a6 Mon Sep 17 00:00:00 2001
From: lolcat
Date: Thu, 10 Aug 2023 22:54:37 -0400
Subject: added faceberg videos

---
 scraper/brave.php    |  20 +-
 scraper/facebook.php | 809 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 816 insertions(+), 13 deletions(-)
 create mode 100644 scraper/facebook.php

(limited to 'scraper')

diff --git a/scraper/brave.php b/scraper/brave.php
index c598c80..bcec59e 100644
--- a/scraper/brave.php
+++ b/scraper/brave.php
@@ -1,12 +1,4 @@
 bypasscaptcha($html, "yes", "ca");*/
 
 class brave{
@@ -154,6 +146,11 @@ class brave{
             case "no": $nsfw = "strict"; break;
         }
 
+        if($country == "any"){
+
+            $country = "all";
+        }
+
         $headers = [
             "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0",
             "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
@@ -169,11 +166,6 @@ class brave{
             "Sec-Fetch-User: ?1"
         ];
 
-        if($country == "any"){
-
-            $country = "all";
-        }
-
         $curlproc = curl_init();
 
         if($get !== []){
@@ -1990,6 +1982,8 @@ class brave{
             as $result
         ){
 
+            print_r($result);
+
             $out["image"][] = [
                 "title" => $result["title"],
                 "source" => [
diff --git a/scraper/facebook.php b/scraper/facebook.php
new file mode 100644
index 0000000..ebb5466
--- /dev/null
+++ b/scraper/facebook.php
@@ -0,0 +1,809 @@
+nextpage = new nextpage("fb");
+    }
+
+    /**
+     * List the search filters this scraper exposes.
+     *
+     * @param mixed $page Not used; the same filters are returned for any value.
+     * @return array Filter definitions (display label and options) keyed by filter name.
+     */
+    public function getfilters($page){
+
+        return [
+            "sort" => [
+                "display" => "Sort by",
+                "option" => [
+                    "relevance" => "Relevance",
+                    "most_recent" => "Most recent"
+                ]
+            ],
+            "newer" => [
+                "display" => "Newer than",
+                "option" => "_DATE"
+            ],
+            "older" => [
+                "display" => "Older than",
+                "option" => "_DATE"
+            ],
+            "live" => [
+                "display" => "Livestream",
+                "option" => [
+                    "no" => "No",
+                    "yes" => "Yes"
+                ]
+            ]
+        ];
+    }
+
+    /**
+     * Perform an HTTP request and return the response body.
+     *
+     * @param string $url     Target URL.
+     * @param array  $get     Request parameters. Appended as a query string when
+     *                        $reqtype is self::get, sent as a form-encoded body otherwise.
+     * @param mixed  $reqtype self::get (default) or self::post.
+     * @return string Response body.
+     * @throws Exception When cURL reports a transfer error.
+     */
+    private function get($url, $get = [], $reqtype = self::get){
+
+        $curlproc = curl_init();
+
+        // Page-load headers. Defined before the parameter check so that
+        // $headers is always set, including when $get is empty.
+        $headers = [
+            "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0",
+            "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+            "Accept-Language: en-US,en;q=0.5",
+            "Accept-Encoding: gzip",
+            "DNT: 1",
+            "Connection: keep-alive",
+            "Upgrade-Insecure-Requests: 1",
+            "Sec-Fetch-Dest: document",
+            "Sec-Fetch-Mode: navigate",
+            "Sec-Fetch-Site: none",
+            "Sec-Fetch-User: ?1"
+        ];
+
+        if($get !== []){
+
+            $get = http_build_query($get);
+
+            if($reqtype === self::get){
+
+                $url .= "?" . $get;
+            }else{
+
+                curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+
+                // headers sent with the graphql form POST
+                $headers = [
+                    "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0",
+                    "Accept: */*",
+                    "Accept-Language: en-US,en;q=0.5",
+                    "Accept-Encoding: gzip, deflate, br",
+                    "Content-Type: application/x-www-form-urlencoded",
+                    "X-FB-Friendly-Name: SearchCometResultsPaginatedResultsQuery",
+                    //"X-FB-LSD: AVptQC4a16c",
+                    //"X-ASBD-ID: 129477",
+                    "Content-Length: " . strlen($get),
+                    "Origin: https://www.facebook.com",
+                    "DNT: 1",
+                    "Connection: keep-alive",
+                    "Referer: https://www.facebook.com/watch/",
+                    "Cookie: datr=__GMZCgwVF5BbyvAtfJojQwg; oo=v1%7C3%3A1691641171; wd=955x995",
+                    "Sec-Fetch-Dest: empty",
+                    "Sec-Fetch-Mode: cors",
+                    "Sec-Fetch-Site: same-origin",
+                    "TE: trailers"
+                ];
+
+                curl_setopt($curlproc, CURLOPT_POST, true);
+                curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get);
+            }
+        }
+
+        curl_setopt($curlproc, CURLOPT_URL, $url);
+
+        curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
+        curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
+
+        curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
+        curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
+        curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
+        curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
+        curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+
+        $data = curl_exec($curlproc);
+
+        if(curl_errno($curlproc)){
+
+            // read the message before releasing the handle
+            $error = curl_error($curlproc);
+            curl_close($curlproc);
+
+            throw new Exception($error);
+        }
+
+        curl_close($curlproc);
+        return $data;
+    }
+
+    /**
+     * Search Facebook Watch and collect the results into $this->out.
+     *
+     * Reads "s" (search text), "npt" (next page token), "sort", "live",
+     * "older" and "newer" from $get. "older"/"newer" are either false or a
+     * value that is handed to date() as a timestamp.
+     *
+     * @param array $get Request parameters described above.
+     * @return array Result set with the keys status, npt, video, author,
+     *               livestream, playlist and reel.
+     * @throws Exception When the page cannot be fetched or one of the
+     *                   embedded payloads cannot be located or decoded.
+     */
+    public function video($get){
+
+        $search = $get["s"];
+        $npt = $get["npt"];
+
+        $this->out = [
+            "status" => "ok",
+            "npt" => null,
+            "video" => [],
+            "author" => [],
+            "livestream" => [],
+            "playlist" => [],
+            "reel" => []
+        ];
+
+        if($npt){
+
+            // a stored token holds the complete graphql form of the next request
+            $nextpage =
+                json_decode(
+                    $this->nextpage->get(
+                        $npt,
+                        "videos"
+                    ),
+                    true
+                );
+
+            // parse next page
+            $this->video_nextpage($nextpage);
+
+            return $this->out;
+        }
+
+        // generate filter data
+        // {
+        //     "rp_creation_time:0":"{\"name\":\"creation_time\",\"args\":\"{\\\"start_year\\\":\\\"2023\\\",\\\"start_month\\\":\\\"2023-08\\\",\\\"end_year\\\":\\\"2023\\\",\\\"end_month\\\":\\\"2023-08\\\",\\\"start_day\\\":\\\"2023-08-10\\\",\\\"end_day\\\":\\\"2023-08-10\\\"}\"}",
+        //     "videos_sort_by:0":"{\"name\":\"videos_sort_by\",\"args\":\"Most Recent\"}",
+        //     "videos_live:0":"{\"name\":\"videos_live\",\"args\":\"\"}"
+        // }
+        $filter = [];
+        $sort = $get["sort"];
+        $live = $get["live"];
+        $older = $get["older"];
+        $newer = $get["newer"];
+
+        if(
+            $older !== false ||
+            $newer !== false
+        ){
+
+            // an open-ended range is closed with "now" / the epoch
+            if($older === false){
+
+                $older = time();
+            }
+
+            if($newer === false){
+
+                $newer = 0;
+            }
+
+            $filter["rp_creation_time:0"] =
+                json_encode(
+                    [
+                        "name" => "creation_time",
+                        "args" =>
+                            json_encode(
+                                [
+                                    "start_year" => date("Y", $newer),
+                                    "start_month" => date("Y-m", $newer),
+                                    "end_year" => date("Y", $older),
+                                    "end_month" => date("Y-m", $older),
+                                    "start_day" => date("Y-m-d", $newer),
+                                    "end_day" => date("Y-m-d", $older)
+                                ]
+                            )
+                    ]
+                );
+        }
+
+        if($sort != "relevance"){
+
+            $filter["videos_sort_by:0"] =
+                json_encode(
+                    [
+                        "name" => "videos_sort_by",
+                        "args" => "Most Recent"
+                    ]
+                );
+        }
+
+        if($live != "no"){
+
+            $filter["videos_live:0"] =
+                json_encode(
+                    [
+                        "name" => "videos_live",
+                        "args" => ""
+                    ]
+                );
+        }
+
+        $req = [
+            "q" => $search
+        ];
+
+        if(count($filter) !== 0){
+
+            $req["filters"] =
+                base64_encode(
+                    json_encode(
+                        $filter
+                    )
+                );
+        }
+
+        $html =
+            $this->get(
+                "https://www.facebook.com/watch/search/",
+                $req
+            );
+
+        preg_match_all(
+            '/({"__bbox":.*,"sequence_number":0}})\]\]/',
+            $html,
+            $json
+        );
+
+        // the second matching payload is the one that is decoded
+        if(!isset($json[1][1])){
+
+            throw new Exception("Could not grep JSON body");
+        }
+
+        $json = json_decode($json[1][1], true);
+
+        if(!is_array($json)){
+
+            throw new Exception("Could not decode JSON body");
+        }
+
+        $results = [];
+
+        if(isset($json["__bbox"]["result"]["data"]["serpResponse"]["results"])){
+
+            $results = $json["__bbox"]["result"]["data"]["serpResponse"]["results"];
+        }
+
+        if(
+            isset($results["edges"]) &&
+            is_array($results["edges"])
+        ){
+
+            foreach($results["edges"] as $result){
+
+                $this->parse_edge($result);
+            }
+        }
+
+        if(
+            !isset($results["page_info"]["has_next_page"]) ||
+            $results["page_info"]["has_next_page"] != 1
+        ){
+
+            // no further page
+            return $this->out;
+        }
+
+        // get nextpage data
+        if(
+            preg_match(
+                '/handleWithCustomApplyEach\(ScheduledApplyEach,({.*})\);}\);}\);<\/script>/',
+                $html,
+                $nextpagedata
+            ) !== 1
+        ){
+
+            throw new Exception("Could not grep the next page data");
+        }
+
+        $nextpagedata = json_decode($nextpagedata[1], true);
+
+        if(
+            !is_array($nextpagedata) ||
+            !isset($nextpagedata["require"]) ||
+            !is_array($nextpagedata["require"]) ||
+            !isset($nextpagedata["define"]) ||
+            !is_array($nextpagedata["define"])
+        ){
+
+            throw new Exception("Could not decode the next page data");
+        }
+
+        // [POST] https://www.facebook.com/api/graphql/
+        // FORM data, not JSON!
+
+        $nextpage = [
+            "av" => "0",
+            "__user" => null,
+            "__a" => null,
+            "__req" => "2",
+            "__hs" => null,
+            "dpr" => "1",
+            "__ccg" => null,
+            "__rev" => null,
+            // another client side token
+            "__s" => $this->randomstring(6) . ":" . $this->randomstring(6) . ":" . $this->randomstring(6),
+            "__hsi" => null,
+            // tracking fingerprint (probably generated using webgl)
+            "__dyn" => "7xeUmwlE7ibwKBWo2vwAxu13w8CewSwMwNw9G2S0im3y4o0B-q1ew65xO2O1Vw8G1Qw5Mx61vw9m1YwBgao6C0Mo5W3S7Udo5q4U2zxe2Gew9O222SUbEaU2eU5O0GpovU19pobodEGdw46wbS1LwTwNwLw8O1pwr86C16w",
+            "__csr" => $this->randomstring(null),
+            "__comet_req" => null,
+            "lsd" => null,
+            "jazoest" => null,
+            "__spin_r" => null,
+            "__spin_b" => null,
+            "__spin_t" => null,
+            "fb_api_caller_class" => "RelayModern",
+            "fb_api_req_friendly_name" => "SearchCometResultsPaginatedResultsQuery",
+            "variables" => [ // this is json
+                "UFI2CommentsProvider_commentsKey" => "SearchCometResultsInitialResultsQuery",
+                "allow_streaming" => false,
+                "args" => [
+                    "callsite" => "comet:watch_search",
+                    "config" => [
+                        "exact_match" => false,
+                        "high_confidence_config" => null,
+                        "intercept_config" => null,
+                        "sts_disambiguation" => null,
+                        "watch_config" => null
+                    ],
+                    "context" => [
+                        "bsid" => null,
+                        "tsid" => null
+                    ],
+                    "experience" => [
+                        "encoded_server_defined_params" => null,
+                        "fbid" => null,
+                        "type" => "WATCH_TAB_GLOBAL"
+                    ],
+                    // the encoded filters, in the order they were built
+                    "filters" => array_values($filter),
+                    "text" => $search
+                ],
+                "count" => 5,
+                "cursor" =>
+                    isset($results["page_info"]["end_cursor"]) ?
+                    $results["page_info"]["end_cursor"] :
+                    null,
+                "displayCommentsContextEnableComment" => false,
+                "displayCommentsContextIsAdPreview" => false,
+                "displayCommentsContextIsAggregatedShare" => false,
+                "displayCommentsContextIsStorySet" => false,
+                "displayCommentsFeedbackContext" => null,
+                "feedLocation" => "SEARCH",
+                "feedbackSource" => 23,
+                "fetch_filters" => true,
+                "focusCommentID" => null,
+                "locale" => null,
+                "privacySelectorRenderLocation" => "COMET_STREAM",
+                "renderLocation" => "search_results_page",
+                "scale" => 1,
+                "stream_initial_count" => 0,
+                "useDefaultActor" => false,
+                "__relay_internal__pv__IsWorkUserrelayprovider" => false,
+                "__relay_internal__pv__IsMergQAPollsrelayprovider" => false,
+                "__relay_internal__pv__StoriesArmadilloReplyEnabledrelayprovider" => false,
+                "__relay_internal__pv__StoriesRingrelayprovider" => false
+            ],
+            "server_timestamps" => "true",
+            "doc_id" => "6761275837251607" // is actually dynamic
+        ];
+
+        // get bsid: it is looked up three array levels below each "require" entry
+        foreach($nextpagedata["require"] as $entry){
+
+            foreach($entry as $level1){
+
+                if(!is_array($level1)){
+
+                    continue;
+                }
+
+                foreach($level1 as $level2){
+
+                    if(!is_array($level2)){
+
+                        continue;
+                    }
+
+                    foreach($level2 as $level3){
+
+                        if(isset($level3["variables"]["args"]["context"]["bsid"])){
+
+                            $nextpage["variables"]["args"]["context"]["bsid"] =
+                                $level3["variables"]["args"]["context"]["bsid"];
+                        }
+                    }
+                }
+            }
+        }
+
+        // copy the session values found in the "define" entries into the form
+        foreach($nextpagedata["define"] as $module){
+
+            if(
+                !isset($module[2]) ||
+                !is_array($module[2])
+            ){
+
+                continue;
+            }
+
+            $config = $module[2];
+
+            if(isset($config["haste_session"])){
+
+                $nextpage["__hs"] = $config["haste_session"];
+            }
+
+            if(isset($config["connectionClass"])){
+
+                $nextpage["__ccg"] = $config["connectionClass"];
+            }
+
+            if(isset($config["hsi"])){
+
+                $nextpage["__hsi"] = (string)$config["hsi"];
+            }
+
+            if(!empty($config["token"])){
+
+                $nextpage["lsd"] = $config["token"];
+            }
+
+            if(isset($config["__spin_r"])){
+
+                // __rev carries the same value as __spin_r
+                $nextpage["__spin_r"] = (string)$config["__spin_r"];
+                $nextpage["__rev"] = $nextpage["__spin_r"];
+            }
+
+            if(isset($config["__spin_b"])){
+
+                $nextpage["__spin_b"] = $config["__spin_b"];
+            }
+
+            if(isset($config["__spin_t"])){
+
+                $nextpage["__spin_t"] = (string)$config["__spin_t"];
+            }
+        }
+
+        preg_match(
+            '/{"u":"\\\\\/ajax\\\\\/qm\\\\\/\?__a=([0-9]+)&__user=([0-9]+)&__comet_req=([0-9]+)&jazoest=([0-9]+)"/',
+            $html,
+            $ajaxparams
+        );
+
+        if(count($ajaxparams) !== 5){
+
+            throw new Exception("Could not grep the AJAX parameters");
+        }
+
+        $nextpage["__a"] = $ajaxparams[1];
+        $nextpage["__user"] = $ajaxparams[2];
+        $nextpage["__comet_req"] = $ajaxparams[3];
+        $nextpage["jazoest"] = $ajaxparams[4];
+
+        // "variables" travels as a JSON string inside the form
+        $nextpage["variables"] = json_encode($nextpage["variables"]);
+
+        $this->video_nextpage($nextpage);
+
+        return $this->out;
+    }
+
+    /**
+     * Fetch one graphql result page, append its results to $this->out and,
+     * when another page exists, store the follow-up request as the next
+     * page token in $this->out["npt"].
+     *
+     * @param array $nextpage  Form fields of the graphql request; "variables"
+     *                         must already be a JSON string.
+     * @param bool  $getcursor Not used.
+     * @throws Exception When the request fails or the response is not JSON.
+     */
+    private function video_nextpage($nextpage, $getcursor = false){
+
+        $json =
+            $this->get(
+                "https://www.facebook.com/api/graphql/",
+                $nextpage,
+                self::post
+            );
+
+        $json = json_decode($json, true);
+
+        if($json === null){
+
+            throw new Exception("Failed to decode next page JSON");
+        }
+
+        $results = [];
+
+        if(isset($json["data"]["serpResponse"]["results"])){
+
+            $results = $json["data"]["serpResponse"]["results"];
+        }
+
+        if(
+            isset($results["edges"]) &&
+            is_array($results["edges"])
+        ){
+
+            foreach($results["edges"] as $result){
+
+                $this->parse_edge($result);
+            }
+        }
+
+        if(
+            !isset($results["page_info"]["has_next_page"]) ||
+            $results["page_info"]["has_next_page"] != 1
+        ){
+
+            return;
+        }
+
+        // move the cursor inside the JSON-encoded variables
+        $variables = json_decode($nextpage["variables"], true);
+
+        $variables["cursor"] =
+            isset($results["page_info"]["end_cursor"]) ?
+            $results["page_info"]["end_cursor"] :
+            null;
+
+        $nextpage["variables"] = json_encode($variables);
+
+        //change this for second call. after, it's static.
+        // TODO: csr also updates to longer string
+        $nextpage["__dyn"] = "7xeUmwlEnwn8K2WnFw9-2i5U4e0yoW3q322aew9G2S0zU20xi3y4o0B-q1ew65xOfxO1Vw8G11xmfz81s8hwGwQw9m1YwBgao6C2O0B85W3S7Udo5qfK0EUjwGzE2swwwJK2W2K0zK5o4q0GpovU19pobodEGdw46wbS1LwTwNwLw8O1pwr86C16w";
+
+        // TODO: change this on third and 6th call
+        //$nextpage["__s"] = $this->randomstring(6) . ":" . explode(":", $nextpage["__s"], 2)[1];
+
+        $this->out["npt"] = $this->nextpage->store(json_encode($nextpage), "videos");
+    }
+
+    /**
+     * Convert one search result edge into an entry of $this->out.
+     *
+     * Live broadcasts go to $this->out["livestream"], everything else to
+     * $this->out["video"]. Edges without a video view model are skipped.
+     *
+     * @param array $edge One element of the "edges" list of a result page.
+     */
+    private function parse_edge($edge){
+
+        if(!isset($edge["relay_rendering_strategy"]["view_model"]["video_metadata_model"])){
+
+            // nothing to extract from this edge
+            return;
+        }
+
+        $edge = $edge["relay_rendering_strategy"]["view_model"];
+        $meta = $edge["video_metadata_model"];
+
+        $append = "video";
+        $status =
+            isset($meta["video_broadcast_status"]) ?
+            (string)$meta["video_broadcast_status"] :
+            "";
+        $reltime =
+            isset($meta["relative_time_string"]) ?
+            (string)$meta["relative_time_string"] :
+            "";
+        $durationtext =
+            isset($edge["video_thumbnail_model"]["video_duration_text"]) ?
+            $edge["video_thumbnail_model"]["video_duration_text"] :
+            "";
+
+        if(strtolower($status) == "live"){
+
+            // handle livestream: the relative time string is read as the view count
+            $duration = "_LIVE";
+            $append = "livestream";
+            $timetext = null;
+            $views = (int)$reltime;
+
+        }elseif(stripos($status, "vod") !== false){
+
+            // handle VOD format: the relative time string is read as the view count
+            $timetext = null;
+            $views = (int)$reltime;
+            $duration = $this->hms2int($durationtext);
+
+        }else{
+
+            // handle normal format: "<date> <middle dot> <count> ..."
+            // The separator is written as UTF-8 bytes (U+00B7) so that it
+            // survives a re-encoding of this file.
+            $parts = explode(" \xC2\xB7 ", $reltime, 2);
+
+            if(count($parts) === 2){
+
+                $views = $this->truncatedcount2int($parts[1]);
+            }else{
+
+                $views = null;
+            }
+
+            // strtotime() returns false for text it cannot parse
+            $timetext = strtotime($parts[0]);
+
+            if($timetext === false){
+
+                $timetext = null;
+            }
+
+            $duration = $this->hms2int($durationtext);
+        }
+
+        if(isset($meta["video_owner_profile"]["uri_token"])){
+
+            $profileurl =
+                "https://www.facebook.com/watch/" .
+                $meta["video_owner_profile"]["uri_token"];
+        }else{
+
+            $profileurl =
+                isset($meta["video_owner_profile"]["url"]) ?
+                $meta["video_owner_profile"]["url"] :
+                null;
+        }
+
+        $this->out[$append][] = [
+            "title" =>
+                $this->limitstrlen(
+                    str_replace(
+                        "\n",
+                        " ",
+                        isset($meta["title"]) ? (string)$meta["title"] : ""
+                    ),
+                    100
+                ),
+            // newlines are flattened before truncating, as for the title;
+            // limitstrlen() would otherwise stop at the first newline
+            "description" =>
+                empty($meta["save_description"]) ?
+                null :
+                $this->limitstrlen(
+                    str_replace(
+                        "\n",
+                        " ",
+                        $meta["save_description"]
+                    )
+                ),
+            "author" => [
+                "name" =>
+                    isset($meta["video_owner_profile"]["name"]) ?
+                    $meta["video_owner_profile"]["name"] :
+                    null,
+                "url" => $profileurl,
+                "avatar" => null
+            ],
+            "date" => $timetext,
+            "duration" => $duration,
+            "views" => $views,
+            "thumb" => [
+                "url" =>
+                    isset($edge["video_thumbnail_model"]["thumbnail_image"]["uri"]) ?
+                    $edge["video_thumbnail_model"]["thumbnail_image"]["uri"] :
+                    null,
+                "ratio" => "16:9"
+            ],
+            "url" =>
+                "https://www.facebook.com/watch/?v=" .
+                (
+                    isset($edge["video_click_model"]["click_metadata_model"]["video_id"]) ?
+                    $edge["video_click_model"]["click_metadata_model"]["video_id"] :
+                    ""
+                )
+        ];
+    }
+
+    /**
+     * Build a random token.
+     *
+     * @param int|null $len Number of characters drawn from a-z and 1-9.
+     *                      null draws 141 to 145 characters from
+     *                      a-z, A-Z, 1-9 and "-" instead.
+     * @return string
+     */
+    private function randomstring($len){
+
+        if($len === null){
+
+            $str = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ123456789-";
+            $len = rand(141, 145);
+        }else{
+
+            $str = "abcdefghijklmnopqrstuvwxyz123456789";
+        }
+
+        // highest valid index, derived from the alphabet itself
+        $max = strlen($str) - 1;
+
+        $out = "";
+        for($i=0; $i<$len; $i++){
+
+            $out .= $str[rand(0, $max)];
+        }
+
+        return $out;
+    }
+
+    /**
+     * Truncate text at a word boundary.
+     *
+     * Returns the first line produced by wordwrap(); long words are not
+     * cut, so a single word longer than $len is returned whole.
+     *
+     * @param string $text
+     * @param int    $len Wrap width handed to wordwrap().
+     * @return string
+     */
+    private function limitstrlen($text, $len = 300){
+
+        return explode("\n", wordwrap($text, $len, "\n"))[0];
+    }
+
+    /**
+     * Convert "h:m:s", "m:s" or "s" into seconds.
+     *
+     * @param string $time
+     * @return int
+     */
+    private function hms2int($time){
+
+        $seconds = 0;
+
+        // every field is worth 60 times the field to its right
+        foreach(explode(":", (string)$time, 3) as $part){
+
+            $seconds = ($seconds * 60) + (int)$part;
+        }
+
+        return $seconds;
+    }
+
+    /**
+     * Convert an abbreviated count such as "1.5K views" into an integer.
+     *
+     * Only the first space-separated token is read. A trailing k, m or b
+     * (any case) multiplies by a thousand, a million or a billion.
+     *
+     * @param string $number
+     * @return int 0 when the input is empty or does not start with a number.
+     */
+    private function truncatedcount2int($number){
+
+        $number = explode(" ", trim((string)$number), 2)[0];
+
+        // drop thousands separators ("1,234")
+        $number = str_replace(",", "", $number);
+
+        if($number === ""){
+
+            return 0;
+        }
+
+        switch(strtolower(substr($number, -1))){
+
+            case "k":
+                $multiplier = 1000;
+                break;
+
+            case "m":
+                $multiplier = 1000000;
+                break;
+
+            case "b":
+                $multiplier = 1000000000;
+                break;
+
+            default:
+                $multiplier = 1;
+                break;
+        }
+
+        // the float cast reads the leading numeric part and ignores the
+        // unit suffix, so any number of decimals is handled
+        return (int)round((float)$number * $multiplier);
+    }
+}
-- 
cgit v1.2.3