summaryrefslogtreecommitdiff
path: root/scraper
diff options
context:
space:
mode:
Diffstat (limited to 'scraper')
-rw-r--r--scraper/brave.php20
-rw-r--r--scraper/facebook.php809
2 files changed, 816 insertions, 13 deletions
diff --git a/scraper/brave.php b/scraper/brave.php
index c598c80..bcec59e 100644
--- a/scraper/brave.php
+++ b/scraper/brave.php
@@ -1,12 +1,4 @@
<?php
-/*
-$brave = new brave();
-
-$handle = fopen("captcha.html", "r");
-$html = fread($handle, filesize("captcha.html"));
-fclose($handle);
-
-$brave->bypasscaptcha($html, "yes", "ca");*/
class brave{
@@ -154,6 +146,11 @@ class brave{
case "no": $nsfw = "strict"; break;
}
+ if($country == "any"){
+
+ $country = "all";
+ }
+
$headers = [
"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0",
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
@@ -169,11 +166,6 @@ class brave{
"Sec-Fetch-User: ?1"
];
- if($country == "any"){
-
- $country = "all";
- }
-
$curlproc = curl_init();
if($get !== []){
@@ -1990,6 +1982,8 @@ class brave{
as $result
){
+ print_r($result);
+
$out["image"][] = [
"title" => $result["title"],
"source" => [
diff --git a/scraper/facebook.php b/scraper/facebook.php
new file mode 100644
index 0000000..ebb5466
--- /dev/null
+++ b/scraper/facebook.php
@@ -0,0 +1,809 @@
+<?php
+
+class facebook{
+
+ const get = 0;
+ const post = 1;
+
+ /**
+  * Next-page token store created in the constructor and used by video()
+  * and video_nextpage() to load and persist continuation requests.
+  */
+ public $nextpage;
+
+ /**
+  * Result accumulator: initialised by video() and appended to by
+  * parse_edge(). Declared explicitly so that assigning to it is not a
+  * dynamic property creation (deprecated since PHP 8.2).
+  */
+ public $out;
+
+ /**
+  * Loads the next-page helper library and creates the token store.
+  *
+  * The store is constructed with the identifier "fb"
+  * (NOTE(review): presumably the scraper key used to namespace tokens;
+  * confirm against lib/nextpage.php).
+  */
+ public function __construct(){
+
+     // include_once instead of include: if lib/nextpage.php has already
+     // been loaded (for example by another scraper, or by a second
+     // instance of this class), a plain include would try to re-declare
+     // the nextpage class and abort with a fatal error.
+     include_once "lib/nextpage.php";
+     $this->nextpage = new nextpage("fb");
+ }
+
+ /**
+  * Describes the filters this scraper accepts.
+  *
+  * Each entry maps a filter key to a "display" label and an "option"
+  * value. "option" is either a map of accepted values to their labels,
+  * or the marker string "_DATE".
+  *
+  * @param mixed $page Not used by this implementation.
+  * @return array Filter definitions keyed by sort, newer, older and live.
+  */
+ public function getfilters($page){
+
+     $date_marker = "_DATE";
+     $filters = [];
+
+     $filters["sort"] = [
+         "display" => "Sort by",
+         "option" => [
+             "relevance" => "Relevance",
+             "most_recent" => "Most recent"
+         ]
+     ];
+
+     // both date bounds share the same option marker
+     $filters["newer"] = ["display" => "Newer than", "option" => $date_marker];
+     $filters["older"] = ["display" => "Older than", "option" => $date_marker];
+
+     $filters["live"] = [
+         "display" => "Livestream",
+         "option" => [
+             "no" => "No",
+             "yes" => "Yes"
+         ]
+     ];
+
+     return $filters;
+ }
+
+ /**
+  * Performs an HTTP request with cURL and returns the response body.
+  *
+  * With $reqtype === self::get the parameters are appended to $url as a
+  * query string and browser-navigation headers are sent. With any other
+  * $reqtype the parameters are sent as a urlencoded POST body over
+  * HTTP/2 with XHR-style headers.
+  *
+  * @param string $url     Target URL.
+  * @param array  $get     Request parameters; an empty array sends none.
+  * @param int    $reqtype self::get or self::post.
+  * @return string Response body (decoded by cURL where it supports the encoding).
+  * @throws Exception When cURL reports a transfer error.
+  */
+ private function get($url, $get = [], $reqtype = self::get){
+
+     $curlproc = curl_init();
+
+     // Navigation headers double as the default set. Previously $headers
+     // was only assigned inside the "$get !== []" branch, so a call
+     // without parameters handed an undefined variable to
+     // CURLOPT_HTTPHEADER.
+     $headers = [
+         "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0",
+         "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+         "Accept-Language: en-US,en;q=0.5",
+         "Accept-Encoding: gzip",
+         "DNT: 1",
+         "Connection: keep-alive",
+         "Upgrade-Insecure-Requests: 1",
+         "Sec-Fetch-Dest: document",
+         "Sec-Fetch-Mode: navigate",
+         "Sec-Fetch-Site: none",
+         "Sec-Fetch-User: ?1"
+     ];
+
+     if($get !== []){
+
+         $get = http_build_query($get);
+
+         if($reqtype === self::get){
+
+             $url .= "?" . $get;
+         }else{
+
+             curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+
+             // NOTE(review): Accept-Encoding advertises brotli; if the local
+             // libcurl is built without brotli support a "br" response
+             // would come back undecoded. Confirm on the deployment target.
+             // NOTE(review): the Cookie value is hard-coded and will
+             // presumably need refreshing at some point.
+             $headers = [
+                 "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0",
+                 "Accept: */*",
+                 "Accept-Language: en-US,en;q=0.5",
+                 "Accept-Encoding: gzip, deflate, br",
+                 "Content-Type: application/x-www-form-urlencoded",
+                 "X-FB-Friendly-Name: SearchCometResultsPaginatedResultsQuery",
+                 //"X-FB-LSD: AVptQC4a16c",
+                 //"X-ASBD-ID: 129477",
+                 "Content-Length: " . strlen($get),
+                 "Origin: https://www.facebook.com",
+                 "DNT: 1",
+                 "Connection: keep-alive",
+                 "Referer: https://www.facebook.com/watch/",
+                 "Cookie: datr=__GMZCgwVF5BbyvAtfJojQwg; oo=v1%7C3%3A1691641171; wd=955x995",
+                 "Sec-Fetch-Dest: empty",
+                 "Sec-Fetch-Mode: cors",
+                 "Sec-Fetch-Site: same-origin",
+                 "TE: trailers"
+             ];
+
+             curl_setopt($curlproc, CURLOPT_POST, true);
+             curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get);
+         }
+     }
+
+     curl_setopt($curlproc, CURLOPT_URL, $url);
+
+     curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
+     curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
+
+     curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
+     curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
+     curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
+     curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
+     curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+
+     $data = curl_exec($curlproc);
+
+     if(curl_errno($curlproc)){
+
+         // read the message and release the handle before throwing, so the
+         // error path does not leave the handle open
+         $error = curl_error($curlproc);
+         curl_close($curlproc);
+
+         throw new Exception($error);
+     }
+
+     curl_close($curlproc);
+     return $data;
+ }
+
+ /**
+  * Searches Facebook Watch for videos.
+  *
+  * When $get["npt"] is set, the stored continuation request is loaded
+  * from the next-page store and replayed through video_nextpage().
+  * Otherwise the search results page is fetched, the embedded JSON is
+  * parsed, and, if Facebook reports a further page, the GraphQL request
+  * for it is assembled from tokens found in the HTML and executed
+  * immediately through video_nextpage().
+  *
+  * @param array $get Request parameters; reads the keys s, npt, sort,
+  *                   live, older and newer.
+  * @return array Result set with the keys status, npt, video, author,
+  *               livestream, playlist and reel.
+  * @throws Exception When the page JSON, the next-page bootstrap data or
+  *                   the AJAX parameters cannot be extracted.
+  */
+ public function video($get){
+
+     $search = $get["s"];
+     $npt = $get["npt"];
+
+     $this->out = [
+         "status" => "ok",
+         "npt" => null,
+         "video" => [],
+         "author" => [],
+         "livestream" => [],
+         "playlist" => [],
+         "reel" => []
+     ];
+
+     if($npt){
+
+         // the stored token is the complete POST form of the next request
+         $nextpage =
+             json_decode(
+                 $this->nextpage->get(
+                     $npt,
+                     "videos"
+                 ),
+                 true
+             );
+
+         if(!is_array($nextpage)){
+
+             throw new Exception("Failed to decode next page token");
+         }
+
+         // parse next page
+         $this->video_nextpage($nextpage);
+
+         return $this->out;
+     }
+
+     $filter = $this->video_filters($get);
+
+     $req = [
+         "q" => $search
+     ];
+
+     if(count($filter) !== 0){
+
+         $req["filters"] =
+             base64_encode(
+                 json_encode(
+                     $filter
+                 )
+             );
+     }
+
+     $html =
+         $this->get(
+             "https://www.facebook.com/watch/search/",
+             $req
+         );
+
+     preg_match_all(
+         '/({"__bbox":.*,"sequence_number":0}})\]\]/',
+         $html,
+         $json
+     );
+
+     // the second match is the one carrying the search results
+     if(!isset($json[1][1])){
+
+         throw new Exception("Could not grep JSON body");
+     }
+
+     $json = json_decode($json[1][1], true);
+
+     // Resolve the results node once. "??" keeps a missing path from
+     // raising warnings; an absent node simply yields no results.
+     $results =
+         $json
+         ["__bbox"]
+         ["result"]
+         ["data"]
+         ["serpResponse"]
+         ["results"]
+         ?? [];
+
+     foreach($results["edges"] ?? [] as $result){
+
+         $this->parse_edge($result);
+     }
+
+     $page_info = $results["page_info"] ?? [];
+
+     if(($page_info["has_next_page"] ?? false) != 1){
+
+         // no further page: nothing to assemble
+         return $this->out;
+     }
+
+     // get nextpage data
+     $matched =
+         preg_match(
+             '/handleWithCustomApplyEach\(ScheduledApplyEach,({.*})\);}\);}\);<\/script>/',
+             $html,
+             $nextpagedata
+         );
+
+     // previously the match result was used unchecked, which fed null
+     // into json_decode() and into the loops below
+     if($matched !== 1){
+
+         throw new Exception("Could not grep next page data");
+     }
+
+     $nextpagedata = json_decode($nextpagedata[1], true);
+
+     if(!is_array($nextpagedata)){
+
+         throw new Exception("Could not decode next page data");
+     }
+
+     // [POST] https://www.facebook.com/api/graphql/
+     // FORM data, not JSON!
+     // Keys left at null are filled in below where a value is found.
+     $nextpage = [
+         "av" => "0",
+         "__user" => null,
+         "__a" => null,
+         "__req" => "2",
+         "__hs" => null,
+         "dpr" => "1",
+         "__ccg" => null,
+         "__rev" => null,
+         // another client side token
+         "__s" => $this->randomstring(6) . ":" . $this->randomstring(6) . ":" . $this->randomstring(6),
+         "__hsi" => null,
+         // tracking fingerprint (probably generated using webgl)
+         "__dyn" => "7xeUmwlE7ibwKBWo2vwAxu13w8CewSwMwNw9G2S0im3y4o0B-q1ew65xO2O1Vw8G1Qw5Mx61vw9m1YwBgao6C0Mo5W3S7Udo5q4U2zxe2Gew9O222SUbEaU2eU5O0GpovU19pobodEGdw46wbS1LwTwNwLw8O1pwr86C16w",
+         "__csr" => $this->randomstring(null),
+         "__comet_req" => null,
+         "lsd" => null,
+         "jazoest" => null,
+         "__spin_r" => null,
+         "__spin_b" => null,
+         "__spin_t" => null,
+         "fb_api_caller_class" => "RelayModern",
+         "fb_api_req_friendly_name" => "SearchCometResultsPaginatedResultsQuery",
+         "variables" => [ // this is json
+             "UFI2CommentsProvider_commentsKey" => "SearchCometResultsInitialResultsQuery",
+             "allow_streaming" => false,
+             "args" => [
+                 "callsite" => "comet:watch_search",
+                 "config" => [
+                     "exact_match" => false,
+                     "high_confidence_config" => null,
+                     "intercept_config" => null,
+                     "sts_disambiguation" => null,
+                     "watch_config" => null
+                 ],
+                 "context" => [
+                     "bsid" => $this->video_bsid($nextpagedata),
+                     "tsid" => null
+                 ],
+                 "experience" => [
+                     "encoded_server_defined_params" => null,
+                     "fbid" => null,
+                     "type" => "WATCH_TAB_GLOBAL"
+                 ],
+                 // the same encoded filters as the first request, as a list
+                 "filters" => array_values($filter),
+                 "text" => $search
+             ],
+             "count" => 5,
+             "cursor" => $page_info["end_cursor"] ?? null,
+             "displayCommentsContextEnableComment" => false,
+             "displayCommentsContextIsAdPreview" => false,
+             "displayCommentsContextIsAggregatedShare" => false,
+             "displayCommentsContextIsStorySet" => false,
+             "displayCommentsFeedbackContext" => null,
+             "feedLocation" => "SEARCH",
+             "feedbackSource" => 23,
+             "fetch_filters" => true,
+             "focusCommentID" => null,
+             "locale" => null,
+             "privacySelectorRenderLocation" => "COMET_STREAM",
+             "renderLocation" => "search_results_page",
+             "scale" => 1,
+             "stream_initial_count" => 0,
+             "useDefaultActor" => false,
+             "__relay_internal__pv__IsWorkUserrelayprovider" => false,
+             "__relay_internal__pv__IsMergQAPollsrelayprovider" => false,
+             "__relay_internal__pv__StoriesArmadilloReplyEnabledrelayprovider" => false,
+             "__relay_internal__pv__StoriesRingrelayprovider" => false
+         ],
+         "server_timestamps" => "true",
+         "doc_id" => "6761275837251607" // is actually dynamic
+     ];
+
+     // copy session tokens out of the "define" entries
+     foreach($nextpagedata["define"] ?? [] as $key){
+
+         $props = $key[2] ?? null;
+
+         if(!is_array($props)){
+
+             continue;
+         }
+
+         if(isset($props["haste_session"])){
+
+             $nextpage["__hs"] = $props["haste_session"];
+         }
+
+         if(isset($props["connectionClass"])){
+
+             $nextpage["__ccg"] = $props["connectionClass"];
+         }
+
+         if(isset($props["hsi"])){
+
+             $nextpage["__hsi"] = (string)$props["hsi"];
+         }
+
+         if(!empty($props["token"])){
+
+             $nextpage["lsd"] = $props["token"];
+         }
+
+         if(isset($props["__spin_r"])){
+
+             // __rev mirrors __spin_r
+             $nextpage["__spin_r"] = (string)$props["__spin_r"];
+             $nextpage["__rev"] = $nextpage["__spin_r"];
+         }
+
+         if(isset($props["__spin_b"])){
+
+             $nextpage["__spin_b"] = $props["__spin_b"];
+         }
+
+         if(isset($props["__spin_t"])){
+
+             $nextpage["__spin_t"] = (string)$props["__spin_t"];
+         }
+     }
+
+     preg_match(
+         '/{"u":"\\\\\/ajax\\\\\/qm\\\\\/\?__a=([0-9]+)&__user=([0-9]+)&__comet_req=([0-9]+)&jazoest=([0-9]+)"/',
+         $html,
+         $ajaxparams
+     );
+
+     if(count($ajaxparams) !== 5){
+
+         throw new Exception("Could not grep the AJAX parameters");
+     }
+
+     $nextpage["__a"] = $ajaxparams[1];
+     $nextpage["__user"] = $ajaxparams[2];
+     $nextpage["__comet_req"] = $ajaxparams[3];
+     $nextpage["jazoest"] = $ajaxparams[4];
+
+     // "variables" travels as a JSON string inside the form
+     $nextpage["variables"] = json_encode($nextpage["variables"]);
+
+     $this->video_nextpage($nextpage);
+
+     return $this->out;
+ }
+
+ /**
+  * Builds the encoded search filters from the sort, live, older and newer
+  * request parameters.
+  *
+  * Shape of the result (each value is itself a JSON string):
+  * {
+  *   "rp_creation_time:0": "{\"name\":\"creation_time\",\"args\":\"{...}\"}",
+  *   "videos_sort_by:0":   "{\"name\":\"videos_sort_by\",\"args\":\"Most Recent\"}",
+  *   "videos_live:0":      "{\"name\":\"videos_live\",\"args\":\"\"}"
+  * }
+  *
+  * @param array $get Request parameters as passed to video().
+  * @return array Filter name => JSON-encoded filter; empty when no filter applies.
+  */
+ private function video_filters($get){
+
+     $filter = [];
+     $older = $get["older"];
+     $newer = $get["newer"];
+
+     if(
+         $older !== false ||
+         $newer !== false
+     ){
+
+         // an open-ended range is closed with "now" and the epoch
+         if($older === false){
+
+             $older = time();
+         }
+
+         if($newer === false){
+
+             $newer = 0;
+         }
+
+         $filter["rp_creation_time:0"] =
+             json_encode(
+                 [
+                     "name" => "creation_time",
+                     "args" =>
+                         json_encode(
+                             [
+                                 "start_year" => date("Y", $newer),
+                                 "start_month" => date("Y-m", $newer),
+                                 "end_year" => date("Y", $older),
+                                 "end_month" => date("Y-m", $older),
+                                 "start_day" => date("Y-m-d", $newer),
+                                 "end_day" => date("Y-m-d", $older)
+                             ]
+                         )
+                 ]
+             );
+     }
+
+     if($get["sort"] != "relevance"){
+
+         $filter["videos_sort_by:0"] =
+             json_encode(
+                 [
+                     "name" => "videos_sort_by",
+                     "args" => "Most Recent"
+                 ]
+             );
+     }
+
+     if($get["live"] != "no"){
+
+         $filter["videos_live:0"] =
+             json_encode(
+                 [
+                     "name" => "videos_live",
+                     "args" => ""
+                 ]
+             );
+     }
+
+     return $filter;
+ }
+
+ /**
+  * Finds variables.args.context.bsid among the "require" entries of the
+  * page bootstrap data. Entries are inspected at a fixed nesting depth;
+  * when several carry a bsid, the last one wins.
+  *
+  * @param array $nextpagedata Decoded bootstrap data.
+  * @return mixed|null The bsid value, or null when none is present.
+  */
+ private function video_bsid($nextpagedata){
+
+     $bsid = null;
+
+     foreach($nextpagedata["require"] ?? [] as $entry){
+
+         if(!is_array($entry)){
+
+             continue;
+         }
+
+         foreach($entry as $level1){
+
+             if(!is_array($level1)){
+
+                 continue;
+             }
+
+             foreach($level1 as $level2){
+
+                 if(!is_array($level2)){
+
+                     continue;
+                 }
+
+                 foreach($level2 as $candidate){
+
+                     if(
+                         isset(
+                             $candidate
+                             ["variables"]
+                             ["args"]
+                             ["context"]
+                             ["bsid"]
+                         )
+                     ){
+
+                         $bsid =
+                             $candidate
+                             ["variables"]
+                             ["args"]
+                             ["context"]
+                             ["bsid"];
+                     }
+                 }
+             }
+         }
+     }
+
+     return $bsid;
+ }
+
+ /**
+  * Executes a GraphQL search continuation request, appends its results
+  * to $this->out and, if another page exists, stores the follow-up
+  * request and puts its token in $this->out["npt"].
+  *
+  * @param array $nextpage  Form fields of the request; "variables" is a
+  *                         JSON-encoded string.
+  * @param bool  $getcursor Not used by this implementation.
+  * @return void
+  * @throws Exception When the response is not valid JSON.
+  */
+ private function video_nextpage($nextpage, $getcursor = false){
+
+     $json =
+         $this->get(
+             "https://www.facebook.com/api/graphql/",
+             $nextpage,
+             self::post
+         );
+
+     $json = json_decode($json, true);
+
+     if($json === null){
+
+         throw new Exception("Failed to decode next page JSON");
+     }
+
+     // Resolve the results node once. A response without it (for
+     // instance an error document) yields no results instead of raising
+     // warnings on every dereference.
+     $results =
+         $json
+         ["data"]
+         ["serpResponse"]
+         ["results"]
+         ?? [];
+
+     foreach($results["edges"] ?? [] as $result){
+
+         $this->parse_edge($result);
+     }
+
+     $page_info = $results["page_info"] ?? [];
+
+     if(($page_info["has_next_page"] ?? false) != 1){
+
+         // last page: leave $this->out["npt"] untouched
+         return;
+     }
+
+     // "variables" is stored as a JSON string: decode it, move the cursor
+     // forward, and encode it again
+     $variables = json_decode($nextpage["variables"], true);
+     $variables["cursor"] = $page_info["end_cursor"] ?? null;
+     $nextpage["variables"] = json_encode($variables);
+
+     //change this for second call. after, it's static.
+     // TODO: csr also updates to longer string
+     $nextpage["__dyn"] = "7xeUmwlEnwn8K2WnFw9-2i5U4e0yoW3q322aew9G2S0zU20xi3y4o0B-q1ew65xOfxO1Vw8G11xmfz81s8hwGwQw9m1YwBgao6C2O0B85W3S7Udo5qfK0EUjwGzE2swwwJK2W2K0zK5o4q0GpovU19pobodEGdw46wbS1LwTwNwLw8O1pwr86C16w";
+
+     // TODO: change this on third and 6th call
+     //$nextpage["__s"] = $this->randomstring(6) . ":" . explode(":", $nextpage["__s"], 2)[1];
+
+     $this->out["npt"] = $this->nextpage->store(json_encode($nextpage), "videos");
+ }
+
+ /**
+  * Converts one search result edge into a result entry and appends it to
+  * $this->out["livestream"] (live broadcasts) or $this->out["video"]
+  * (everything else).
+  *
+  * @param array $edge A single element of the "edges" list.
+  * @return void
+  */
+ private function parse_edge($edge){
+
+     $view =
+         $edge
+         ["relay_rendering_strategy"]
+         ["view_model"]
+         ?? null;
+
+     $meta = $view["video_metadata_model"] ?? null;
+
+     if(!is_array($meta)){
+
+         // without video metadata there is nothing to report; appending
+         // would only produce an entry made of nulls
+         return;
+     }
+
+     $append = "video";
+     $status = (string)($meta["video_broadcast_status"] ?? "");
+     $timestring = (string)($meta["relative_time_string"] ?? "");
+     $durationtext = $view["video_thumbnail_model"]["video_duration_text"] ?? "";
+
+     if(strtolower($status) == "live"){
+
+         // handle livestream
+         $duration = "_LIVE";
+         $append = "livestream";
+         $timetext = null;
+         $views = (int)$timestring;
+
+     }elseif(stripos($status, "vod") !== false){
+
+         // handle VOD format
+         $timetext = null;
+         $views = (int)$timestring;
+         $duration = $this->hms2int($durationtext);
+
+     }else{
+
+         // handle normal format: "<relative date> · <view count>".
+         // The separator is a middle dot (U+00B7) between two spaces; the
+         // previous literal was a mis-decoded form of it and never matched,
+         // so views were always null and the view count leaked into the
+         // string handed to strtotime().
+         $parts = explode(" · ", $timestring, 2);
+
+         if(count($parts) === 2){
+
+             $views = $this->truncatedcount2int($parts[1]);
+         }else{
+
+             $views = null;
+         }
+
+         // strtotime() returns false for text it cannot parse; report an
+         // unknown date as null, like the other branches do
+         $timetext = strtotime($parts[0]);
+
+         if($timetext === false){
+
+             $timetext = null;
+         }
+
+         $duration = $this->hms2int($durationtext);
+     }
+
+     $owner = $meta["video_owner_profile"] ?? [];
+
+     if(isset($owner["uri_token"])){
+
+         $profileurl =
+             "https://www.facebook.com/watch/" .
+             $owner["uri_token"];
+     }else{
+
+         $profileurl = $owner["url"] ?? null;
+     }
+
+     if(empty($meta["save_description"])){
+
+         $description = null;
+     }else{
+
+         // flatten newlines BEFORE truncating, as done for the title:
+         // limitstrlen() keeps only the text up to the first newline, so
+         // the old order cut the description at its first line break
+         $description =
+             $this->limitstrlen(
+                 str_replace(
+                     "\n",
+                     " ",
+                     $meta["save_description"]
+                 )
+             );
+     }
+
+     $this->out[$append][] = [
+         "title" =>
+             $this->limitstrlen(
+                 str_replace(
+                     "\n",
+                     " ",
+                     (string)($meta["title"] ?? "")
+                 ),
+                 100
+             ),
+         "description" => $description,
+         "author" => [
+             "name" => $owner["name"] ?? null,
+             "url" => $profileurl,
+             "avatar" => null
+         ],
+         "date" => $timetext,
+         "duration" => $duration,
+         "views" => $views,
+         "thumb" =>
+             [
+                 "url" =>
+                     $view
+                     ["video_thumbnail_model"]
+                     ["thumbnail_image"]
+                     ["uri"]
+                     ?? null,
+                 "ratio" => "16:9"
+             ],
+         "url" =>
+             "https://www.facebook.com/watch/?v=" .
+             (
+                 $view
+                 ["video_click_model"]
+                 ["click_metadata_model"]
+                 ["video_id"]
+                 ?? ""
+             )
+     ];
+ }
+
+ /**
+  * Generates a random token string.
+  *
+  * With an integer $len the result has exactly $len characters drawn
+  * from lowercase letters and the digits 1-9. With $len === null the
+  * length is picked at random between 141 and 145 and the alphabet also
+  * includes uppercase letters and "-".
+  *
+  * Uses rand(), so the output is not suitable for anything that needs
+  * cryptographic randomness.
+  *
+  * @param int|null $len Desired length, or null for the long variant.
+  * @return string Random string; "" when $len is 0 or negative.
+  */
+ private function randomstring($len){
+
+     if($len === null){
+
+         $str = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ123456789-";
+         $len = rand(141, 145);
+     }else{
+
+         $str = "abcdefghijklmnopqrstuvwxyz123456789";
+     }
+
+     // derive the highest index from the alphabet itself rather than
+     // keeping a hand-counted constant in sync with each string
+     $max = strlen($str) - 1;
+
+     // start from "" so that a zero-length request returns a string
+     // instead of null
+     $out = "";
+     for($i=0; $i<$len; $i++){
+
+         $out .= $str[rand(0, $max)];
+     }
+
+     return $out;
+ }
+
+ /**
+  * Shortens text to roughly $len characters without splitting a word.
+  *
+  * The text is word-wrapped at $len and only the first resulting line is
+  * kept, so anything after the first line break (whether inserted by the
+  * wrap or already present in $text) is dropped. A single word longer
+  * than $len is not cut.
+  *
+  * @param string $text Text to shorten.
+  * @param int    $len  Wrap width.
+  * @return string The first wrapped line.
+  */
+ private function limitstrlen($text, $len = 300){
+
+     $wrapped = wordwrap($text, $len, "\n");
+     $break = strpos($wrapped, "\n");
+
+     if($break === false){
+
+         // fits on a single line
+         return $wrapped;
+     }
+
+     return substr($wrapped, 0, $break);
+ }
+
+ /**
+  * Converts a "H:MM:SS", "M:SS" or "SS" duration into seconds.
+  *
+  * The input is split on ":" into at most three fields; each field is
+  * cast to int, so non-numeric fields count as 0.
+  *
+  * @param string $time Duration text.
+  * @return int Duration in seconds.
+  */
+ private function hms2int($time){
+
+     $seconds = 0;
+
+     // Base-60 accumulation: every further field shifts the running
+     // total up by one unit (seconds -> minutes -> hours).
+     foreach(explode(":", $time, 3) as $field){
+
+         $seconds = ($seconds * 60) + (int)$field;
+     }
+
+     return $seconds;
+ }
+
+ /**
+  * Converts an abbreviated count such as "1.2K views" into an integer.
+  *
+  * Only the first space-delimited token is read. It may carry thousands
+  * separators (","), a decimal part of any length and one of the
+  * suffixes K, M or B (case-insensitive).
+  *
+  * Changes from the previous implementation: decimals longer than one
+  * digit are scaled correctly ("1.25M" is 1250000, not 3500000), commas
+  * no longer truncate the number, empty input no longer reads an invalid
+  * string offset, and the result is always an int (suffix-less input
+  * used to come back as a float).
+  *
+  * @param string $number Count text, e.g. "1.2K views".
+  * @return int Parsed count; 0 when no number is found.
+  */
+ private function truncatedcount2int($number){
+
+     // keep the leading token only: "1.2K views" -> "1.2K"
+     $token = explode(" ", trim((string)$number), 2)[0];
+     $token = str_replace(",", "", $token);
+
+     if(
+         preg_match(
+             '/^([0-9]+)(?:\.([0-9]+))?([kmb])?/i',
+             $token,
+             $match
+         ) !== 1
+     ){
+
+         return 0;
+     }
+
+     // cap the decimal part so the int cast below cannot overflow
+     $decimal = substr($match[2] ?? "", 0, 9);
+
+     switch(strtolower($match[3] ?? "")){
+
+         case "k":
+             $multiplier = 1000;
+             break;
+
+         case "m":
+             $multiplier = 1000000;
+             break;
+
+         case "b":
+             $multiplier = 1000000000;
+             break;
+
+         default:
+             $multiplier = 1;
+             break;
+     }
+
+     $total = (int)$match[1] * $multiplier;
+
+     if($decimal !== ""){
+
+         // scale the fraction by its own digit count:
+         // ".25" with "M" -> 25 * 1000000 / 100
+         $total +=
+             (int)round(
+                 ((int)$decimal * $multiplier) / pow(10, strlen($decimal))
+             );
+     }
+
+     return $total;
+ }
+}