summaryrefslogtreecommitdiff
path: root/scraper/mojeek.php
diff options
context:
space:
mode:
authorlolcat <will@lolcat.ca>2023-11-07 08:04:56 -0500
committerlolcat <will@lolcat.ca>2023-11-07 08:04:56 -0500
commit785452873f0ee0a27fc157b482b7551560f0282d (patch)
tree4c70e240031ed3868425ca683c83ebfd378a9159 /scraper/mojeek.php
parent64b090ee058953aed2246967332c7f0b6623cd8f (diff)
fix typo
Diffstat (limited to 'scraper/mojeek.php')
-rw-r--r--scraper/mojeek.php427
1 files changed, 204 insertions, 223 deletions
diff --git a/scraper/mojeek.php b/scraper/mojeek.php
index e7e8abc..3d91c09 100644
--- a/scraper/mojeek.php
+++ b/scraper/mojeek.php
@@ -6,8 +6,8 @@ class mojeek{
include "lib/fuckhtml.php";
$this->fuckhtml = new fuckhtml();
- include "lib/nextpage.php";
- $this->nextpage = new nextpage("mojeek");
+ include "lib/backend.php";
+ $this->backend = new backend("mojeek");
}
public function getfilters($page){
@@ -371,10 +371,10 @@ class mojeek{
}
}
- private function get($url, $get = []){
+ private function get($proxy, $url, $get = []){
$headers = [
- "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0",
+ "User-Agent: " . config::USER_AGENT,
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip",
@@ -404,6 +404,8 @@ class mojeek{
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+
+ $this->backend->assign_proxy($curlproc, $proxy);
$data = curl_exec($curlproc);
@@ -420,11 +422,12 @@ class mojeek{
if($get["npt"]){
- $token = $this->nextpage->get($get["npt"], "web");
+ [$token, $proxy] = $this->backend->get($get["npt"], "web");
try{
$html =
$this->get(
+ $proxy,
"https://www.mojeek.com" . $token,
[]
);
@@ -485,9 +488,12 @@ class mojeek{
$params["si"] = $domain;
}
+ $proxy = $this->backend->get_ip();
+
try{
$html =
$this->get(
+ $proxy,
"https://www.mojeek.com/search",
$params
);
@@ -529,88 +535,90 @@ class mojeek{
return $out;
}
- $this->fuckhtml->load($results[0]);
-
/*
- Get search results
+ Get all search result divs
*/
- $results =
- $this->fuckhtml
- ->getElementsByTagName("li");
-
- foreach($results as $result){
-
- $data = [
- "title" => null,
- "description" => null,
- "url" => null,
- "date" => null,
- "type" => "web",
- "thumb" => [
- "url" => null,
- "ratio" => null
- ],
- "sublink" => [],
- "table" => []
- ];
-
- $this->fuckhtml->load($result);
+ foreach($results as $container){
- $title =
+ $this->fuckhtml->load($container);
+ $results =
$this->fuckhtml
- ->getElementsByClassName("title", "a")[0];
+ ->getElementsByTagName("li");
- $data["title"] =
- html_entity_decode(
+ foreach($results as $result){
+
+ $data = [
+ "title" => null,
+ "description" => null,
+ "url" => null,
+ "date" => null,
+ "type" => "web",
+ "thumb" => [
+ "url" => null,
+ "ratio" => null
+ ],
+ "sublink" => [],
+ "table" => []
+ ];
+
+ $this->fuckhtml->load($result);
+
+ $title =
$this->fuckhtml
- ->getTextContent(
- $title["innerHTML"]
- )
- );
-
- $data["url"] =
- html_entity_decode(
+ ->getElementsByClassName("title", "a")[0];
+
+ $data["title"] =
+ html_entity_decode(
+ $this->fuckhtml
+ ->getTextContent(
+ $title["innerHTML"]
+ )
+ );
+
+ $data["url"] =
+ html_entity_decode(
+ $this->fuckhtml
+ ->getTextContent(
+ $title["attributes"]["href"]
+ )
+ );
+
+ $description =
$this->fuckhtml
- ->getTextContent(
- $title["attributes"]["href"]
- )
- );
-
- $description =
- $this->fuckhtml
- ->getElementsByClassName(
- "s", "p"
- );
-
- if(count($description) !== 0){
+ ->getElementsByClassName(
+ "s", "p"
+ );
- $data["description"] =
- $this->titledots(
- html_entity_decode(
- $this->fuckhtml
- ->getTextContent(
- $description[0]
+ if(count($description) !== 0){
+
+ $data["description"] =
+ $this->titledots(
+ html_entity_decode(
+ $this->fuckhtml
+ ->getTextContent(
+ $description[0]
+ )
)
+ );
+ }
+
+ $data["date"] =
+ explode(
+ " - ",
+ $this->fuckhtml
+ ->getTextContent(
+ $this->fuckhtml
+ ->getElementsByClassName("i", "p")[1]
)
);
+
+ $data["date"] =
+ strtotime(
+ $data["date"][count($data["date"]) - 1]
+ );
+
+ $out["web"][] = $data;
}
-
- $data["date"] =
- explode(
- " - ",
- $this->fuckhtml
- ->getTextContent(
- $this->fuckhtml
- ->getElementsByClassName("i", "p")[1]
- )
- );
-
- $data["date"] =
- strtotime(
- $data["date"][count($data["date"]) - 1]
- );
-
- $out["web"][] = $data;
}
/*
@@ -969,12 +977,13 @@ class mojeek{
if($a["innerHTML"] == "Next"){
- $out["npt"] = $this->nextpage->store(
+ $out["npt"] = $this->backend->store(
$this->fuckhtml
->getTextContent(
$a["attributes"]["href"]
),
- "web"
+ "web",
+ $proxy
);
}
}
@@ -1001,6 +1010,7 @@ class mojeek{
try{
$html =
$this->get(
+ $this->backend->get_ip(),
"https://www.mojeek.com/search",
[
"q" => $search,
@@ -1011,168 +1021,139 @@ class mojeek{
throw new Exception("Failed to get HTML");
}
-
/*
$handle = fopen("scraper/mojeek.html", "r");
$html = fread($handle, filesize("scraper/mojeek.html"));
- fclose($handle);*/
-
- /*
- Get big, standard and smaller nodes
+ fclose($handle);
*/
- foreach(
- [
- "results-extended",
- "results-standard"
- ]
- as $categoryname
- ){
+
+ $this->fuckhtml->load($html);
+
+ $articles =
+ $this->fuckhtml->getElementsByTagName("article");
+
+ foreach($articles as $article){
+
+ $this->fuckhtml->load($article);
+
+ $data = [
+ "title" => null,
+ "author" => null,
+ "description" => null,
+ "date" => null,
+ "thumb" =>
+ [
+ "url" => null,
+ "ratio" => null
+ ],
+ "url" => null
+ ];
+
+ $a = $this->fuckhtml->getElementsByTagName("a")[0];
+
+ $data["title"] =
+ $this->fuckhtml
+ ->getTextContent(
+ $a["attributes"]["title"]
+ );
+
+ $data["url"] =
+ $this->fuckhtml
+ ->getTextContent(
+ $a["attributes"]["href"]
+ );
+
+ $p = $this->fuckhtml->getElementsByTagName("p");
+
+ $data["description"] =
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "s",
+ $p
+ )[0]
+ )
+ );
- $this->fuckhtml->load($html);
+ if($data["description"] == ""){
+
+ $data["description"] = null;
+ }
- $categories =
+ // get date from big node
+ $date =
$this->fuckhtml
->getElementsByClassName(
- $categoryname,
- "ul"
+ "date",
+ $p
);
-
- foreach($categories as $category){
+
+ if(count($date) !== 0){
+
+ $data["date"] =
+ strtotime(
+ $this->fuckhtml
+ ->getTextContent(
+ $date[0]
+ )
+ );
+ }
+
+ // grep date + author
+ $s =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "i",
+ $p
+ )[0];
+
+ $this->fuckhtml->load($s);
+
+ $a =
+ $this->fuckhtml
+ ->getElementsByTagName("a");
+
+ if(count($a) !== 0){
- $this->fuckhtml->load($category);
+ // parse big node information
+ $data["author"] =
+ $this->fuckhtml
+ ->getTextContent(
+ $a[0]["innerHTML"]
+ );
+ }else{
- $nodes =
+ // parse smaller nodes
+ $replace =
$this->fuckhtml
- ->getElementsByTagName("li");
+ ->getElementsByTagName("time")[0];
- foreach($nodes as $node){
-
- $data = [
- "title" => null,
- "author" => null,
- "description" => null,
- "date" => null,
- "thumb" =>
- [
- "url" => null,
- "ratio" => null
- ],
- "url" => null
- ];
-
- /*
- Parse the results
- */
- $this->fuckhtml->load($node);
-
- // get title + url
- $a =
- $this->fuckhtml
- ->getElementsByTagName("a")[0];
-
- $data["title"] =
- $this->fuckhtml
- ->getTextContent(
- $a["attributes"]["title"]
- );
-
- $data["url"] =
+ $data["date"] =
+ strtotime(
$this->fuckhtml
->getTextContent(
- $a["attributes"]["href"]
- );
-
- // get image
- $image =
- $this->fuckhtml
- ->getElementsByTagName("img");
-
- if(count($image) !== 0){
-
- $data["thumb"] = [
- "url" =>
- urldecode(
- str_replace(
- "/image?img=",
- "",
- $this->fuckhtml
- ->getTextContent(
- $image[0]["attributes"]["src"]
- )
- )
- ),
- "ratio" => "16:9"
- ];
- }
-
- // get description
- $description =
- $this->fuckhtml
- ->getElementsByClassName("s", "p");
-
- if(count($description) !== 0){
-
- $data["description"] =
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $description[0]
- )
- );
- }
-
- // get date + time
- $date =
- $this->fuckhtml
- ->getElementsByClassName(
- "date",
- "p"
- );
-
- $i =
- $this->fuckhtml
- ->getElementsByClassName("i", "p");
-
- if(count($date) !== 0){
-
- // we're inside a big node
- $data["date"] = strtotime($date[0]["innerHTML"]);
-
- if(count($i) !== 0){
-
- $this->fuckhtml->load($i[0]);
-
- $a =
- $this->fuckhtml
- ->getElementsByTagName("a");
-
- if(count($a) !== 0){
-
- $data["author"] =
- $this->fuckhtml
- ->getTextContent($a[0]);
- }
- }
- }else{
-
- // we're inside a small node
- if(count($i) !== 0){
-
- $i =
- explode(
- " - ",
- $this->fuckhtml
- ->getTextContent($i[0])
- );
-
- $data["date"] = strtotime(array_pop($i));
- $data["author"] = implode(" - ", $i);
- }
- }
-
- $out["news"][] = $data;
- }
+ $replace
+ )
+ );
+
+ $s["innerHTML"] =
+ str_replace(
+ $replace["outerHTML"],
+ "",
+ $s["innerHTML"]
+ );
+
+ $data["author"] =
+ preg_replace(
+ '/ &bull; $/',
+ "",
+ $s["innerHTML"]
+ );
}
+
+ $out["news"][] = $data;
}
return $out;