diff options
author | lolcat <will@lolcat.ca> | 2024-04-27 14:25:39 -0400 |
---|---|---|
committer | lolcat <will@lolcat.ca> | 2024-04-27 14:25:39 -0400 |
commit | f2eb164d40340cf221cb7ad457ab35492da4d308 (patch) | |
tree | d896523316590bd961522a12fe84aca1fc3f8db9 | |
parent | 81dc93802c32aa6f593a12b3f2efbe38dc9e31f7 (diff) |
qwant gibberish check
-rw-r--r-- | scraper/qwant.php | 23 |
1 file changed, 23 insertions, 0 deletions
```diff
diff --git a/scraper/qwant.php b/scraper/qwant.php
index a8b69fe..7f441e5 100644
--- a/scraper/qwant.php
+++ b/scraper/qwant.php
@@ -453,6 +453,8 @@ class qwant{
 switch($item["type"]){ // ignores ads
 
 	case "web":
+
+		$first_iteration = true;
 		foreach($item["items"] as $result){
 
 			if(isset($result["thumbnailUrl"])){
@@ -483,6 +485,25 @@ class qwant{
 				}
 			}
 
+			// detect gibberish results
+			if(
+				$first_iteration &&
+				preg_match(
+					"/^" .
+					preg_quote(
+						$this->trimdots(
+							$result["source"]
+						),
+						"/"
+					) .
+					"/",
+					$result["url"]
+				) !== 1
+			){
+
+				throw new Exception("Qwant returned gibberish results");
+			}
+
 			$out["web"][] = [
 				"title" => $this->trimdots($result["title"]),
 				"description" => $this->trimdots($result["desc"]),
@@ -493,6 +514,8 @@ class qwant{
 				"sublink" => $sublinks,
 				"table" => []
 			];
+
+			$first_iteration = false;
 		}
 		break;
 
```