summaryrefslogtreecommitdiff
path: root/lib/fuckhtml.php
diff options
context:
space:
mode:
Diffstat (limited to 'lib/fuckhtml.php')
-rw-r--r--lib/fuckhtml.php361
1 files changed, 361 insertions, 0 deletions
diff --git a/lib/fuckhtml.php b/lib/fuckhtml.php
new file mode 100644
index 0000000..8802511
--- /dev/null
+++ b/lib/fuckhtml.php
@@ -0,0 +1,361 @@
+<?php
+class fuckhtml{
+
+ public function __construct($html = null, $isfile = false){
+
+ if($html !== null){
+
+ $this->load($html, $isfile);
+ }
+ }
+
+ public function load($html, $isfile = false){
+
+ if(is_array($html)){
+
+ if(!isset($html["innerHTML"])){
+
+ throw new Exception("(load) Supplied array doesn't contain a innerHTML index");
+ }
+ $html = $html["innerHTML"];
+ }
+
+ if($isfile){
+
+ $handle = fopen($html, "r");
+ $fetch = fread($handle, filesize($html));
+ fclose($handle);
+
+ $this->html = $fetch;
+ }else{
+
+ $this->html = $html;
+ }
+
+ $this->strlen = strlen($this->html);
+ }
+
+ public function getElementsByTagName(string $tagname){
+
+ $out = [];
+
+ /*
+ Scrape start of the tag. Example
+ <div class="mydiv"> ...
+ */
+
+ if($tagname == "*"){
+
+ $tagname = '[^\/<>\s]+';
+ }else{
+
+ $tagname = preg_quote(strtolower($tagname));
+ }
+
+ preg_match_all(
+ '/<\s*(' . $tagname . ')(\s(?:[^>\'"]*|"[^"]*"|\'[^\']*\')+)?\s*>/i',
+ /* '/<\s*(' . $tagname . ')(\s[\S\s]*?)?>/i', */
+ $this->html,
+ $starting_tags,
+ PREG_OFFSET_CAPTURE
+ );
+
+ for($i=0; $i<count($starting_tags[0]); $i++){
+
+ /*
+ Parse attributes
+ */
+ $attributes = [];
+
+ preg_match_all(
+ '/([^\/\s\\=]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s]*))?/',
+ $starting_tags[2][$i][0],
+ $regex_attributes
+ );
+
+ for($k=0; $k<count($regex_attributes[0]); $k++){
+
+ if(trim($regex_attributes[2][$k]) == ""){
+
+ $attributes[$regex_attributes[1][$k]] =
+ "true";
+
+ continue;
+ }
+
+ $attributes[$regex_attributes[1][$k]] =
+ trim($regex_attributes[2][$k], "'\" \n\r\t\v\x00");
+ }
+
+ $out[] = [
+ "tagName" => strtolower($starting_tags[1][$i][0]),
+ "startPos" => $starting_tags[0][$i][1],
+ "endPos" => 0,
+ "startTag" => $starting_tags[0][$i][0],
+ "attributes" => $attributes,
+ "innerHTML" => null
+ ];
+ }
+
+ /*
+ Get innerHTML
+ */
+ // get closing tag positions
+ preg_match_all(
+ '/<\s*\/\s*(' . $tagname . ')\s*>/i',
+ $this->html,
+ $regex_closing_tags,
+ PREG_OFFSET_CAPTURE
+ );
+
+ // merge opening and closing tags together
+ for($i=0; $i<count($regex_closing_tags[1]); $i++){
+
+ $out[] = [
+ "tagName" => strtolower($regex_closing_tags[1][$i][0]),
+ "endTag" => $regex_closing_tags[0][$i][0],
+ "startPos" => $regex_closing_tags[0][$i][1]
+ ];
+ }
+
+ usort(
+ $out,
+ function($a, $b){
+
+ return $a["startPos"] > $b["startPos"];
+ }
+ );
+
+ // computer the indent level for each element
+ $level = [];
+ $count = count($out);
+
+ for($i=0; $i<$count; $i++){
+
+ if(!isset($level[$out[$i]["tagName"]])){
+
+ $level[$out[$i]["tagName"]] = 0;
+ }
+
+ if(isset($out[$i]["startTag"])){
+
+ // encountered starting tag
+ $level[$out[$i]["tagName"]]++;
+ $out[$i]["level"] = $level[$out[$i]["tagName"]];
+ }else{
+
+ // encountered closing tag
+ $out[$i]["level"] = $level[$out[$i]["tagName"]];
+ $level[$out[$i]["tagName"]]--;
+ }
+ }
+
+ // if the indent level is the same for a div,
+ // we encountered _THE_ closing tag
+ for($i=0; $i<$count; $i++){
+
+ if(!isset($out[$i]["startTag"])){
+
+ continue;
+ }
+
+ for($k=$i; $k<$count; $k++){
+
+ if(
+ isset($out[$k]["endTag"]) &&
+ $out[$i]["tagName"] == $out[$k]["tagName"] &&
+ $out[$i]["level"]
+ === $out[$k]["level"]
+ ){
+
+ $startlen = strlen($out[$i]["startTag"]);
+ $endlen = strlen($out[$k]["endTag"]);
+
+ $out[$i]["endPos"] = $out[$k]["startPos"] + $endlen;
+
+ $out[$i]["innerHTML"] =
+ substr(
+ $this->html,
+ $out[$i]["startPos"] + $startlen,
+ $out[$k]["startPos"] - ($out[$i]["startPos"] + $startlen)
+ );
+
+ $out[$i]["outerHTML"] =
+ substr(
+ $this->html,
+ $out[$i]["startPos"],
+ $out[$k]["startPos"] - $out[$i]["startPos"] + $endlen
+ );
+
+ break;
+ }
+ }
+ }
+
+ // filter out ending divs
+ for($i=0; $i<$count; $i++){
+
+ if(isset($out[$i]["endTag"])){
+
+ unset($out[$i]);
+ }
+
+ unset($out[$i]["startTag"]);
+ }
+
+ return array_values($out);
+ }
+
+ public function getElementsByAttributeName(string $name, $collection = null){
+
+ if($collection === null){
+
+ $collection = $this->getElementsByTagName("*");
+ }elseif(is_string($collection)){
+
+ $collection = $this->getElementsByTagName($collection);
+ }
+
+ $return = [];
+ foreach($collection as $elem){
+
+ foreach($elem["attributes"] as $attrib_name => $attrib_value){
+
+ if($attrib_name == $name){
+
+ $return[] = $elem;
+ continue 2;
+ }
+ }
+ }
+
+ return $return;
+ }
+
+ public function getElementsByFuzzyAttributeValue(string $name, string $value, $collection = null){
+
+ $elems = $this->getElementsByAttributeName($name, $collection);
+ $value = explode(" ", $value);
+
+ $return = [];
+
+ foreach($elems as $elem){
+
+ foreach($elem["attributes"] as $attrib_name => $attrib_value){
+
+ $attrib_value = explode(" ", $attrib_value);
+ $ac = count($attrib_value);
+ $nc = count($value);
+ $cr = 0;
+
+ for($i=0; $i<$nc; $i++){
+
+ for($k=0; $k<$ac; $k++){
+
+ if($value[$i] == $attrib_value[$k]){
+
+ $cr++;
+ }
+ }
+ }
+
+ if($cr === $nc){
+
+ $return[] = $elem;
+ continue 2;
+ }
+ }
+ }
+
+ return $return;
+ }
+
+ public function getElementsByAttributeValue(string $name, string $value, $collection = null){
+
+ $elems = $this->getElementsByAttributeName($name, $collection);
+
+ $return = [];
+
+ foreach($elems as $elem){
+
+ foreach($elem["attributes"] as $attrib_name => $attrib_value){
+
+ if($attrib_value == $value){
+
+ $return[] = $elem;
+ continue 2;
+ }
+ }
+ }
+
+ return $return;
+ }
+
+ public function getElementById(string $idname, $collection = null){
+
+ $id = $this->getElementsByAttributeValue("id", $idname, $collection);
+
+ if(count($id) !== 0){
+
+ return $id[0];
+ }
+
+ return false;
+ }
+
+ public function getElementsByClassName(string $classname, $collection = null){
+
+ return $this->getElementsByFuzzyAttributeValue("class", $classname, $collection);
+ }
+
+ public function getTextContent($html, $whitespace = false, $trim = true){
+
+ if(is_array($html)){
+
+ if(!isset($html["innerHTML"])){
+
+ throw new Exception("(getTextContent) Supplied array doesn't contain a innerHTML index");
+ }
+ $html = $html["innerHTML"];
+ }
+
+ $html =
+ preg_split('/\n|<\/?br>/i', $html);
+
+ $out = "";
+ for($i=0; $i<count($html); $i++){
+
+ $tmp =
+ html_entity_decode(
+ strip_tags(
+ $html[$i]
+ ),
+ ENT_QUOTES | ENT_XML1, "UTF-8"
+ );
+
+ if($trim){
+
+ $tmp = trim($tmp);
+ }
+
+ $out .= $tmp;
+
+ if($whitespace === true){
+
+ $out .= "\n";
+ }else{
+
+ $out .= " ";
+ }
+ }
+
+ if($trim){
+
+ return trim($out);
+ }
+
+ return $out;
+ }
+}
+
+?>