%PDF- %PDF-
| Direktori : /home/lightco1/upgrade.lightco.com.au/libraries/regularlabs/helpers/ |
| Current File : /home/lightco1/upgrade.lightco.com.au/libraries/regularlabs/helpers/htmlfixer.class.php |
<?php
// -------------------------------------------------
// HTML FIXER v.2.05 15/07/2010
// Cleans dirty HTML: fixes unclosed tags, bad nesting,
// bad quotes and bad self-closing tags.
//
// by Giulio Pons, http://www.barattalo.it
// -------------------------------------------------
// usage:
// -------------------------------------------------
// $a = new HtmlFixer();
// $clean_html = $a->getFixedHtml($dirty_html);
// -------------------------------------------------
/**
 * Cleans up "dirty" HTML fragments.
 *
 * The fixer normalises every tag (lower-case attribute names, quoted
 * attribute values, self-closed void elements, a single merged style
 * attribute), rebuilds the markup as a tree so that unclosed tags get
 * closed and stray closing tags are dropped, and finally removes a few
 * forbidden nestings (for example a <div> inside a <p>).
 *
 * Usage:
 *   $fixer = new HtmlFixer();
 *   $clean = $fixer->getFixedHtml($dirty);
 */
class HtmlFixer
{
    /** @var string HTML fed to the current parsing pass (overwritten between passes). */
    public $dirtyhtml;

    /** @var string Result of the last getFixedHtml() call. */
    public $fixedhtml;

    /** @var array Substrings of CSS declarations that may be kept in style attributes (empty = keep all). */
    public $allowed_styles;

    /**
     * @var array Flat node list. Each node is an array with the keys
     *            "tagType", "tag", "parentTag" (int index into this list),
     *            "pre" (text before the tag) and "post" (text before a closing tag).
     */
    private $matrix;

    /** @var bool When true, diagnostic HTML is echoed while fixing. */
    public $debug;

    /** @var string Debug rendering of the node tree (only filled when $debug is true). */
    private $fixedhtmlDisplayCode;

    /** @var array Elements that never have content or a closing tag. */
    private static $voidTags = array(
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    );

    public function __construct()
    {
        $this->dirtyhtml = "";
        $this->fixedhtml = "";
        $this->debug = false;
        $this->fixedhtmlDisplayCode = "";
        $this->allowed_styles = array();
        $this->matrix = array();
    }

    /**
     * Fixes an HTML fragment.
     *
     * The fragment is parsed and rebuilt up to ten times; iteration stops as
     * soon as a pass reports no nesting corrections.
     *
     * @param string $dirtyhtml HTML fragment to repair
     *
     * @return string the repaired HTML (also available in $this->fixedhtml)
     */
    public function getFixedHtml($dirtyhtml)
    {
        $this->dirtyhtml = (string) $dirtyhtml;
        $this->fixedhtml = "";
        $this->fixedhtmlDisplayCode = "";
        $this->matrix = array();

        for ($pass = 0; $pass < 10; $pass++) {
            if ($pass > 0) {
                // Feed the previous result back in: every pass can expose new problems.
                $this->dirtyhtml = $this->fixedhtml;
            }
            $errorsFound = $this->charByCharJob();
            // Drop the synthetic wrapper added by charByCharJob().
            $this->fixedhtml = str_replace(array('<root>', '</root>'), '', $this->fixedhtml);
            $this->fixedhtml = $this->removeSpacesAndBadTags($this->fixedhtml);
            if (!$errorsFound) {
                break; // no corrections made: the markup is stable
            }
        }

        return $this->fixedhtml;
    }

    /**
     * Lower-cases the attribute-name part of a tag token.
     *
     * @param string $m one token of a tag ("name=value", a tag name, a bare attribute...)
     *
     * @return string the token with everything before the first "=" lower-cased
     *                (the whole token when it contains no "=")
     */
    private function fixStrToLower($m)
    {
        $eq = strpos($m, '=');
        if ($eq === false) {
            return strtolower($m);
        }

        return strtolower(substr($m, 0, $eq)) . substr($m, $eq);
    }

    /**
     * Makes sure the value of a "name=value" token is enclosed in quotes.
     *
     * A trailing ">" or "/>" is treated as the tag terminator and kept after
     * the closing quote. The token is rebuilt from its parts instead of being
     * edited with str_replace(), so a value that also occurs inside the
     * attribute name (class=c) cannot corrupt the name.
     *
     * @param string $s tag token
     *
     * @return string the token with a properly quoted value
     */
    private function fixQuotes($s)
    {
        $eq = strpos($s, '=');
        if ($eq === false) {
            return $s;
        }
        $name = (string) substr($s, 0, $eq);
        $value = trim((string) substr($s, $eq + 1));
        if ($value === '') {
            return $s; // "name=" with nothing after it is left alone
        }

        // Peel the tag terminator off the value.
        $terminator = '';
        if (substr($value, -2) === '/>') {
            $terminator = '/>';
            $value = (string) substr($value, 0, -2);
        } elseif (substr($value, -1) === '>') {
            $terminator = '>';
            $value = (string) substr($value, 0, -1);
        }
        if ($value === '') {
            return $name . '=""' . $terminator;
        }

        $first = $value[0];
        $last = $value[strlen($value) - 1];
        if ($first === '"' || $first === "'") {
            // Opening quote present: add the matching closing quote when missing.
            if (strlen($value) < 2 || $last !== $first) {
                $value .= $first;
            }
        } elseif ($last === '"' || $last === "'") {
            // Only the closing quote is present: open with the same character.
            $value = $last . $value;
        } else {
            $value = '"' . $value . '"';
        }

        return $name . '=' . $value . $terminator;
    }

    /**
     * Splits a tag on spaces, keeping properly quoted attribute values intact.
     *
     * A quote only starts a protected region when it directly follows "=" and
     * a matching closing quote exists later in the tag; otherwise every space
     * splits, so unbalanced quotes are still repaired token by token.
     *
     * @param string $t raw tag text
     *
     * @return array list of tokens (empty tokens are kept, like explode(" ") does)
     */
    private function splitTagTokens($t)
    {
        $tokens = array();
        $current = '';
        $length = strlen($t);
        for ($i = 0; $i < $length; $i++) {
            $ch = $t[$i];
            if ($ch === ' ') {
                $tokens[] = $current;
                $current = '';
                continue;
            }
            if (($ch === '"' || $ch === "'") && $i > 0 && $t[$i - 1] === '=') {
                $closing = strpos($t, $ch, $i + 1);
                if ($closing !== false) {
                    $current .= substr($t, $i, $closing - $i + 1);
                    $i = $closing;
                    continue;
                }
            }
            $current .= $ch;
        }
        $tokens[] = $current;

        return $tokens;
    }

    /**
     * Normalises one tag: removes border/borderColor attributes, lower-cases
     * the tag and attribute names and quotes every attribute value.
     *
     * @param string $t raw tag, from "<" to ">"
     *
     * @return string the cleaned tag
     */
    private function fixTag($t)
    {
        $t = preg_replace(
            array(
                '/borderColor=([^ >])*/i',
                '/border=([^ >])*/i',
            ),
            '',
            $t
        );

        $rebuilt = "";
        foreach ($this->splitTagTokens($t) as $token) {
            $token = $this->fixStrToLower($token);
            if (strpos($token, "=") !== false) {
                $token = $this->fixQuotes($token);
            }
            $rebuilt .= $token . " ";
        }
        // Remove the padding directly inside the angle brackets.
        $rebuilt = preg_replace("/<( )*/i", "<", $rebuilt);
        $rebuilt = preg_replace("/( )*>/i", ">", $rebuilt);

        return trim($rebuilt);
    }

    /**
     * Merges all style="..." attributes of a tag into a single one, optionally
     * keeping only the declarations listed in $this->allowed_styles.
     *
     * @param string $s cleaned tag
     *
     * @return string the tag with at most one style attribute
     */
    private function mergeStyleAttributes($s)
    {
        $needle = 'style="';
        $placeholder = '##PUTITHERE##';
        $merged = "";
        $found = 0;

        while (($start = stripos($s, $needle)) !== false) {
            $valueStart = $start + strlen($needle);
            $valueEnd = strpos($s, '"', $valueStart);
            if ($valueEnd === false) {
                // Missing closing quote: add it before the tag terminator and stop.
                $repaired = preg_replace("/(\/)?>/i", "\"\\1>", $s);
                if ($repaired !== null) {
                    $s = $repaired;
                }
                break;
            }
            $value = (string) substr($s, $valueStart, $valueEnd - $valueStart);
            // The first style attribute marks where the merged one goes; the
            // others are removed. Working by position guarantees progress.
            $s = substr_replace($s, $found === 0 ? $placeholder : '', $start, $valueEnd - $start + 1);
            if ($value !== '' && substr($value, -1) !== ';') {
                $value .= ';';
            }
            $merged .= $value;
            $found++;
        }

        if (is_array($this->allowed_styles) && count($this->allowed_styles) > 0) {
            // keep only allowed styles (by Martin Vool 2010-04-19)
            $kept = "";
            foreach (explode(';', $merged) as $declaration) {
                if ($declaration === '') {
                    continue;
                }
                foreach ($this->allowed_styles as $allowed) {
                    if (stripos($declaration, $allowed) !== false) {
                        $kept .= $declaration . ';';
                        break;
                    }
                }
            }
            $merged = $kept;
        }

        if ($found > 0) {
            $s = str_replace($placeholder, 'style="' . $merged . '"', $s);
        }

        return $s;
    }

    /**
     * Tells whether a tag type is a void element (no content, no closing tag).
     *
     * @param string $type lower-case tag name
     *
     * @return bool
     */
    private function isVoidTag($type)
    {
        return in_array($type, self::$voidTags, true);
    }

    /**
     * Turns void elements into self-closing tags (<br> becomes <br/>).
     *
     * Closing tags are left untouched so that a stray </br> is not rewritten
     * into the meaningless </br/>.
     *
     * @param string $tag  cleaned tag
     * @param string $tipo tag type as returned by getTypeOfTag()
     *
     * @return string
     */
    private function fixAutoclosingTags($tag, $tipo = "")
    {
        if ($this->isVoidTag($tipo) && strpos($tag, '</') !== 0 && stripos($tag, '/>') === false) {
            $tag = str_replace('>', '/>', $tag);
        }

        return $tag;
    }

    /**
     * Extracts the tag name from a tag.
     *
     * @param string $tag cleaned tag
     *
     * @return string first space-separated word once "<", ">" and "/" are removed
     */
    private function getTypeOfTag($tag)
    {
        $stripped = trim(str_replace(array(">", "<", "/"), "", $tag));
        $parts = explode(" ", $stripped);

        return $parts[0];
    }

    /**
     * Creates an empty node for the matrix.
     *
     * @param int $parentTag index of the parent node
     *
     * @return array
     */
    private function newNode($parentTag)
    {
        return array(
            "tagType" => "",
            "tag" => "",
            "parentTag" => $parentTag,
            "pre" => "",
            "post" => "",
        );
    }

    /**
     * Looks for forbidden nestings and repairs them.
     *
     * When a node sits inside a parent it must not be nested in, the parent
     * tag is replaced by a placeholder comment (stripped later by
     * removeSpacesAndBadTags()) and the node, together with every later
     * sibling, is re-attached to the grandparent.
     *
     * @return int number of corrections made
     */
    private function checkTree()
    {
        $inlineParents = array("p", "b", "i", "font", "u", "small", "strong", "em");
        $boldParents = array("b", "strong");
        $italicParents = array("i", "em");
        // tag type => parent types it must not be a direct child of
        $forbiddenParents = array(
            "div" => $inlineParents,
            "p" => $inlineParents,
            "table" => array_merge($inlineParents, array("tr", "table")),
            "b" => $boldParents,
            "strong" => $boldParents,
            "i" => $italicParents,
            "em" => $italicParents,
        );

        $errorsCounter = 0;
        $total = count($this->matrix);
        for ($i = 1; $i < $total; $i++) {
            $tagType = $this->matrix[$i]["tagType"];
            if (!isset($forbiddenParents[$tagType])) {
                continue;
            }
            $parentIndex = $this->matrix[$i]["parentTag"];
            $parentType = $this->matrix[$parentIndex]["tagType"];
            if (!in_array($parentType, $forbiddenParents[$tagType], true)) {
                continue;
            }

            $errorsCounter++;
            if ($this->debug) echo "<div style='color:#ff0000'>Found a <b>" . $tagType . "</b> tag inside a <b>" . htmlspecialchars($parentType) . "</b> tag at node $i: MOVED</div>";
            $swap = $this->matrix[$parentIndex]["parentTag"];
            if ($this->debug) echo "<div style='color:#ff0000'>Every node that has parent " . $parentIndex . " will have parent " . $swap . "</div>";
            $this->matrix[$parentIndex]["tag"] = "<!-- T A G \"" . $parentType . "\" R E M O V E D -->";
            $this->matrix[$parentIndex]["tagType"] = "";
            // Only this node and the ones after it move; earlier children stay put.
            for ($j = $total - 1; $j >= $i; $j--) {
                if ($this->matrix[$j]["parentTag"] === $parentIndex) {
                    $this->matrix[$j]["parentTag"] = $swap;
                }
            }
        }

        return $errorsCounter;
    }

    /**
     * Groups node indexes by parent so the tree can be walked without
     * rescanning the whole matrix for every node.
     *
     * @return array parent index => list of child indexes, in document order
     */
    private function buildChildIndex()
    {
        $children = array();
        $total = count($this->matrix);
        for ($i = 1; $i < $total; $i++) {
            $children[$this->matrix[$i]["parentTag"]][] = $i;
        }

        return $children;
    }

    /**
     * Builds the corrected HTML of all descendants of a node, recursively.
     *
     * @param int        $parentTag index of the node whose children are rendered
     * @param array|null $children  child index from buildChildIndex() (built on demand)
     *
     * @return string
     */
    private function findSonsOf($parentTag, $children = null)
    {
        if ($children === null) {
            $children = $this->buildChildIndex();
        }
        if (!isset($children[$parentTag])) {
            return "";
        }

        $out = "";
        foreach ($children[$parentTag] as $i) {
            $node = $this->matrix[$i];
            $out .= $node["pre"] . $node["tag"] . $node["post"];
            if ($node["tag"] !== "") {
                $out .= $this->findSonsOf($i, $children);
                // write the closing tag (removed tags have no type, void tags no closing tag)
                if ($node["tagType"] !== "" && !$this->isVoidTag($node["tagType"])) {
                    $out .= "</" . $node["tagType"] . ">";
                }
            }
        }

        return $out;
    }

    /**
     * Renders the node tree as nested, escaped HTML. Used for debugging only.
     *
     * @param int        $parentTag index of the node whose children are rendered
     * @param array|null $children  child index from buildChildIndex() (built on demand)
     *
     * @return string
     */
    private function findSonsOfDisplayCode($parentTag, $children = null)
    {
        if ($children === null) {
            $children = $this->buildChildIndex();
        }
        if (!isset($children[$parentTag])) {
            return "";
        }

        $out = "";
        foreach ($children[$parentTag] as $i) {
            $node = $this->matrix[$i];
            $out .= "<div style=\"padding-left:15\"><span style='float:left;background-color:#FFFF99;color:#000;'>{$i}:</span>";
            if ($node["pre"] !== "") {
                $out .= htmlspecialchars($node["pre"]) . "<br>";
            }
            if ($node["tag"] !== "") {
                $out .= htmlspecialchars($node["tag"]) . "<span style='background-color:red; color:white'>{$i} <em>" . $node["tagType"] . "</em></span>";
            }
            $out .= htmlspecialchars($node["post"]);
            if ($node["tag"] !== "") {
                $out .= "<div>" . $this->findSonsOfDisplayCode($i, $children) . "</div>\n";
                if ($node["tagType"] !== "" && !$this->isVoidTag($node["tagType"])) {
                    $out .= "<div style='color:red'>" . htmlspecialchars("</" . $node["tagType"] . ">") . "{$i} <em>" . $node["tagType"] . "</em></div>";
                }
            }
            $out .= "</div>\n";
        }

        return $out;
    }

    /**
     * Collapses line breaks and removes empty elements, MS Word leftovers,
     * DOCTYPEs, comments and processing instructions.
     *
     * The replacement runs up to ten times so that elements which become
     * empty after an inner removal are removed as well; it stops early once
     * a run changes nothing.
     *
     * NOTE(review): several patterns contain "( )*" and some replacements are
     * a single space; these look like decoded "&nbsp;" entities. They are
     * reproduced exactly as found - confirm against the upstream library.
     *
     * @param string $s HTML to clean
     *
     * @return string
     */
    private function removeSpacesAndBadTags($s)
    {
        $patterns = array(
            '/[\r\n]/i',
            '/ /i',
            '/<p([^>])*>( )*\s*<\/p>/i',
            '/<span([^>])*>( )*\s*<\/span>/i',
            '/<strong([^>])*>( )*\s*<\/strong>/i',
            '/<em([^>])*>( )*\s*<\/em>/i',
            '/<font([^>])*>( )*\s*<\/font>/i',
            '/<small([^>])*>( )*\s*<\/small>/i',
            '/<\?xml:namespace([^>])*><\/\?xml:namespace>/i',
            '/<\?xml:namespace([^>])*\/>/i',
            '/class=\"MsoNormal\"/i',
            '/<o:p><\/o:p>/i',
            '/<!DOCTYPE([^>])*>/i',
            // ".*?" with /s matches the same text as "(.|\s)*?" without the
            // per-character alternation that can exhaust PCRE limits.
            '/<!--.*?-->/s',
            '/<\?.*?\?>/s',
        );
        // One replacement per pattern, in the same order.
        $replacements = array(
            ' ',
            ' ',
            '',
            '',
            '',
            '',
            '',
            '',
            '',
            '',
            '',
            ' ',
            '',
            '',
            '',
        );

        $s = (string) $s;
        for ($pass = 0; $pass < 10; $pass++) {
            $cleaned = preg_replace($patterns, $replacements, trim($s));
            if ($cleaned === null) {
                break; // PCRE failure: keep the last good text instead of wiping it
            }
            if ($cleaned === $s) {
                break; // fixed point: further runs would not change anything
            }
            $s = $cleaned;
        }

        return $s;
    }

    /**
     * Runs one parsing pass over $this->dirtyhtml and stores the rebuilt
     * markup (still wrapped in <root>...</root>) in $this->fixedhtml.
     *
     * Every tag is cleaned and stored as a node of $this->matrix together
     * with the text that precedes it; a stack of open tags provides the
     * parent of each node. Closing tags without any open tag of the same
     * type are dropped, but the text in front of them is kept.
     *
     * @return int number of nesting corrections made by checkTree()
     */
    private function charByCharJob()
    {
        $s = $this->removeSpacesAndBadTags($this->dirtyhtml);
        if ($s === "") {
            $this->fixedhtml = "";
            return 0;
        }
        $s = "<root>" . $s . "</root>";

        $this->matrix = array(0 => $this->newNode(0));
        $parents = array(0);  // stack of indexes of the currently open tags
        $openCount = array(); // tag type => number of open tags not yet closed
        $text = "";           // text collected since the last stored node
        $j = 0;               // index of the last node
        $pos = 0;
        $length = strlen($s);

        while ($pos < $length) {
            $open = strpos($s, "<", $pos);
            if ($open === false) {
                break;
            }
            $text .= substr($s, $pos, $open - $pos);
            // The closing </root> guarantees that a ">" follows.
            $close = strpos($s, ">", $open);
            if ($close === false) {
                break;
            }
            $pos = $close + 1;

            $tag = $this->fixTag(substr($s, $open, $close - $open + 1));
            $tagType = $this->getTypeOfTag($tag);
            $tag = $this->fixAutoclosingTags($tag, $tagType);
            $tag = $this->mergeStyleAttributes($tag);
            if (!isset($openCount[$tagType])) {
                $openCount[$tagType] = 0;
            }
            $isClosing = strpos($tag, "</") === 0;

            if ($isClosing && $openCount[$tagType] == 0) {
                // Closing tag without any open tag: drop the tag, keep the text.
                if ($this->debug) echo "<div style='color:#ff0000'>Found a closing tag <b>" . htmlspecialchars($tag) . "</b> at char $close without open tag: REMOVED</div>";
                continue;
            }

            $j++;
            if ($isClosing) {
                // The text before a closing tag belongs to the tag being closed.
                $node = $this->newNode(array_pop($parents));
                $node["post"] = $text;
                $openCount[$tagType]--;
            } else {
                $node = $this->newNode(end($parents));
                $node["tagType"] = $tagType;
                $node["tag"] = $tag;
                $node["pre"] = $text;
                if (strpos($tag, "/>") === false) {
                    // A real open tag becomes the parent of what follows.
                    $openCount[$tagType]++;
                    $parents[] = $j;
                }
            }
            $this->matrix[$j] = $node;
            $text = "";
        }

        $errorsCounter = $this->checkTree(); // number of removed tags
        $this->fixedhtml = $this->findSonsOf(0); // build the fixed html

        if ($this->debug) {
            $this->fixedhtmlDisplayCode = $this->findSonsOfDisplayCode(0);
            echo "<table border=1 cellspacing=0 cellpadding=0>";
            echo "<tr><th>node id</th>";
            echo "<th>pre</th>";
            echo "<th>tag</th>";
            echo "<th>post</th>";
            echo "<th>parentTag</th>";
            echo "<th>tipo</th></tr>";
            for ($k = 0; $k <= $j; $k++) {
                echo "<tr><td>$k</td>";
                echo "<td> " . htmlspecialchars($this->matrix[$k]["pre"]) . "</td>";
                echo "<td> " . htmlspecialchars($this->matrix[$k]["tag"]) . "</td>";
                echo "<td> " . htmlspecialchars($this->matrix[$k]["post"]) . "</td>";
                echo "<td> " . $this->matrix[$k]["parentTag"] . "</td>";
                echo "<td> <i>" . $this->matrix[$k]["tagType"] . "</i></td></tr>";
            }
            echo "</table>";
            echo "<hr/>{$j}<hr/>\n\n\n\n" . $this->fixedhtmlDisplayCode;
        }

        return $errorsCounter;
    }
}