PageRenderTime 61ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 0ms

/civicrm/custom/ext/gov.nysenate.inbox/incl/htmlfixer.class.php

https://github.com/nysenate/Bluebird-CRM
PHP | 455 lines | 372 code | 30 blank | 53 comment | 65 complexity | d560c3a429490a8e7a5dbba3b99f3da0 MD5 | raw file
Possible License(s): JSON, BSD-3-Clause, MPL-2.0-no-copyleft-exception, AGPL-1.0, GPL-2.0, AGPL-3.0, Apache-2.0, MIT, GPL-3.0, CC-BY-4.0, LGPL-2.1, BSD-2-Clause, LGPL-3.0
  1. <?php
  2. // -------------------------------------------------
  3. // HTML FIXER v.2.05 15/07/2010
  4. // clean dirty html and make it better, fix open tags
  5. // bad nesting, bad quotes, bad autoclosing tags.
  6. //
  7. // by Giulio Pons, http://www.barattalo.it
  8. // -------------------------------------------------
  9. // usage:
  10. // -------------------------------------------------
  11. // $a = new HtmlFixer();
  12. // $clean_html = $a->getFixedHtml($dirty_html);
  13. // -------------------------------------------------
  /**
   * Best-effort HTML repair: closes unclosed tags, drops orphan closing tags,
   * lowercases tag/attribute names, quotes attribute values, self-closes void
   * tags, merges duplicate style attributes and un-nests a few forbidden
   * parent/child combinations (e.g. a div inside a p).
   *
   * The input is wrapped in a synthetic <root> element, split into a flat list
   * of nodes ($matrix) that reference their parent by index, checked for bad
   * nesting, and serialised again. The process is repeated (at most 10 times)
   * until a pass finds no more nesting errors.
   */
  class HtmlFixer {
      /** Markup being processed; on passes after the first it holds the previous pass' output. */
      public $dirtyhtml;
      /** Result of the last getFixedHtml() call. */
      public $fixedhtml;
      /** Same value as $fixedhtml; kept because earlier versions only filled this name. */
      public $fixedxhtml;
      /** Inline css whitelist: a declaration is kept if it contains any entry (empty array = keep all). */
      public $allowed_styles;
      /** Flat node list; each node is array(pre, tag, post, parentTag, tagType). Index 0 is a sentinel. */
      private $matrix;
      /** When true, diagnostic HTML is echoed while fixing. */
      public $debug;
      /** Debug rendering of the node tree built by the last pass. */
      private $fixedhtmlDisplayCode;

      /** Tags that never get a closing tag. */
      private $voidTags = array("img", "input", "br", "hr");

      public function __construct() {
          $this->dirtyhtml = "";
          $this->fixedhtml = "";
          $this->fixedxhtml = "";
          $this->debug = false;
          $this->fixedhtmlDisplayCode = "";
          $this->allowed_styles = array();
          $this->matrix = array();
      }

      /**
       * Repairs $dirtyhtml and returns the cleaned markup.
       *
       * All per-call state is reset first, so an instance can be reused and an
       * empty input yields an empty string instead of the previous result.
       *
       * @param string $dirtyhtml markup to repair
       * @return string repaired markup (also stored in $fixedhtml / $fixedxhtml)
       */
      public function getFixedHtml($dirtyhtml) {
          $this->dirtyhtml = $dirtyhtml;
          $this->fixedhtml = "";
          $this->fixedxhtml = "";
          $this->fixedhtmlDisplayCode = "";
          $this->matrix = array();

          // every pass feeds on the previous one; stop as soon as a pass is clean
          for ($pass = 0; $pass < 10; $pass++) {
              if ($pass > 0) {
                  $this->dirtyhtml = $this->fixedxhtml;
              }
              $errorsFound = $this->charByCharJob();
              $withoutRoot = str_replace(array('<root>', '</root>'), '', $this->fixedxhtml);
              $this->fixedxhtml = $this->removeSpacesAndBadTags($withoutRoot);
              if (!$errorsFound) {
                  break;
              }
          }

          $this->fixedhtml = $this->fixedxhtml;
          return $this->fixedxhtml;
      }

      /**
       * Lowercases the part of a tag token that precedes the first "=" (the
       * attribute or tag name); the value part is returned untouched.
       */
      private function fixStrToLower($m) {
          $equals = strpos($m, '=');
          if ($equals === false) {
              return strtolower($m);
          }
          return strtolower(substr($m, 0, $equals)) . substr($m, $equals);
      }

      /**
       * Makes sure the value of a single name=value token is quoted.
       *
       * The token may carry the end of the tag (">" or "/>"), which is kept
       * outside the quotes. The token is rebuilt from its parts instead of being
       * patched with str_replace, so a value that also occurs in the name
       * (checked=checked) cannot corrupt the name.
       */
      private function fixQuotes($s) {
          $equals = strpos($s, "=");
          if ($equals === false) {
              return $s;
          }
          $name = substr($s, 0, $equals);
          $value = trim((string) substr($s, $equals + 1));
          if ($value === "") {
              return $s;
          }

          // detach the tag terminator, if this is the last token of the tag
          $closer = "";
          if (substr($value, -2) === "/>") {
              $closer = "/>";
              $value = (string) substr($value, 0, -2);
          } elseif (substr($value, -1) === ">") {
              $closer = ">";
              $value = (string) substr($value, 0, -1);
          }

          // reuse the quote style the value starts with, default to double quotes
          $quote = "\"";
          $first = substr($value, 0, 1);
          if ($first === "\"" || $first === "'") {
              $quote = $first;
          } else {
              $value = "\"" . $value;
          }
          $last = substr($value, -1);
          if (strlen($value) < 2 || ($last !== "\"" && $last !== "'")) {
              $value .= $quote;
          }

          return $name . "=" . $value . $closer;
      }

      /**
       * Normalises one raw tag: strips border/borderColor attributes, lowercases
       * names, quotes attribute values and removes blanks next to "<" and ">".
       */
      private function fixTag($t) {
          $t = preg_replace(
              array(
                  '/borderColor=([^ >])*/i',
                  '/border=([^ >])*/i'
              ),
              '',
              $t
          );

          // split on blanks, but keep a complete quoted value (class="a b") in one token
          preg_match_all('/(?:[^ "\']++|"[^"]*"|\'[^\']*\'|["\'])+/', $t, $found);
          $tokens = array();
          foreach ($found[0] as $token) {
              $token = $this->fixStrToLower($token);
              if (strpos($token, "=") !== false) {
                  $token = $this->fixQuotes($token); // thanks to emmanuel@evobilis.com
              }
              $tokens[] = $token;
          }

          $nt = implode(" ", $tokens);
          $nt = preg_replace("/<( )*/i", "<", $nt);
          $nt = preg_replace("/( )*>/i", ">", $nt);
          return trim($nt);
      }

      /**
       * Merges all style="..." attributes of a tag into a single one (placed
       * where the first one was) and applies the $allowed_styles whitelist.
       */
      private function mergeStyleAttributes($s) {
          if (stripos($s, "style=\"") === false) {
              return $s;
          }
          if (!preg_match_all('/style="([^"]*)"/i', $s, $found)) {
              // opening quote without a closing one: add the missing quote
              return preg_replace("/(\/)?>/i", "\"\\1>", $s);
          }

          $merged = "";
          foreach ($found[1] as $declarations) {
              if ($declarations === "") {
                  continue; // style="" contributes nothing
              }
              if (substr($declarations, -1) !== ";") {
                  $declarations .= ";";
              }
              $merged .= $declarations;
          }

          if (count($this->allowed_styles) > 0) {
              // keep only allowed styles by Martin Vool 2010-04-19
              $kept = "";
              foreach (explode(';', $merged) as $declaration) {
                  foreach ($this->allowed_styles as $allowed) {
                      if (stripos($declaration, $allowed) !== false) {
                          $kept .= $declaration . ';';
                          break;
                      }
                  }
              }
              $merged = $kept;
          }

          // a literal placeholder is used so $merged is never interpreted as a regex replacement
          $s = preg_replace('/style="[^"]*"/i', "##PUTITHERE##", $s, 1);
          $s = preg_replace('/style="[^"]*"/i', "", $s);
          return str_replace("##PUTITHERE##", "style=\"" . $merged . "\"", $s);
      }

      /**
       * Turns img, input, br and hr tags into self-closing tags (<br/>, <img ... />).
       *
       * @param string $tag  the tag text
       * @param string $tipo the tag name as returned by getTypeOfTag()
       */
      private function fixAutoclosingTags($tag, $tipo = "") {
          if (in_array($tipo, $this->voidTags, true) && stripos($tag, '/>') === false) {
              $tag = str_replace('>', '/>', $tag);
          }
          return $tag;
      }

      /**
       * Returns the tag name: the first blank-separated word once every "<", ">"
       * and "/" has been removed from the tag.
       */
      private function getTypeOfTag($tag) {
          $bare = trim(preg_replace("/[\>\<\/]/i", "", $tag));
          $words = explode(" ", $bare);
          return $words[0];
      }

      /**
       * Looks for forbidden parent/child combinations and removes the offending
       * parent: its tag is replaced by an HTML comment and its children from the
       * offending node onwards are attached to the grandparent.
       *
       * @return int number of parents removed
       */
      private function checkTree() {
          $inline = array("p", "b", "i", "font", "u", "small", "strong", "em");
          // child tag => parents it must not be nested in
          $forbiddenParents = array(
              "div"    => $inline,
              "p"      => $inline,
              "table"  => array_merge($inline, array("tr", "table")),
              "b"      => array("b", "strong"),
              "strong" => array("b", "strong"),
              "i"      => array("i", "em"),
              "em"     => array("i", "em")
          );

          $errorsCounter = 0;
          $total = count($this->matrix);
          for ($i = 1; $i < $total; $i++) {
              $tagType = $this->matrix[$i]["tagType"];
              if (!isset($forbiddenParents[$tagType])) {
                  continue;
              }
              $parentIndex = $this->matrix[$i]["parentTag"];
              $parentType = $this->matrix[$parentIndex]["tagType"];
              if (!in_array($parentType, $forbiddenParents[$tagType], true)) {
                  continue;
              }

              $errorsCounter++;
              $swap = $this->matrix[$parentIndex]["parentTag"];
              if ($this->debug) echo "<div style='color:#ff0000'>Found a <b>".$tagType."</b> tag inside a <b>".htmlspecialchars($parentType)."</b> tag at node $i: MOVED</div>";
              if ($this->debug) echo "<div style='color:#ff0000'>Every node that has parent ".$parentIndex." will have parent ".$swap."</div>";

              // the marker comment is stripped again by removeSpacesAndBadTags()
              $this->matrix[$parentIndex]["tag"] = "<!-- T A G \"".$parentType."\" R E M O V E D -->";
              $this->matrix[$parentIndex]["tagType"] = "";
              for ($j = $total - 1; $j >= $i; $j--) {
                  if ($this->matrix[$j]["parentTag"] === $parentIndex) {
                      $this->matrix[$j]["parentTag"] = $swap;
                  }
              }
          }
          return $errorsCounter;
      }

      /**
       * Serialises the children of node $parentTag (recursively) back to HTML.
       *
       * @param int        $parentTag  index of the parent node
       * @param array|null $childrenOf parent index => child indexes; built on the
       *                               first call so the node list is scanned once
       * @return string
       */
      private function findSonsOf($parentTag, $childrenOf = null) {
          if ($childrenOf === null) {
              $childrenOf = array();
              $total = count($this->matrix);
              for ($i = 1; $i < $total; $i++) {
                  $childrenOf[$this->matrix[$i]["parentTag"]][] = $i;
              }
          }
          if (!isset($childrenOf[$parentTag])) {
              return "";
          }

          $out = "";
          foreach ($childrenOf[$parentTag] as $i) {
              $node = $this->matrix[$i];
              // text-only nodes (closing tags) have an empty "tag"
              $out .= $node["pre"] . $node["tag"] . $node["post"];
              if ($node["tag"] === "") {
                  continue;
              }
              $out .= $this->findSonsOf($i, $childrenOf);
              // write the closing tag, unless the tag was removed or is a void tag
              if ($node["tagType"] !== "" && !in_array($node["tagType"], $this->voidTags, true)) {
                  $out .= "</" . $node["tagType"] . ">";
              }
          }
          return $out;
      }

      /**
       * Debug helper: renders the children of node $parentTag as nested,
       * annotated <div> blocks.
       */
      private function findSonsOfDisplayCode($parentTag) {
          $out = "";
          $total = count($this->matrix);
          for ($i = 1; $i < $total; $i++) {
              $node = $this->matrix[$i];
              if ($node["parentTag"] !== $parentTag) {
                  continue;
              }
              $out .= "<div style=\"padding-left:15px\"><span style='float:left;background-color:#FFFF99;color:#000;'>{$i}:</span>";
              if ($node["pre"] != "") $out .= htmlspecialchars($node["pre"])."<br>";
              if ($node["tag"] != "") {
                  $out .= "".htmlspecialchars($node["tag"])."<span style='background-color:red; color:white'>{$i} <em>".$node["tagType"]."</em></span>";
              }
              $out .= htmlspecialchars($node["post"]);
              if ($node["tag"] != "") {
                  $out .= "<div>".$this->findSonsOfDisplayCode($i)."</div>\n";
                  if ($node["tagType"] != "" && !in_array($node["tagType"], $this->voidTags, true)) {
                      $out .= "<div style='color:red'>".htmlspecialchars("</". $node["tagType"].">")."{$i} <em>".$node["tagType"]."</em></div>";
                  }
              }
              $out .= "</div>\n";
          }
          return $out;
      }

      /**
       * Replaces line breaks with blanks and strips empty inline elements, Word
       * leftovers, doctype, comments and processing instructions. The rules are
       * applied repeatedly (at most 10 rounds) because removing one empty
       * element can leave its parent empty.
       *
       * @param string $s
       * @return string
       */
      private function removeSpacesAndBadTags($s) {
          $rules = array(
              '/[\r\n]/i' => ' ',
              '/ /i' => ' ',
              '/<p([^>])*>(&nbsp;)*\s*<\/p>/i' => '',
              '/<span([^>])*>(&nbsp;)*\s*<\/span>/i' => '',
              '/<strong([^>])*>(&nbsp;)*\s*<\/strong>/i' => '',
              '/<em([^>])*>(&nbsp;)*\s*<\/em>/i' => '',
              '/<font([^>])*>(&nbsp;)*\s*<\/font>/i' => '',
              '/<small([^>])*>(&nbsp;)*\s*<\/small>/i' => '',
              '/<\?xml:namespace([^>])*><\/\?xml:namespace>/i' => '',
              '/<\?xml:namespace([^>])*\/>/i' => '',
              '/class=\"MsoNormal\"/i' => '',
              '/<o:p><\/o:p>/i' => ' ',
              '/<!DOCTYPE([^>])*>/i' => '',
              // ".*?" with the s flag matches what "(.|\s)*?" did, without the per-character group
              '/<!--.*?-->/s' => '',
              '/<\?.*?\?>/s' => ''
          );
          $patterns = array_keys($rules);
          $replacements = array_values($rules);

          $s = (string) $s;
          for ($round = 0; $round < 10; $round++) {
              $next = preg_replace($patterns, $replacements, trim($s));
              if ($next === null) {
                  break; // PCRE failure: keep the last good text instead of losing it
              }
              if ($next === $s) {
                  break; // stable, further rounds would not change anything
              }
              $s = $next;
          }
          return $s;
      }

      /**
       * One repair pass: tokenises $dirtyhtml into the node list, fixes bad
       * nesting and stores the re-serialised markup (still wrapped in <root>)
       * in $fixedxhtml.
       *
       * @return int number of nesting errors corrected in this pass
       */
      private function charByCharJob() {
          $s = $this->removeSpacesAndBadTags($this->dirtyhtml);
          if ($s === "") {
              $this->fixedxhtml = "";
              return 0;
          }
          $s = "<root>".$s."</root>";
          $length = strlen($s);

          // node 0 is a sentinel that acts as parent of <root>
          $this->matrix = array(
              array("tagType" => "", "tag" => "", "parentTag" => 0, "pre" => "", "post" => "")
          );
          $padri = array(0);   // stack of the currently open nodes
          $tags = array();     // tag name => number of currently open tags
          $ns = "";            // text collected since the last stored node
          $j = 0;
          $i = 0;

          while ($i < $length) {
              $open = strpos($s, "<", $i);
              if ($open === false) {
                  break;
              }
              $ns .= substr($s, $i, $open - $i);
              $close = strpos($s, ">", $open);
              if ($close === false) {
                  break; // cannot happen: the text always ends with </root>
              }
              $tag = substr($s, $open, $close - $open + 1);
              $i = $close + 1;
              $contenuto = $ns;
              $ns = "";

              // $tag contains a tag <...chars...>: let's clean it!
              $tag = $this->fixTag($tag);
              $tagType = $this->getTypeOfTag($tag);
              $tag = $this->fixAutoclosingTags($tag, $tagType);
              $tag = $this->mergeStyleAttributes($tag);
              if (!isset($tags[$tagType])) $tags[$tagType] = 0;
              $isClosing = stripos($tag, '/'.$tagType.'>') !== false;

              if ($isClosing && $tags[$tagType] == 0) {
                  /* there is a close tag without any open tag, I delete it */
                  if ($this->debug) echo "<div style='color:#ff0000'>Found a closing tag <b>".htmlspecialchars($tag)."</b> at char $close without open tag: REMOVED</div>";
                  $ns = $contenuto; // keep the text that preceded the dropped tag
                  continue;
              }

              $j++;
              if ($isClosing) {
                  // closing tag: becomes a text-only node inside the innermost open node
                  $this->matrix[$j] = array(
                      "pre" => "",
                      "post" => $contenuto,
                      "parentTag" => array_pop($padri),
                      "tag" => "",
                      "tagType" => ""
                  );
                  $tags[$tagType]--;
                  continue;
              }

              $indexparentTag = end($padri);
              $this->matrix[$j] = array(
                  "pre" => $contenuto,
                  "post" => "",
                  "parentTag" => $indexparentTag,
                  "tag" => $tag,
                  "tagType" => $tagType
              );
              if (strpos($tag, "/>") === false) {
                  // open tag: following nodes become its children
                  $tags[$tagType]++;
                  array_push($padri, $j);
              }
          }

          $errorsCounter = $this->checkTree(); // errorsCounter contains the number of removed tags
          $this->fixedxhtml = $this->findSonsOf(0); // build html fixed
          if ($this->debug) {
              $this->fixedhtmlDisplayCode = $this->findSonsOfDisplayCode(0);
              echo "<table border=1 cellspacing=0 cellpadding=0>";
              echo "<tr><th>node id</th>";
              echo "<th>pre</th>";
              echo "<th>tag</th>";
              echo "<th>post</th>";
              echo "<th>parentTag</th>";
              echo "<th>tipo</th></tr>";
              for ($k = 0; $k <= $j; $k++) {
                  echo "<tr><td>$k</td>";
                  echo "<td>&nbsp;".htmlspecialchars($this->matrix[$k]["pre"])."</td>";
                  echo "<td>&nbsp;".htmlspecialchars($this->matrix[$k]["tag"])."</td>";
                  echo "<td>&nbsp;".htmlspecialchars($this->matrix[$k]["post"])."</td>";
                  echo "<td>&nbsp;".$this->matrix[$k]["parentTag"]."</td>";
                  echo "<td>&nbsp;<i>".$this->matrix[$k]["tagType"]."</i></td></tr>";
              }
              echo "</table>";
              echo "<hr/>{$j}<hr/>\n\n\n\n".$this->fixedhtmlDisplayCode;
          }
          return $errorsCounter;
      }
  }