PageRenderTime 37ms CodeModel.GetById 1ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/external/phpmailer/extras/htmlfilter.php

https://bitbucket.org/navigatecms/navigatecms
PHP | 1159 lines | 765 code | 41 blank | 353 comment | 184 complexity | 96e032c0bd341c651461fe6cf1e5670a MD5 | raw file
Possible License(s): GPL-2.0, MIT, LGPL-2.1, BSD-3-Clause, AGPL-3.0, Apache-2.0
  1. <?php
  2. /**
  3. * htmlfilter.inc
  4. * ---------------
  5. * This set of functions allows you to filter html in order to remove
  6. * any malicious tags from it. Useful in cases when you need to filter
  7. * user input for any cross-site-scripting attempts.
  8. *
  9. * Copyright (C) 2002-2004 by Duke University
  10. *
  11. * This library is free software; you can redistribute it and/or
  12. * modify it under the terms of the GNU Lesser General Public
  13. * License as published by the Free Software Foundation; either
  14. * version 2.1 of the License, or (at your option) any later version.
  15. *
  16. * This library is distributed in the hope that it will be useful,
  17. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  19. * Lesser General Public License for more details.
  20. *
  21. * You should have received a copy of the GNU Lesser General Public
  22. * License along with this library; if not, write to the Free Software
  23. * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
  24. * 02110-1301 USA
  25. *
  26. * @Author Konstantin Riabitsev <icon@linux.duke.edu>
  27. * @Author Jim Jagielski <jim@jaguNET.com / jimjag@gmail.com>
  28. * @Version 1.1 ($Date$)
  29. */
  /**
   * Builds the textual form of a tag from its name, attributes and type.
   * Called by tln_sanitize once a tag has been accepted.
   *
   * Attribute values are emitted verbatim, so they must already carry
   * their own quotes (e.g. '"value"').
   *
   * @param string $tagname the name of the tag.
   * @param array $attary map of attribute name => already-quoted value;
   * anything that is not a non-empty array is ignored.
   * @param integer $tagtype 1 = opening tag, 2 = closing tag,
   * 3 = self-closing (XHTML style) tag.
   * @return string A string with the final tag representation.
   */
  function tln_tagprint($tagname, $attary, $tagtype)
  {
      if ($tagtype == 2) {
          // Closing tags never carry attributes.
          return '</' . $tagname . '>';
      }
      $fulltag = '<' . $tagname;
      if (is_array($attary) && count($attary) > 0) {
          $atts = array();
          // foreach always walks the whole array; each() was removed in
          // PHP 8 and depended on the array's internal pointer.
          foreach ($attary as $attname => $attvalue) {
              $atts[] = "$attname=$attvalue";
          }
          $fulltag .= ' ' . implode(' ', $atts);
      }
      if ($tagtype == 3) {
          $fulltag .= ' /';
      }
      return $fulltag . '>';
  }
  /**
   * Lowercases a value in place. Intended as an array_walk callback.
   *
   * @param string $val the value to lowercase, passed by reference.
   * @return void the result is written back through the reference.
   */
  function tln_casenormalize(&$val)
  {
      $lowered = strtolower($val);
      $val = $lowered;
  }
  /**
   * Skips any whitespace starting at $offset and returns the position of
   * the next non-whitespace character.
   *
   * @param string $body the string
   * @param integer $offset the offset within the string where we should start
   * looking for the next non-whitespace character.
   * @return integer the location within $body of the next non-whitespace
   * char; $offset itself when no whitespace starts there, and strlen($body)
   * when only whitespace remains.
   */
  function tln_skipspace($body, $offset)
  {
      $matches = array();
      // The cast covers old PHP versions where substr() returns false for
      // an offset at/after the end of the string.
      $rest = (string) substr($body, $offset);
      // The previous sizeof() test was applied to a string: meaningless on
      // old PHP and a TypeError on PHP 8. The match result says it all.
      if (preg_match('/^\s+/', $rest, $matches)) {
          $offset += strlen($matches[0]);
      }
      return $offset;
  }
  /**
   * Finds the next occurrence of $needle in $body, starting at $offset.
   * A thin wrapper around strpos() that maps "not found" to the length
   * of the string instead of false.
   *
   * @param string $body The string to look for needle in.
   * @param integer $offset Start looking from this position.
   * @param string $needle The character/string to look for.
   * @return integer location of the next occurrence of the needle, or
   * strlen($body) if needle wasn't found.
   */
  function tln_findnxstr($body, $offset, $needle)
  {
      $found = strpos($body, $needle, $offset);
      return ($found === false) ? strlen($body) : $found;
  }
  /**
   * Looks for the first match of a PCRE fragment at or after $offset.
   *
   * The fragment is embedded in a pattern delimited by '%', so $reg must
   * not contain an unescaped '%'.
   *
   * @param string $body The string to look for needle in.
   * @param integer $offset Start looking from here.
   * @param string $reg A PCRE-style regex fragment to match.
   * @return array|boolean Returns false if nothing (or only an empty string)
   * matched, or an array with the following members:
   * - integer with the location of the match within $body
   * - string with whatever content between offset and the match
   * - string with whatever it is we matched
   */
  function tln_findnxreg($body, $offset, $reg)
  {
      $matches = array();
      $preg_rule = '%^(.*?)(' . $reg . ')%s';
      $found = preg_match($preg_rule, (string) substr($body, $offset), $matches);
      // Rely on preg_match's return value rather than the truthiness of the
      // matched text, which wrongly rejected a match consisting of "0".
      // An empty overall match is still treated as "not found".
      if ($found !== 1 || $matches[0] === '') {
          return false;
      }
      return array(
          $offset + strlen($matches[1]),
          $matches[1],
          $matches[2]
      );
  }
  /**
   * Locates and parses the next tag in $body at or after $offset.
   *
   * Relies on the sibling helpers tln_findnxstr, tln_skipspace and
   * tln_findnxreg.
   *
   * @param string $body String where to look for the next tag.
   * @param integer $offset Start looking from here.
   * @return array|boolean false if no more tags exist in the body, or
   * an array with the following members:
   * - string with the (lowercased) name of the tag
   * - array with (lowercased) attribute names and their quoted values,
   *   or false when the tag ended right after its name
   * - integer with tag type (1 = opening, 2 = closing, 3 = self-closing)
   * - integer where the tag starts (starting "<")
   * - integer where the tag ends (ending ">")
   * first three members will be false, if the tag is invalid, a comment
   * or an SGML declaration.
   */
  function tln_getnxtag($body, $offset)
  {
      $bodylen = strlen($body);
      if ($offset > $bodylen) {
          return false;
      }
      $lt = tln_findnxstr($body, $offset, '<');
      if ($lt == $bodylen) {
          return false;
      }
      /**
       * We are here:
       * blah blah <tag attribute="value">
       * \---------^
       */
      $pos = tln_skipspace($body, $lt + 1);
      if ($pos >= $bodylen) {
          return array(false, false, false, $lt, $bodylen);
      }
      /**
       * There are 3 kinds of tags:
       * 1. Opening tag, e.g.:
       * <a href="blah">
       * 2. Closing tag, e.g.:
       * </a>
       * 3. XHTML-style content-less tag, e.g.:
       * <img src="blah"/>
       */
      switch (substr($body, $pos, 1)) {
          case '/':
              $tagtype = 2;
              $pos++;
              break;
          case '!':
              /**
               * A comment or an SGML declaration. Both are reported as
               * invalid tags so that the caller strips them.
               */
              if (substr($body, $pos + 1, 2) == '--') {
                  $gt = strpos($body, '-->', $pos);
                  if ($gt === false) {
                      $gt = $bodylen;
                  } else {
                      // Point at the closing ">" of "-->".
                      $gt += 2;
                  }
                  return array(false, false, false, $lt, $gt);
              }
              $gt = tln_findnxstr($body, $pos, '>');
              return array(false, false, false, $lt, $gt);
          default:
              /**
               * Assume tagtype 1 for now. If it's type 3, we'll switch
               * values later.
               */
              $tagtype = 1;
              break;
      }
      /**
       * Look for next [\W-_], which will indicate the end of the tag name.
       */
      $regary = tln_findnxreg($body, $pos, '[^\w\-_]');
      if ($regary === false) {
          return array(false, false, false, $lt, $bodylen);
      }
      list($pos, $tagname, $match) = $regary;
      $tagname = strtolower($tagname);
      /**
       * $match can be either of these:
       * '>' indicating the end of the tag entirely.
       * '\s' indicating the end of the tag name.
       * '/' indicating that this is type-3 xhtml tag.
       *
       * Whatever else we find there indicates an invalid tag.
       */
      switch ($match) {
          case '/':
              /**
               * This is an xhtml-style tag with a closing / at the
               * end, like so: <img src="blah"/>. Check if it's followed
               * by the closing bracket. If not, then this tag is invalid
               */
              if (substr($body, $pos, 2) == '/>') {
                  $pos++;
                  $tagtype = 3;
              } else {
                  $gt = tln_findnxstr($body, $pos, '>');
                  return array(false, false, false, $lt, $gt);
              }
              // intentional fall-through
          case '>':
              return array($tagname, false, $tagtype, $lt, $pos);
          default:
              /**
               * Check if it's whitespace
               */
              if (!preg_match('/\s/', $match)) {
                  /**
                   * This is an invalid tag! Look for the next closing ">".
                   */
                  $gt = tln_findnxstr($body, $lt, '>');
                  return array(false, false, false, $lt, $gt);
              }
              break;
      }
      /**
       * At this point we're here:
       * <tagname attribute='blah'>
       * \-------^
       *
       * At this point we loop in order to find all attributes.
       */
      $attary = array();
      while ($pos <= $bodylen) {
          $pos = tln_skipspace($body, $pos);
          if ($pos == $bodylen) {
              /**
               * Non-closed tag.
               */
              return array(false, false, false, $lt, $pos);
          }
          /**
           * See if we arrived at a ">" or "/>", which means that we reached
           * the end of the tag.
           */
          $matches = array();
          if (preg_match('%^(\s*)(>|/>)%s', substr($body, $pos), $matches)) {
              $pos += strlen($matches[1]);
              if ($matches[2] == '/>') {
                  $tagtype = 3;
                  $pos++;
              }
              return array($tagname, $attary, $tagtype, $lt, $pos);
          }
          /**
           * There are several types of attributes, with optional
           * [:space:] between members.
           * Type 1:
           * attrname[:space:]=[:space:]'CDATA'
           * Type 2:
           * attrname[:space:]=[:space:]"CDATA"
           * Type 3:
           * attr[:space:]=[:space:]CDATA
           * Type 4:
           * attrname
           *
           * We leave types 1 and 2 the same, type 3 we check for
           * '"' and convert to "&quot;" if needed, then wrap in
           * double quotes. Type 4 we convert into:
           * attrname="yes".
           */
          $regary = tln_findnxreg($body, $pos, '[^\w\-_]');
          if ($regary === false) {
              /**
               * Looks like body ended before the end of tag.
               */
              return array(false, false, false, $lt, $bodylen);
          }
          list($pos, $attname, $match) = $regary;
          $attname = strtolower($attname);
          /**
           * We arrived at the end of attribute name. Several things possible
           * here:
           * '>' means the end of the tag and this is attribute type 4
           * '/' if followed by '>' means the same thing as above
           * '\s' means a lot of things -- look what it's followed by.
           * anything else means the attribute is invalid.
           */
          switch ($match) {
              case '/':
                  /**
                   * This is an xhtml-style tag with a closing / at the
                   * end, like so: <img src="blah"/>. Check if it's followed
                   * by the closing bracket. If not, then this tag is invalid
                   */
                  if (substr($body, $pos, 2) == '/>') {
                      $pos++;
                      $tagtype = 3;
                  } else {
                      $gt = tln_findnxstr($body, $pos, '>');
                      return array(false, false, false, $lt, $gt);
                  }
                  // intentional fall-through
              case '>':
                  $attary[$attname] = '"yes"';
                  return array($tagname, $attary, $tagtype, $lt, $pos);
              default:
                  /**
                   * Skip whitespace and see what we arrive at.
                   */
                  $pos = tln_skipspace($body, $pos);
                  $char = substr($body, $pos, 1);
                  /**
                   * Two things are valid here:
                   * '=' means this is attribute type 1 2 or 3.
                   * \w means this was attribute type 4.
                   * anything else we ignore and re-loop. End of tag and
                   * invalid stuff will be caught by our checks at the
                   * beginning of the loop.
                   */
                  if ($char == '=') {
                      $pos++;
                      $pos = tln_skipspace($body, $pos);
                      /**
                       * Here are 3 possibilities:
                       * "'" attribute type 1
                       * '"' attribute type 2
                       * everything else is the content of tag type 3
                       */
                      $quot = substr($body, $pos, 1);
                      if ($quot == '\'') {
                          $regary = tln_findnxreg($body, $pos + 1, '\'');
                          if ($regary === false) {
                              return array(false, false, false, $lt, $bodylen);
                          }
                          list($pos, $attval, $match) = $regary;
                          $pos++;
                          $attary[$attname] = '\'' . $attval . '\'';
                      } elseif ($quot == '"') {
                          $regary = tln_findnxreg($body, $pos + 1, '\"');
                          if ($regary === false) {
                              return array(false, false, false, $lt, $bodylen);
                          }
                          list($pos, $attval, $match) = $regary;
                          $pos++;
                          $attary[$attname] = '"' . $attval . '"';
                      } else {
                          /**
                           * Unquoted value. Look for \s, or >.
                           */
                          $regary = tln_findnxreg($body, $pos, '[\s>]');
                          if ($regary === false) {
                              return array(false, false, false, $lt, $bodylen);
                          }
                          list($pos, $attval, $match) = $regary;
                          /**
                           * If it's ">" it will be caught at the top.
                           */
                          $attval = preg_replace('/\"/s', '&quot;', $attval);
                          $attary[$attname] = '"' . $attval . '"';
                      }
                  } elseif (preg_match('|[\w/>]|', $char)) {
                      /**
                       * That was attribute type 4.
                       */
                      $attary[$attname] = '"yes"';
                  } else {
                      /**
                       * An illegal character. Find next '>' and return.
                       */
                      $gt = tln_findnxstr($body, $pos, '>');
                      return array(false, false, false, $lt, $gt);
                  }
                  break;
          }
      }
      /**
       * The fact that we got here indicates that the tag end was never
       * found. Return invalid tag indication so it gets stripped.
       */
      return array(false, false, false, $lt, $bodylen);
  }
  /**
   * Translates numeric entities/escapes into literal single-byte
   * characters so the value can be checked.
   *
   * Every full match of $regex is replaced by the byte whose code is taken
   * from the first capture group. Codes above 255 wrap around modulo 256,
   * which is what chr() does with out-of-range integers.
   *
   * @param string $attvalue the by-ref value to check.
   * @param string $regex the regular expression to check against; its
   * first capture group must yield the numeric code.
   * @param boolean $hex whether the captured codes are hexadecimal.
   * @return boolean True or False depending on whether there were matches.
   */
  function tln_deent(&$attvalue, $regex, $hex = false)
  {
      $matches = array();
      if (!preg_match_all($regex, $attvalue, $matches)) {
          return false;
      }
      $repl = array();
      foreach ($matches[0] as $i => $entity) {
          $numval = $matches[1][$i];
          if ($hex) {
              // Only the low byte matters; using the last two hex digits
              // avoids hexdec() returning a float for very long inputs.
              $code = hexdec(substr($numval, -2));
          } else {
              $code = ((int) $numval) & 255;
          }
          $repl[$entity] = chr($code);
      }
      $attvalue = strtr($attvalue, $repl);
      return true;
  }
  /**
   * Decodes entity-encoded and backslash-escaped characters in an
   * attribute value, in place, so that later checks see the literal text.
   *
   * Decoding is repeated via tln_deent until no pass finds anything more,
   * then remaining backslashes are removed with stripslashes().
   *
   * @param string $attvalue A string to run entity check against (by-ref).
   * @return void the value is modified through the reference.
   */
  function tln_defang(&$attvalue)
  {
      /**
       * Nothing to do unless there are ampersands or backslashes.
       */
      $has_amp = strpos($attvalue, '&') !== false;
      $has_bslash = strpos($attvalue, '\\') !== false;
      if (!$has_amp && !$has_bslash) {
          return;
      }
      do {
          // Short-circuits: a later decoder only runs when the earlier
          // ones found nothing during this pass.
          $changed = tln_deent($attvalue, '/\&#0*(\d+);*/s')
              || tln_deent($attvalue, '/\&#x0*((\d|[a-f])+);*/si', true)
              || tln_deent($attvalue, '/\\\\(\d+)/s', true);
      } while ($changed);
      $attvalue = stripslashes($attvalue);
  }
  /**
   * Removes every tab, carriage return, newline, NUL byte and space from
   * the value, in place. This stops values such as "java[tab]script" from
   * slipping past the later pattern checks.
   *
   * @param string $attvalue The attribute value before extraneous spaces removed.
   * @return void the value is modified through the reference.
   */
  function tln_unspace(&$attvalue)
  {
      // Leave the value untouched when none of the characters occur.
      if (strcspn($attvalue, "\t\r\n\0 ") == strlen($attvalue)) {
          return;
      }
      $unwanted = array("\t", "\r", "\n", "\0", " ");
      $attvalue = str_replace($unwanted, '', $attvalue);
  }
  /**
   * Runs the attribute-level checks on one tag: drops forbidden
   * attributes, rewrites dangerous values and appends forced attributes.
   *
   * @param string $tagname String with the name of the tag.
   * @param array $attary Array with all tag attributes (name => quoted value).
   * @param array $rm_attnames map of tag-name regex => list of
   * attribute-name regexes; matching attributes are removed.
   * @param array $bad_attvals map of tag-name regex => map of
   * attribute-name regex => array(list of patterns, list of replacements)
   * applied to the value with preg_replace.
   * @param array $add_attr_to_tag map of tag-name regex => attributes that
   * are merged into the result for matching tags.
   * @param string $trans_image_path placeholder URL, handed to tln_fixurl.
   * @param boolean $block_external_images handed to tln_fixurl.
   * @return array with modified attributes.
   */
  function tln_fixatts(
      $tagname,
      $attary,
      $rm_attnames,
      $bad_attvals,
      $add_attr_to_tag,
      $trans_image_path,
      $block_external_images
  ) {
      // foreach iterates over a snapshot, so editing $attary inside the
      // loop is safe. (each() no longer exists in PHP 8.)
      foreach ($attary as $attname => $attvalue) {
          /**
           * See if this attribute should be removed.
           */
          foreach ($rm_attnames as $matchtag => $matchattrs) {
              if (preg_match($matchtag, $tagname)) {
                  foreach ($matchattrs as $matchattr) {
                      if (preg_match($matchattr, $attname)) {
                          unset($attary[$attname]);
                          // Move on to the next attribute. A plain
                          // "continue" only left the innermost loop, so the
                          // checks below could re-add the removed attribute.
                          continue 3;
                      }
                  }
              }
          }
          /**
           * Remove any backslashes, entities, or extraneous whitespace.
           */
          $oldattvalue = $attvalue;
          tln_defang($attvalue);
          // A style value that needed decoding at all is treated as hostile.
          if ($attname === 'style' && $attvalue !== $oldattvalue) {
              $attvalue = "idiocy";
              $attary[$attname] = $attvalue;
          }
          tln_unspace($attvalue);
          /**
           * Now run the value checks. The decoded, whitespace-free value is
           * only written back when one of the patterns actually changed it.
           */
          foreach ($bad_attvals as $matchtag => $matchattrs) {
              if (preg_match($matchtag, $tagname)) {
                  foreach ($matchattrs as $matchattr => $valary) {
                      if (preg_match($matchattr, $attname)) {
                          /**
                           * There are two arrays in valary.
                           * First is matches.
                           * Second one is replacements
                           */
                          list($valmatch, $valrepl) = $valary;
                          $newvalue = preg_replace($valmatch, $valrepl, $attvalue);
                          if ($newvalue != $attvalue) {
                              $attary[$attname] = $newvalue;
                              $attvalue = $newvalue;
                          }
                      }
                  }
              }
          }
          if ($attname === 'style') {
              if (preg_match('/[\0-\37\200-\377]+/', $attvalue)) {
                  $attary[$attname] = '"disallowed character"';
              }
              preg_match_all("/url\s*\((.+)\)/si", $attvalue, $aMatch);
              if (count($aMatch)) {
                  foreach ($aMatch[1] as $sMatch) {
                      $urlvalue = $sMatch;
                      tln_fixurl($attname, $urlvalue, $trans_image_path, $block_external_images);
                      $attary[$attname] = str_replace($sMatch, $urlvalue, $attvalue);
                  }
              }
          }
      }
      /**
       * See if we need to append any attributes to this tag.
       */
      foreach ($add_attr_to_tag as $matchtag => $addattary) {
          if (preg_match($matchtag, $tagname)) {
              $attary = array_merge($attary, $addattary);
          }
      }
      return $attary;
  }
  /**
   * Validates a URL value in place, replacing anything that is not
   * explicitly allowed with the placeholder image path.
   *
   * One pair of surrounding quotes is stripped first; replacement values
   * are re-wrapped in that same quote character (double quote when the
   * input had none). Values that are accepted without being rewritten are
   * left with their quotes stripped.
   *
   * @param string $attname name of the attribute the URL belongs to; 'href'
   * is allowed to point at external http/https/ftp/mailto targets.
   * @param string $attvalue the URL, passed by reference and rewritten.
   * @param string $trans_image_path placeholder URL used for rejected values.
   * @param boolean $block_external_images when true, external URLs in
   * non-href attributes are replaced by the placeholder.
   * @return void the value is modified through the reference.
   */
  function tln_fixurl($attname, &$attvalue, $trans_image_path, $block_external_images)
  {
      $sQuote = '"';
      $attvalue = trim($attvalue);
      if ($attvalue && ($attvalue[0] == '"' || $attvalue[0] == "'")) {
          // remember and remove the surrounding quotes
          $sQuote = $attvalue[0];
          $attvalue = trim(substr($attvalue, 1, -1));
      }
      $placeholder = $sQuote . $trans_image_path . $sQuote;
      /**
       * Replace empty src tags with the blank image. src is only used
       * for frames, images, and image inputs. Doing a replace should
       * not affect them working as should be, however it will stop
       * IE from being kicked off when src for img tags are not set
       */
      if ($attvalue == '') {
          $attvalue = $placeholder;
          return;
      }
      // first, disallow 8 bit characters and control characters
      if (preg_match('/[\0-\37\200-\377]+/', $attvalue)) {
          if ($attname == 'href') {
              $attvalue = $sQuote . 'http://invalid-stuff-detected.example.com' . $sQuote;
          } else {
              $attvalue = $placeholder;
          }
          return;
      }
      $aUrl = parse_url($attvalue);
      if (!isset($aUrl['scheme'])) {
          // No scheme: only the placeholder path itself is acceptable.
          // This used to assign to a variable-variable ($$attvalue), so the
          // value was never actually replaced.
          if (!isset($aUrl['path']) || $aUrl['path'] != $trans_image_path) {
              $attvalue = $placeholder;
          }
          return;
      }
      switch (strtolower($aUrl['scheme'])) {
          case 'mailto':
          case 'http':
          case 'https':
          case 'ftp':
              if ($attname != 'href') {
                  if ($block_external_images == true || !isset($aUrl['path'])) {
                      $attvalue = $placeholder;
                  }
              } else {
                  $attvalue = $sQuote . $attvalue . $sQuote;
              }
              break;
          case 'outbind':
          case 'cid':
              $attvalue = $sQuote . $attvalue . $sQuote;
              break;
          default:
              // Any other scheme (javascript:, data:, ...) is rejected.
              $attvalue = $placeholder;
              break;
      }
  }
  /**
   * Extracts and sanitizes the content of a <style> block.
   *
   * Scans $body from $pos up to the matching </style> tag (HTML comments
   * are skipped, so a "</style>" inside a comment does not end the block),
   * then neutralizes dangerous CSS constructs in the collected content.
   *
   * @param string $body the full HTML being filtered.
   * @param integer $pos offset just after the opening <style ...> tag.
   * @param string $trans_image_path placeholder URL, handed to tln_fixurl.
   * @param boolean $block_external_images handed to tln_fixurl.
   * @return array two members: the sanitized style content (false when no
   * closing </style> was found) and the offset at which parsing should
   * resume (strlen($body) when no closing tag was found).
   */
  function tln_fixstyle($body, $pos, $trans_image_path, $block_external_images)
  {
      // workaround for </style> in between comments
      $content = '';
      $sToken = '';
      $bSucces = false;
      $bEndTag = false;
      $newpos = strlen($body);
      for ($i = $pos, $iCount = strlen($body); $i < $iCount; ++$i) {
          $char = $body[$i];
          switch ($char) {
              case '<':
                  $sToken = $char;
                  break;
              case '/':
                  if ($sToken == '<') {
                      $sToken .= $char;
                      $bEndTag = true;
                  } else {
                      $content .= $char;
                  }
                  break;
              case '>':
                  if ($bEndTag) {
                      $sToken .= $char;
                      if (preg_match('/\<\/\s*style\s*\>/i', $sToken, $aMatch)) {
                          $newpos = $i + 1;
                          $bSucces = true;
                          break 2;
                      } else {
                          $content .= $sToken;
                      }
                      $bEndTag = false;
                  } else {
                      $content .= $char;
                  }
                  break;
              case '!':
                  if ($sToken == '<') {
                      // possible comment
                      if (isset($body[$i + 2]) && substr($body, $i, 3) == '!--') {
                          $i = strpos($body, '-->', $i + 3);
                          if ($i === false) { // no end comment
                              $i = strlen($body);
                          }
                          $sToken = '';
                      }
                  } else {
                      $content .= $char;
                  }
                  break;
              default:
                  if ($bEndTag) {
                      $sToken .= $char;
                  } else {
                      $content .= $char;
                  }
                  break;
          }
      }
      if ($bSucces == false) {
          return array(false, strlen($body));
      }
      /**
       * First look for general BODY style declaration, which would be
       * like so:
       * body {background: blah-blah}
       * and change it to .bodyclass so we can just assign it to a <div>
       */
      $content = preg_replace("|body(\s*\{.*?\})|si", ".bodyclass\\1", $content);
      // first check for 8bit sequences and disallowed control characters
      if (preg_match('/[\16-\37\200-\377]+/', $content)) {
          $content = '<!-- style block removed by html filter due to presence of 8bit characters -->';
          return array($content, $newpos);
      }
      // remove @import line
      $content = preg_replace("/^\s*(@import.*)$/mi", "\n<!-- @import rules forbidden -->\n", $content);
      // normalize backslash-obfuscated spellings of "url"
      $content = preg_replace("/(\\\\)?u(\\\\)?r(\\\\)?l(\\\\)?/i", 'url', $content);
      /**
       * Fix url('blah') declarations.
       */
      preg_match_all("/url\s*\((.+)\)/si", $content, $aMatch);
      if (count($aMatch)) {
          $aValue = $aReplace = array();
          foreach ($aMatch[1] as $sMatch) {
              // url value
              $urlvalue = $sMatch;
              tln_fixurl('style', $urlvalue, $trans_image_path, $block_external_images);
              $aValue[] = $sMatch;
              $aReplace[] = $urlvalue;
          }
          $content = str_replace($aValue, $aReplace, $content);
      }
      /**
       * Remove any backslashes, entities, and extraneous whitespace from a
       * copy, and test that copy for forbidden keywords. The stripped copy
       * only replaces the real content when something had to be changed.
       */
      $contentTemp = $content;
      tln_defang($contentTemp);
      tln_unspace($contentTemp);
      $match = array('/\/\*.*\*\//',
          '/expression/i',
          '/behaviou*r/i',
          '/binding/i',
          '/include-source/i',
          '/javascript/i',
          '/script/i',
          '/position/i');
      $replace = array('', 'idiocy', 'idiocy', 'idiocy', 'idiocy', 'idiocy', 'idiocy', '');
      $contentNew = preg_replace($match, $replace, $contentTemp);
      if ($contentNew !== $contentTemp) {
          $content = $contentNew;
      }
      return array($content, $newpos);
  }
  /**
   * Converts the attributes of a <body> tag into attributes for the <div>
   * that replaces it.
   *
   * The div always gets class 'bodyclass'. The body's background, bgcolor
   * and text attributes are turned into an inline style; a background
   * image is always replaced by the placeholder image.
   *
   * @param array $attary attributes of the body tag (name => quoted value).
   * @param string $trans_image_path placeholder URL used as background image.
   * @return array attributes for the replacement div (name => quoted value).
   */
  function tln_body2div($attary, $trans_image_path)
  {
      $divattary = array('class' => "'bodyclass'");
      $text = '#000000';
      $has_bgc_stl = $has_txt_stl = false;
      $styledef = '';
      if (is_array($attary) && count($attary) > 0) {
          foreach ($attary as $attname => $attvalue) {
              // Strip the quote character the value is wrapped in.
              $quotchar = substr($attvalue, 0, 1);
              $attvalue = str_replace($quotchar, "", $attvalue);
              // Cast so that numeric attribute names (integer array keys)
              // are never loosely equal to one of the names below.
              switch ((string) $attname) {
                  case 'background':
                      $styledef .= "background-image: url('$trans_image_path'); ";
                      break;
                  case 'bgcolor':
                      $has_bgc_stl = true;
                      $styledef .= "background-color: $attvalue; ";
                      break;
                  case 'text':
                      $has_txt_stl = true;
                      $styledef .= "color: $attvalue; ";
                      break;
              }
          }
          // Outlook defines a white bgcolor and no text color. This can lead to
          // white text on a white bg with certain themes.
          if ($has_bgc_stl && !$has_txt_stl) {
              $styledef .= "color: $text; ";
          }
          if (strlen($styledef) > 0) {
              $divattary['style'] = "\"$styledef\"";
          }
      }
      return $divattary;
  }
  /**
   * Filters an HTML string: strips or keeps tags according to the given
   * lists, sanitizes attributes and <style> blocks, and converts <body>
   * into a <div>.
   *
   * @param string $body The HTML you wish to filter
   * @param array $tag_list first member is a flag, the rest are tag names:
   * false means "remove the listed tags", true means "allow only the
   * listed tags".
   * @param array $rm_tags_with_content tags that are removed together with
   * everything up to their closing tag.
   * @param array $self_closing_tags tags that are always printed in
   * self-closing form.
   * @param boolean $force_tag_closing when true, closing tags are appended
   * for every tag still open at the end.
   * @param array $rm_attnames passed to tln_fixatts.
   * @param array $bad_attvals passed to tln_fixatts.
   * @param array $add_attr_to_tag passed to tln_fixatts.
   * @param string $trans_image_path placeholder URL for rejected images.
   * @param boolean $block_external_images passed to the URL checks.
   * @return string Sanitized html, wrapped in begin/end marker comments.
   */
  function tln_sanitize(
      $body,
      $tag_list,
      $rm_tags_with_content,
      $self_closing_tags,
      $force_tag_closing,
      $rm_attnames,
      $bad_attvals,
      $add_attr_to_tag,
      $trans_image_path,
      $block_external_images
  ) {
      /**
       * Split off the remove/allow flag, then normalize the tag lists to
       * lowercase.
       * false means remove these tags
       * true means allow these tags
       */
      $rm_tags = array_shift($tag_list);
      if (is_array($tag_list)) {
          array_walk($tag_list, 'tln_casenormalize');
      }
      if (is_array($rm_tags_with_content)) {
          array_walk($rm_tags_with_content, 'tln_casenormalize');
      }
      if (is_array($self_closing_tags)) {
          array_walk($self_closing_tags, 'tln_casenormalize');
      }
      $curpos = 0;
      $open_tags = array();
      $trusted = "<!-- begin tln_sanitized html -->\n";
      // false, or the name of the tag whose content is being skipped.
      $skip_content = false;
      /**
       * Take care of netscape's javascript entities like
       * &{alert('boo')};
       */
      $body = preg_replace('/&(\{.*?\};)/si', '&amp;\\1', $body);
      while (($curtag = tln_getnxtag($body, $curpos)) != false) {
          list($tagname, $attary, $tagtype, $lt, $gt) = $curtag;
          $free_content = substr($body, $curpos, $lt - $curpos);
          /**
           * Take care of <style>
           * NOTE(review): $free_content is overwritten here, so text
           * between the previous tag and <style> is not emitted, and the
           * block is handled even while $skip_content is active. Kept
           * as-is; confirm whether that is intended.
           */
          if ($tagname == "style" && $tagtype == 1) {
              list($free_content, $curpos) =
                  tln_fixstyle($body, $gt + 1, $trans_image_path, $block_external_images);
              if ($free_content != false) {
                  if (!empty($attary)) {
                      $attary = tln_fixatts(
                          $tagname,
                          $attary,
                          $rm_attnames,
                          $bad_attvals,
                          $add_attr_to_tag,
                          $trans_image_path,
                          $block_external_images
                      );
                  }
                  $trusted .= tln_tagprint($tagname, $attary, $tagtype);
                  $trusted .= $free_content;
                  $trusted .= tln_tagprint($tagname, null, 2);
              }
              continue;
          }
          if ($skip_content == false) {
              $trusted .= $free_content;
          }
          if ($tagname != false) {
              if ($tagtype == 2) {
                  if ($skip_content == $tagname) {
                      /**
                       * Got to the end of tag we needed to remove.
                       */
                      $tagname = false;
                      $skip_content = false;
                  } else {
                      if ($skip_content == false) {
                          if ($tagname == "body") {
                              $tagname = "div";
                          }
                          // Only print a closing tag that has a matching
                          // opening tag on record.
                          if (isset($open_tags[$tagname])
                              && $open_tags[$tagname] > 0
                          ) {
                              $open_tags[$tagname]--;
                          } else {
                              $tagname = false;
                          }
                      }
                  }
              } else {
                  if ($skip_content == false) {
                      /**
                       * See if this is a self-closing type and change
                       * tagtype appropriately.
                       */
                      if ($tagtype == 1
                          && in_array($tagname, $self_closing_tags)
                      ) {
                          $tagtype = 3;
                      }
                      /**
                       * See if we should skip this tag and any content
                       * inside it.
                       */
                      if ($tagtype == 1
                          && in_array($tagname, $rm_tags_with_content)
                      ) {
                          $skip_content = $tagname;
                      } else {
                          if (($rm_tags == false
                              && in_array($tagname, $tag_list))
                              || ($rm_tags == true
                              && !in_array($tagname, $tag_list))
                          ) {
                              $tagname = false;
                          } else {
                              /**
                               * Convert body into div.
                               */
                              if ($tagname == "body") {
                                  $tagname = "div";
                                  $attary = tln_body2div($attary, $trans_image_path);
                              }
                              if ($tagtype == 1) {
                                  if (isset($open_tags[$tagname])) {
                                      $open_tags[$tagname]++;
                                  } else {
                                      $open_tags[$tagname] = 1;
                                  }
                              }
                              /**
                               * This is where we run other checks.
                               */
                              if (is_array($attary) && count($attary) > 0) {
                                  $attary = tln_fixatts(
                                      $tagname,
                                      $attary,
                                      $rm_attnames,
                                      $bad_attvals,
                                      $add_attr_to_tag,
                                      $trans_image_path,
                                      $block_external_images
                                  );
                              }
                          }
                      }
                  }
              }
              if ($tagname != false && $skip_content == false) {
                  $trusted .= tln_tagprint($tagname, $attary, $tagtype);
              }
          }
          $curpos = $gt + 1;
      }
      // Whatever follows the last tag.
      $trusted .= substr($body, $curpos, strlen($body) - $curpos);
      if ($force_tag_closing == true) {
          foreach ($open_tags as $tagname => $opentimes) {
              while ($opentimes > 0) {
                  $trusted .= '</' . $tagname . '>';
                  $opentimes--;
              }
          }
          $trusted .= "\n";
      }
      $trusted .= "<!-- end tln_sanitized html -->\n";
      return $trusted;
  }
  /**
   * Convenience entry point: filters HTML through tln_sanitize using a
   * built-in rule set.
   *
   * The rule set removes a fixed list of tags, removes script-like tags
   * together with their content, removes on*, dynsrc, data* and lowsrc*
   * attributes, rewrites script-like URLs and CSS, and adds
   * target="_blank" to links.
   *
   * @param string $body the HTML to filter.
   * @param string $trans_image_path placeholder URL substituted for
   * rejected image/URL values.
   * @param boolean $block_external_images when true, http(s) URLs in
   * src/background attributes and in style url() values are replaced by
   * the placeholder as well.
   * @return string the sanitized HTML.
   */
  function HTMLFilter($body, $trans_image_path, $block_external_images = false)
  {
      // First member false: the listed tags are removed.
      $tag_list = array(
          false,
          "object",
          "meta",
          "html",
          "head",
          "base",
          "link",
          "frame",
          "iframe",
          "plaintext",
          "marquee"
      );
      $rm_tags_with_content = array(
          "script",
          "applet",
          "embed",
          "title",
          "frameset",
          "xmp",
          "xml"
      );
      $self_closing_tags = array(
          "img",
          "br",
          "hr",
          "input",
          "outbind"
      );
      $force_tag_closing = true;
      $rm_attnames = array(
          "/.*/" =>
              array(
                  // "/target/i",
                  "/^on.*/i",
                  "/^dynsrc/i",
                  "/^data.*/i",
                  "/^lowsrc.*/i"
              )
      );
      $bad_attvals = array(
          "/.*/" =>
              array(
                  "/^src|background/i" =>
                      array(
                          array(
                              '/^([\'"])\s*\S+script\s*:.*([\'"])/si',
                              '/^([\'"])\s*mocha\s*:*.*([\'"])/si',
                              '/^([\'"])\s*about\s*:.*([\'"])/si'
                          ),
                          array(
                              "\\1$trans_image_path\\2",
                              "\\1$trans_image_path\\2",
                              "\\1$trans_image_path\\2"
                          )
                      ),
                  "/^href|action/i" =>
                      array(
                          array(
                              '/^([\'"])\s*\S+script\s*:.*([\'"])/si',
                              '/^([\'"])\s*mocha\s*:*.*([\'"])/si',
                              '/^([\'"])\s*about\s*:.*([\'"])/si'
                          ),
                          array(
                              "\\1#\\1",
                              "\\1#\\1",
                              "\\1#\\1"
                          )
                      ),
                  "/^style/i" =>
                      array(
                          array(
                              "/\/\*.*\*\//",
                              "/expression/i",
                              "/binding/i",
                              "/behaviou*r/i",
                              "/include-source/i",
                              '/position\s*:/i',
                              '/(\\\\)?u(\\\\)?r(\\\\)?l(\\\\)?/i',
                              '/url\s*\(\s*([\'"])\s*\S+script\s*:.*([\'"])\s*\)/si',
                              '/url\s*\(\s*([\'"])\s*mocha\s*:.*([\'"])\s*\)/si',
                              '/url\s*\(\s*([\'"])\s*about\s*:.*([\'"])\s*\)/si',
                              '/(.*)\s*:\s*url\s*\(\s*([\'"]*)\s*\S+script\s*:.*([\'"]*)\s*\)/si'
                          ),
                          array(
                              "",
                              "idiocy",
                              "idiocy",
                              "idiocy",
                              "idiocy",
                              "idiocy",
                              "url",
                              "url(\\1#\\1)",
                              "url(\\1#\\1)",
                              "url(\\1#\\1)",
                              "\\1:url(\\2#\\3)"
                          )
                      )
              )
      );
      if ($block_external_images) {
          // Append one extra pattern/replacement pair to each rule so that
          // external http(s) images are swapped for the placeholder.
          array_push(
              $bad_attvals['/.*/']['/^src|background/i'][0],
              '/^([\'\"])\s*https*:.*([\'\"])/si'
          );
          array_push(
              $bad_attvals['/.*/']['/^src|background/i'][1],
              "\\1$trans_image_path\\1"
          );
          array_push(
              $bad_attvals['/.*/']['/^style/i'][0],
              '/url\(([\'\"])\s*https*:.*([\'\"])\)/si'
          );
          array_push(
              $bad_attvals['/.*/']['/^style/i'][1],
              "url(\\1$trans_image_path\\1)"
          );
      }
      $add_attr_to_tag = array(
          "/^a$/i" =>
              array('target' => '"_blank"')
      );
      $trusted = tln_sanitize(
          $body,
          $tag_list,
          $rm_tags_with_content,
          $self_closing_tags,
          $force_tag_closing,
          $rm_attnames,
          $bad_attvals,
          $add_attr_to_tag,
          $trans_image_path,
          $block_external_images
      );
      return $trusted;
  }