PageRenderTime 49ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/system/classes/inputfilter.php

https://github.com/HabariMag/habarimag-old
PHP | 487 lines | 321 code | 45 blank | 121 comment | 59 complexity | d35920e60f1266583f042b2629b06998 MD5 | raw file
Possible License(s): Apache-2.0
  1. <?php
  2. /**
  3. * @package Habari
  4. *
  5. */
  6. /**
  7. * Input filtering functions.
  8. *
  9. */
  class InputFilter
  {
      /**
       * Allowed elements.
       *
       * Element names are stored in lower case; callers lower-case the element
       * name before looking it up.
       */
      private static $whitelist_elements = array(
          // http://www.w3.org/TR/html4/struct/global.html#h-7.5.4
          'div', 'span',
          // http://www.w3.org/TR/html4/struct/links.html#h-12.2
          'a',
          // http://www.w3.org/TR/html4/struct/text.html#h-9.2.1
          'strong', 'em', 'code', 'kbd', 'dfn', 'samp', 'var', 'cite', 'abbr', 'acronym',
          // http://www.w3.org/TR/html4/struct/text.html#h-9.2.2
          'blockquote', 'q',
          // http://www.w3.org/TR/html4/struct/text.html#h-9.2.3
          'sub', 'sup',
          // http://www.w3.org/TR/html4/struct/text.html#h-9.3.1
          'p',
          // http://www.w3.org/TR/html4/struct/text.html#h-9.3.2.1
          'br',
          // http://www.w3.org/TR/html4/struct/text.html#h-9.3.4
          'pre',
          // http://www.w3.org/TR/html4/struct/text.html#h-9.4
          'ins', 'del',
          // http://www.w3.org/TR/html4/struct/lists.html#h-10.2
          'ol', 'ul', 'li',
          // http://www.w3.org/TR/html4/struct/lists.html#h-10.3
          'dl', 'dt', 'dd',
          // http://www.w3.org/TR/html4/present/graphics.html#h-15.2.1
          'b', 'i', 'u', 's', 'tt', 'big', 'small',
          // http://www.w3.org/TR/html4/struct/global.html#h-7.5.5
          'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
          // http://www.w3.org/TR/html4/struct/global.html#h-7.5.6
          'address',
          // http://www.w3.org/TR/html4/struct/dirlang.html#h-8.2.4
          'bdo',
          // http://www.w3.org/TR/html4/struct/tables.html#h-11.2.1
          'table',
          // http://www.w3.org/TR/html4/struct/tables.html#h-11.2.2
          'caption',
          // http://www.w3.org/TR/html4/struct/tables.html#h-11.2.3
          'thead', 'tfoot', 'tbody',
          // http://www.w3.org/TR/html4/struct/tables.html#h-11.2.4
          'colgroup', 'col',
          // http://www.w3.org/TR/html4/struct/tables.html#h-11.2.5
          'tr',
          // http://www.w3.org/TR/html4/struct/tables.html#h-11.2.6
          'th', 'td',
          // http://www.w3.org/TR/html4/struct/objects.html#h-13.2
          'img',
          // http://www.w3.org/TR/html4/struct/objects.html#h-13.6.1
          'map', 'area',
          // http://www.w3.org/TR/html4/present/graphics.html#h-15.3
          'hr',
          // http://www.w3.org/TR/html4/present/frames.html#h-16.2.1
          'frameset',
          // http://www.w3.org/TR/html4/present/frames.html#h-16.2.2
          'frame',
          // http://www.w3.org/TR/html4/present/frames.html#h-16.4.1
          'noframes',
          // http://www.w3.org/TR/html4/present/frames.html#h-16.5
          'iframe',
      );

      /**
       * Allowed attributes and values.
       *
       * Maps element name => ( attribute name => value type ). A value type is
       * either an array of literally allowed values or one of the type names
       * understood by check_attr_value().
       */
      private static $whitelist_attributes = array(
          // attributes that are valid for ALL elements (a subset of coreattrs)
          // elements that only take coreattrs don't need to be listed separately
          '*' => array(
              'lang' => 'language-code',
              'xml:lang' => 'language-code', // this is our xhtml support... all of it
              'dir' => array( 'ltr', 'rtl', ),
              'title' => 'text',
          ),
          // http://www.w3.org/TR/html4/struct/links.html#h-12.2
          'a' => array( 'href' => 'uri', ),
          // http://www.w3.org/TR/html4/struct/text.html#h-9.4
          'ins' => array( 'cite' => 'uri', 'datetime' => 'datetime', ),
          'del' => array( 'cite' => 'uri', 'datetime' => 'datetime', ),
          // http://www.w3.org/TR/html4/struct/text.html#h-9.2.2
          'blockquote' => array( 'cite' => 'uri', ),
          'q' => array( 'cite' => 'uri', ),
          'img' => array( 'src' => 'uri', 'alt' => 'text' ),
      );

      /**
       * #EMPTY elements.
       */
      private static $elements_empty = array(
          'img',
      );

      /**
       * Protocols that are ok for use in URIs (lower case).
       */
      private static $whitelist_protocols = array(
          'http', 'https', 'ftp', 'mailto', 'irc', 'news', 'nntp', 'callto', 'rtsp', 'mms', 'svn',
      );

      /**
       * List of all defined named character entities in HTML 4.01 and XHTML.
       */
      private static $character_entities = array(
          'nbsp', 'iexcl', 'cent', 'pound', 'curren', 'yen', 'brvbar', 'sect', 'uml',
          'copy', 'ordf', 'laquo', 'not', 'shy', 'reg', 'macr', 'deg', 'plusmn',
          'sup2', 'sup3', 'acute', 'micro', 'para', 'middot', 'cedil', 'sup1', 'ordm',
          'raquo', 'frac14', 'frac12', 'frac34', 'iquest', 'Agrave', 'Aacute', 'Acirc',
          'Atilde', 'Auml', 'Aring', 'AElig', 'Ccedil', 'Egrave', 'Eacute', 'Ecirc',
          'Euml', 'Igrave', 'Iacute', 'Icirc', 'Iuml', 'ETH', 'Ntilde', 'Ograve',
          'Oacute', 'Ocirc', 'Otilde', 'Ouml', 'times', 'Oslash', 'Ugrave', 'Uacute',
          'Ucirc', 'Uuml', 'Yacute', 'THORN', 'szlig', 'agrave', 'aacute', 'acirc',
          'atilde', 'auml', 'aring', 'aelig', 'ccedil', 'egrave', 'eacute', 'ecirc',
          'euml', 'igrave', 'iacute', 'icirc', 'iuml', 'eth', 'ntilde', 'ograve',
          'oacute', 'ocirc', 'otilde', 'ouml', 'divide', 'oslash', 'ugrave', 'uacute',
          'ucirc', 'uuml', 'yacute', 'thorn', 'yuml', 'fnof', 'Alpha', 'Beta', 'Gamma',
          'Delta', 'Epsilon', 'Zeta', 'Eta', 'Theta', 'Iota', 'Kappa', 'Lambda', 'Mu',
          'Nu', 'Xi', 'Omicron', 'Pi', 'Rho', 'Sigma', 'Tau', 'Upsilon', 'Phi', 'Chi',
          'Psi', 'Omega', 'alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta',
          'theta', 'iota', 'kappa', 'lambda', 'mu', 'nu', 'xi', 'omicron', 'pi', 'rho',
          'sigmaf', 'sigma', 'tau', 'upsilon', 'phi', 'chi', 'psi', 'omega',
          'thetasym', 'upsih', 'piv', 'bull', 'hellip', 'prime', 'Prime', 'oline',
          'frasl', 'weierp', 'image', 'real', 'trade', 'alefsym', 'larr', 'uarr',
          'rarr', 'darr', 'harr', 'crarr', 'lArr', 'uArr', 'rArr', 'dArr', 'hArr',
          'forall', 'part', 'exist', 'empty', 'nabla', 'isin', 'notin', 'ni', 'prod',
          'sum', 'minus', 'lowast', 'radic', 'prop', 'infin', 'ang', 'and', 'or',
          'cap', 'cup', 'int', 'there4', 'sim', 'cong', 'asymp', 'ne', 'equiv', 'le',
          'ge', 'sub', 'sup', 'nsub', 'sube', 'supe', 'oplus', 'otimes', 'perp',
          'sdot', 'lceil', 'rceil', 'lfloor', 'rfloor', 'lang', 'rang', 'loz',
          'spades', 'clubs', 'hearts', 'diams', 'quot', 'amp', 'lt', 'gt', 'OElig',
          'oelig', 'Scaron', 'scaron', 'Yuml', 'circ', 'tilde', 'ensp', 'emsp',
          'thinsp', 'zwnj', 'zwj', 'lrm', 'rlm', 'ndash', 'mdash', 'lsquo', 'rsquo',
          'sbquo', 'ldquo', 'rdquo', 'bdquo', 'dagger', 'Dagger', 'permil', 'lsaquo',
          'rsaquo', 'euro',
      );

      /**
       * Lazily built, fully anchored regular expression matching exactly one
       * known entity name. Built on first use by _validate_entity().
       */
      private static $character_entities_re = '';

      /**
       * Perform all filtering, return new string.
       *
       * The individual filters are applied repeatedly until the string no
       * longer changes, so that markup revealed by one pass is seen by the next.
       *
       * @param string $str Input string.
       * @return string Filtered output string; '' when MultiByte::valid_data() rejects the input.
       */
      public static function filter( $str )
      {
          if ( !MultiByte::valid_data( $str ) ) {
              return '';
          }

          do {
              $_str = $str;
              $str = self::strip_nulls( $str );
              $str = self::strip_illegal_entities( $str );
              $str = self::filter_html_elements( $str );
              // strict comparison: loose != treats numeric-looking strings
              // such as "1e3" and "1000" as equal
          } while ( $str !== $_str );

          return $str;
      }

      /**
       * Remove NUL bytes from a string.
       *
       * @param string $str Input string.
       * @return string The input without any NUL (0x00) bytes.
       */
      public static function strip_nulls( $str )
      {
          // double quotes are required: '\0' in single quotes is the two
          // characters backslash and zero, not a NUL byte
          return str_replace( "\0", '', $str );
      }

      /**
       * Callback function for strip_illegal_entities, do not use.
       *
       * Valid entity references have the form
       *   &name;  where name is one of the known HTML 4.01 / XHTML names, or
       *   &#NNN; / &#xHHH;  with a value in the range 1-65535.
       * Valid references are returned normalized (numeric references in
       * decimal form, always terminated by ';'); anything else is removed.
       * A terminator other than ';' ('<', "\n", "\r") is not part of the
       * reference and is always kept.
       *
       * @access private
       * @param array $m matches: [1] reference body, [2] terminator
       * @return string Replacement text for the match.
       */
      public static function _validate_entity( $m )
      {
          $e = trim( $m[1] );
          $r = ( $m[2] === ';' ) ? '' : $m[2];

          if ( $e === '' ) {
              return $r;
          }

          if ( $e[0] === '#' ) {
              $e = strtolower( $e );
              // only accept well-formed references; hexdec()/intval() would
              // silently skip junk and "normalize" it to an unrelated code point
              if ( preg_match( '/\A#x([0-9a-f]+)\z/', $e, $digits ) ) {
                  $code = hexdec( $digits[1] );
              }
              elseif ( preg_match( '/\A#([0-9]+)\z/', $e, $digits ) ) {
                  $code = intval( $digits[1] );
              }
              else {
                  return $r;
              }
              // numeric character references may only have values in the range 0-65535 (16 bit)
              // we strip null, though, just for kicks
              if ( $code > 0 && $code <= 65535 ) {
                  // normalize to decimal form
                  return '&#' . intval( $code ) . ';' . $r;
              }
              return $r;
          }

          if ( self::$character_entities_re === '' ) {
              // anchored on both ends so that e.g. "notin" is not matched as "not"
              self::$character_entities_re = '/\A(?:' . implode( '|', self::$character_entities ) . ')\z/';
          }
          // named entities must be known
          // XXX should we map named entities to their numeric equivalents?
          if ( preg_match( self::$character_entities_re, $e ) === 1 ) {
              return '&' . $e . ';' . $r;
          }
          return $r;
      }

      /**
       * Remove entity references that are not valid, normalize the valid ones.
       *
       * Everything from an '&' up to the next ';', '<' or line break is treated
       * as a candidate reference and handed to _validate_entity().
       *
       * @param string $str Input string.
       * @return string String with only valid, normalized entity references.
       */
      public static function strip_illegal_entities( $str )
      {
          return preg_replace_callback( "/&([^;<\n\r]+)([;<\n\r])/", array( __CLASS__, '_validate_entity' ), $str );
      }

      /**
       * This really doesn't belong here. It should also be done much better. This is a nasty, NASTY kludge.
       *
       * Split a URL into components. In addition to the keys produced by PHP's
       * parse_url() the result always contains:
       *   is_relative  true when there is no host and the scheme is not pseudo
       *   is_pseudo    true when the scheme is anything but http, https or empty
       *   is_error     true when PHP's parse_url() could not parse the URL
       *   pseudo_args  for pseudo schemes, everything parse_url() put in 'path'
       * The scheme is taken from an entity-decoded, ASCII-sanitized copy of the
       * URL so that obfuscated schemes are detected.
       *
       * @param string $url The URL to parse.
       * @return array Component array as described above.
       */
      public static function parse_url( $url )
      {
          // result array
          $r = array(
              'scheme' => '',
              'host' => '',
              'port' => '',
              'user' => '',
              'pass' => '',
              'path' => '',
              'query' => '',
              'fragment' => '',
              //
              'is_relative' => false,
              'is_pseudo' => false,
              'is_error' => false,
              //
              'pseudo_args' => '',
          );

          // sanitize the url
          // decode quotes too and, where available, the HTML5 entity set
          // (&colon;, &Tab;, ...) which browsers honour inside attribute values
          $flags = ENT_QUOTES | ( defined( 'ENT_HTML5' ) ? ENT_HTML5 : 0 );
          $sanitized = html_entity_decode( $url, $flags, 'UTF-8' ); // make double-sure we've converted all entities
          $sanitized = (string) filter_var( $sanitized, FILTER_SANITIZE_URL ); // strip everything but ascii, essentially
          $sanitized_scheme = parse_url( $sanitized, PHP_URL_SCHEME );

          // Use PHP's parse_url to get the basics
          $parsed = parse_url( $url );
          if ( $parsed === false ) {
              $r['is_error'] = true;
              return $r;
          }
          $r = array_merge( $r, $parsed );

          // replace the scheme with the one we got from the fully-sanitized string
          // (parse_url() yields null/false when there is none; keep it a string)
          $r['scheme'] = is_string( $sanitized_scheme ) ? $sanitized_scheme : '';
          // schemes are case-insensitive
          $r['is_pseudo'] = !in_array( strtolower( $r['scheme'] ), array( 'http', 'https', '' ), true );
          $r['is_relative'] = ( $r['host'] == '' && !$r['is_pseudo'] );
          if ( $r['is_pseudo'] ) {
              $r['pseudo_args'] = $r['path'];
              $r['path'] = '';
          }
          return $r;
      }

      /**
       * Restore a URL separated by a parse_url() call.
       *
       * Components missing from the array are treated as empty, so arrays
       * produced by PHP's own parse_url() are accepted without warnings.
       *
       * @param $parsed_url array An array as returned by parse_url()
       * @return string|false The reassembled URL, or false when $parsed_url is not an array.
       */
      public static function glue_url( $parsed_url )
      {
          if ( ! is_array( $parsed_url ) ) {
              return false;
          }

          $parsed_url = array_merge(
              array(
                  'scheme' => '',
                  'host' => '',
                  'port' => '',
                  'user' => '',
                  'pass' => '',
                  'path' => '',
                  'query' => '',
                  'fragment' => '',
                  'is_relative' => false,
                  'is_pseudo' => false,
                  'pseudo_args' => '',
              ),
              $parsed_url
          );

          $res = (string) $parsed_url['scheme'];
          if ( $parsed_url['is_pseudo'] || in_array( strtolower( (string) $parsed_url['scheme'] ), array( 'mailto', 'callto' ), true ) ) {
              $res .= ':';
          }
          elseif ( ! $parsed_url['is_relative'] ) {
              $res .= '://';
          }

          if ( $parsed_url['is_pseudo'] ) {
              return $res . $parsed_url['pseudo_args'];
          }

          // user[:pass]@
          if ( (string) $parsed_url['user'] !== '' ) {
              $res .= $parsed_url['user'];
              if ( (string) $parsed_url['pass'] !== '' ) {
                  $res .= ':' . $parsed_url['pass'];
              }
              $res .= '@';
          }
          $res .= $parsed_url['host'];
          if ( !empty( $parsed_url['port'] ) ) {
              $is_default_port = array_key_exists( $parsed_url['scheme'], Utils::scheme_ports() )
                  && Utils::scheme_ports( $parsed_url['scheme'] ) == $parsed_url['port'];
              // the default port for a scheme is left out
              if ( ! $is_default_port ) {
                  $res .= ':' . $parsed_url['port'];
              }
          }
          if ( (string) $parsed_url['path'] !== '' ) {
              $res .= $parsed_url['path'];
          }
          elseif ( ! $parsed_url['is_relative'] ) {
              // only absolute URLs get a root path; adding one to a relative
              // reference such as "#top" or "?page=2" would change its target
              $res .= '/';
          }
          if ( (string) $parsed_url['query'] !== '' ) {
              $res .= '?' . $parsed_url['query'];
          }
          if ( (string) $parsed_url['fragment'] !== '' ) {
              $res .= '#' . $parsed_url['fragment'];
          }
          return $res;
      }

      /**
       * Check an attribute value against its whitelisted type.
       *
       * @param string $k Attribute name (lower case); not used by the checks themselves.
       * @param mixed $v Attribute value.
       * @param array|string $type Array of literally allowed values, or one of
       *     'uri', 'language-code', 'text', 'datetime'.
       * @return bool true when the value is acceptable.
       */
      private static function check_attr_value( $k, $v, $type )
      {
          if ( is_array( $type ) ) {
              // array of allowed values, exact matches only
              return in_array( $v, $type, true );
          }

          // data type
          switch ( $type ) {
              case 'uri':
                  // RfC 2396 <http://www.ietf.org/rfc/rfc2396.txt>
                  $bits = self::parse_url( $v );
                  // schemes are case-insensitive, the whitelist is lower case
                  return $bits['is_relative'] || in_array( strtolower( $bits['scheme'] ), self::$whitelist_protocols, true );
              case 'language-code':
                  // RfC 1766 <http://www.ietf.org/rfc/rfc1766.txt>
                  // Language-Tag = Primary-tag *( "-" Subtag )
                  // Primary-tag = 1*8ALPHA
                  // Subtag = 1*8ALPHA
                  // D: '$' must not match before a trailing newline
                  return preg_match( '/^[a-zA-Z]{1,8}(?:-[a-zA-Z]{1,8})*$/D', $v ) === 1;
              case 'text':
                  // XXX is this sufficient?
                  return is_string( $v );
              case 'datetime':
                  // <http://www.w3.org/TR/1998/NOTE-datetime-19980827>
                  // <http://www.w3.org/TR/html4/types.html#h-6.11>
                  // YYYY-MM-DDThh:mm:ssTZD
                  return preg_match( '/^[0-9]{4}-[0-1][0-9]-[0-3][0-9]T[0-2][0-9]:[0-5][0-9]:[0-5][0-9](?:Z|[\+-][0-2][0-9]:[0-5][0-9])$/D', $v ) === 1;
              default:
                  // the message text doubles as the translation lookup key, so it is left as is
                  Error::raise( sprintf( _t( 'Unkown attribute type "%s" in %s' ), $type, __CLASS__ ) );
                  return false;
          }
      }

      /**
       * Filter a string of HTML against the element and attribute whitelists.
       *
       * Text nodes are entity-decoded, whitelisted elements keep only their
       * whitelisted attributes with valid values, elements that are not
       * whitelisted are converted to text nodes, and processing instructions,
       * comments, CDATA sections, statements and unknown node types are
       * dropped. Finally, elements left without any content are removed from
       * the rebuilt string.
       *
       * @todo TODO must build DOM to really properly remove offending elements
       * @todo TODO properly filter URLs
       * @param string $str Input HTML.
       * @return string Filtered HTML.
       */
      public static function filter_html_elements( $str )
      {
          $tokenizer = new HTMLTokenizer( $str );
          // tokenize, baby
          $tokens = $tokenizer->parse();

          // filter token stream
          $filtered = new HTMLTokenSet;
          // names of the currently open elements that are not whitelisted
          $stack = array();
          foreach ( $tokens as $node ) {
              switch ( $node['type'] ) {
                  case HTMLTokenizer::NODE_TYPE_TEXT:
                      $node['value'] = html_entity_decode( $node['value'], ENT_QUOTES, MultiByte::hab_encoding() );
                      break;
                  case HTMLTokenizer::NODE_TYPE_ELEMENT_OPEN:
                  case HTMLTokenizer::NODE_TYPE_ELEMENT_EMPTY:
                      $name = strtolower( $node['name'] );
                      // is this element allowed at all?
                      if ( ! in_array( $name, self::$whitelist_elements, true ) ) {
                          if ( ! in_array( $name, self::$elements_empty, true ) ) {
                              array_push( $stack, $node['name'] );
                          }
                          // convert the node to text
                          $node = array(
                              'type' => HTMLTokenizer::NODE_TYPE_TEXT,
                              'name' => '#text',
                              'value' => HTMLTokenSet::token_to_string( $node ),
                              'attrs' => array(),
                          );
                      }
                      else {
                          $global_attrs = self::$whitelist_attributes['*'];
                          $element_attrs = array_key_exists( $name, self::$whitelist_attributes ) ? self::$whitelist_attributes[$name] : array();
                          // check attributes
                          foreach ( $node['attrs'] as $k => $v ) {
                              $attr = strtolower( $k );
                              // the attribute is in the global whitelist and validates ...
                              $attr_ok = array_key_exists( $attr, $global_attrs )
                                  && self::check_attr_value( $attr, $v, $global_attrs[$attr] );
                              // ... or it is in the whitelist for this element and validates
                              if ( ! $attr_ok ) {
                                  $attr_ok = array_key_exists( $attr, $element_attrs )
                                      && self::check_attr_value( $attr, $v, $element_attrs[$attr] );
                              }
                              // if it wasn't in one of the whitelists or failed its check, remove it
                              if ( ! $attr_ok ) {
                                  unset( $node['attrs'][$k] );
                              }
                          }
                      }
                      break;
                  case HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE:
                      $name = strtolower( $node['name'] );
                      if ( ! in_array( $name, self::$whitelist_elements, true ) ) {
                          // only pop when this closes the innermost open non-whitelisted
                          // element; otherwise something weird happened (Luke, use the DOM!)
                          // and the stack is left untouched
                          if ( ! empty( $stack ) && strtolower( end( $stack ) ) === $name ) {
                              array_pop( $stack );
                          }
                          // convert the node to text
                          $node = array(
                              'type' => HTMLTokenizer::NODE_TYPE_TEXT,
                              'name' => '#text',
                              'value' => HTMLTokenSet::token_to_string( $node ),
                              'attrs' => array(),
                          );
                      }
                      break;
                  case HTMLTokenizer::NODE_TYPE_PI:
                  case HTMLTokenizer::NODE_TYPE_COMMENT:
                  case HTMLTokenizer::NODE_TYPE_CDATA_SECTION:
                  case HTMLTokenizer::NODE_TYPE_STATEMENT:
                  default:
                      $node = null;
                      break;
              }
              if ( $node !== null ) {
                  $filtered[] = $node;
              }
          }

          // rebuild our output string
          $html = (string) $filtered;
          // drop elements that ended up with no content at all
          $collapsed = preg_replace( '#<([^>\s]+)(?:\s+[^>]+)?></\1>#u', '', $html );
          // preg_replace() yields null on a PCRE error (e.g. invalid UTF-8 with /u);
          // fall back to the filtered string rather than returning null
          return ( $collapsed === null ) ? $html : $collapsed;
      }
  }
  442. ?>