PageRenderTime 63ms CodeModel.GetById 26ms RepoModel.GetById 0ms app.codeStats 0ms

/includes/Sanitizer.php

https://bitbucket.org/brunodefraine/mediawiki
PHP | 1763 lines | 1640 code | 18 blank | 105 comment | 4 complexity | b3d72ef041beabfe1890aa8d333f43ff MD5 | raw file
Possible License(s): GPL-2.0, Apache-2.0, LGPL-3.0

Large files are truncated, but you can click here to view the full file

  1. <?php
  2. /**
  3. * XHTML sanitizer for MediaWiki
  4. *
  5. * Copyright © 2002-2005 Brion Vibber <brion@pobox.com> et al
  6. * http://www.mediawiki.org/
  7. *
  8. * This program is free software; you can redistribute it and/or modify
  9. * it under the terms of the GNU General Public License as published by
  10. * the Free Software Foundation; either version 2 of the License, or
  11. * (at your option) any later version.
  12. *
  13. * This program is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. * GNU General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU General Public License along
  19. * with this program; if not, write to the Free Software Foundation, Inc.,
  20. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21. * http://www.gnu.org/copyleft/gpl.html
  22. *
  23. * @file
  24. * @ingroup Parser
  25. */
  26. /**
  27. * XHTML sanitizer for MediaWiki
  28. * @ingroup Parser
  29. */
  30. class Sanitizer {
  31. /**
  32. * Regular expression to match various types of character references in
  33. * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  34. */
  35. const CHAR_REFS_REGEX =
  36. '/&([A-Za-z0-9\x80-\xff]+);
  37. |&\#([0-9]+);
  38. |&\#[xX]([0-9A-Fa-f]+);
  39. |(&)/x';
  40. /**
  41. * Blacklist for evil uris like javascript:
  42. * WARNING: DO NOT use this in any place that actually requires blacklisting
  43. * for security reasons. There are NUMEROUS[1] ways to bypass blacklisting, the
  44. * only way to be secure from javascript: uri based xss vectors is to whitelist
  45. * things that you know are safe and deny everything else.
  46. * [1]: http://ha.ckers.org/xss.html
  47. */
  48. const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
  49. const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
  50. /**
  51. * List of all named character entities defined in HTML 4.01
  52. * http://www.w3.org/TR/html4/sgml/entities.html
  53. * As well as &apos; which is only defined starting in XHTML1.
  54. * @private
  55. */
  56. static $htmlEntities = array(
  57. 'Aacute' => 193,
  58. 'aacute' => 225,
  59. 'Acirc' => 194,
  60. 'acirc' => 226,
  61. 'acute' => 180,
  62. 'AElig' => 198,
  63. 'aelig' => 230,
  64. 'Agrave' => 192,
  65. 'agrave' => 224,
  66. 'alefsym' => 8501,
  67. 'Alpha' => 913,
  68. 'alpha' => 945,
  69. 'amp' => 38,
  70. 'and' => 8743,
  71. 'ang' => 8736,
  72. 'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
  73. 'Aring' => 197,
  74. 'aring' => 229,
  75. 'asymp' => 8776,
  76. 'Atilde' => 195,
  77. 'atilde' => 227,
  78. 'Auml' => 196,
  79. 'auml' => 228,
  80. 'bdquo' => 8222,
  81. 'Beta' => 914,
  82. 'beta' => 946,
  83. 'brvbar' => 166,
  84. 'bull' => 8226,
  85. 'cap' => 8745,
  86. 'Ccedil' => 199,
  87. 'ccedil' => 231,
  88. 'cedil' => 184,
  89. 'cent' => 162,
  90. 'Chi' => 935,
  91. 'chi' => 967,
  92. 'circ' => 710,
  93. 'clubs' => 9827,
  94. 'cong' => 8773,
  95. 'copy' => 169,
  96. 'crarr' => 8629,
  97. 'cup' => 8746,
  98. 'curren' => 164,
  99. 'dagger' => 8224,
  100. 'Dagger' => 8225,
  101. 'darr' => 8595,
  102. 'dArr' => 8659,
  103. 'deg' => 176,
  104. 'Delta' => 916,
  105. 'delta' => 948,
  106. 'diams' => 9830,
  107. 'divide' => 247,
  108. 'Eacute' => 201,
  109. 'eacute' => 233,
  110. 'Ecirc' => 202,
  111. 'ecirc' => 234,
  112. 'Egrave' => 200,
  113. 'egrave' => 232,
  114. 'empty' => 8709,
  115. 'emsp' => 8195,
  116. 'ensp' => 8194,
  117. 'Epsilon' => 917,
  118. 'epsilon' => 949,
  119. 'equiv' => 8801,
  120. 'Eta' => 919,
  121. 'eta' => 951,
  122. 'ETH' => 208,
  123. 'eth' => 240,
  124. 'Euml' => 203,
  125. 'euml' => 235,
  126. 'euro' => 8364,
  127. 'exist' => 8707,
  128. 'fnof' => 402,
  129. 'forall' => 8704,
  130. 'frac12' => 189,
  131. 'frac14' => 188,
  132. 'frac34' => 190,
  133. 'frasl' => 8260,
  134. 'Gamma' => 915,
  135. 'gamma' => 947,
  136. 'ge' => 8805,
  137. 'gt' => 62,
  138. 'harr' => 8596,
  139. 'hArr' => 8660,
  140. 'hearts' => 9829,
  141. 'hellip' => 8230,
  142. 'Iacute' => 205,
  143. 'iacute' => 237,
  144. 'Icirc' => 206,
  145. 'icirc' => 238,
  146. 'iexcl' => 161,
  147. 'Igrave' => 204,
  148. 'igrave' => 236,
  149. 'image' => 8465,
  150. 'infin' => 8734,
  151. 'int' => 8747,
  152. 'Iota' => 921,
  153. 'iota' => 953,
  154. 'iquest' => 191,
  155. 'isin' => 8712,
  156. 'Iuml' => 207,
  157. 'iuml' => 239,
  158. 'Kappa' => 922,
  159. 'kappa' => 954,
  160. 'Lambda' => 923,
  161. 'lambda' => 955,
  162. 'lang' => 9001,
  163. 'laquo' => 171,
  164. 'larr' => 8592,
  165. 'lArr' => 8656,
  166. 'lceil' => 8968,
  167. 'ldquo' => 8220,
  168. 'le' => 8804,
  169. 'lfloor' => 8970,
  170. 'lowast' => 8727,
  171. 'loz' => 9674,
  172. 'lrm' => 8206,
  173. 'lsaquo' => 8249,
  174. 'lsquo' => 8216,
  175. 'lt' => 60,
  176. 'macr' => 175,
  177. 'mdash' => 8212,
  178. 'micro' => 181,
  179. 'middot' => 183,
  180. 'minus' => 8722,
  181. 'Mu' => 924,
  182. 'mu' => 956,
  183. 'nabla' => 8711,
  184. 'nbsp' => 160,
  185. 'ndash' => 8211,
  186. 'ne' => 8800,
  187. 'ni' => 8715,
  188. 'not' => 172,
  189. 'notin' => 8713,
  190. 'nsub' => 8836,
  191. 'Ntilde' => 209,
  192. 'ntilde' => 241,
  193. 'Nu' => 925,
  194. 'nu' => 957,
  195. 'Oacute' => 211,
  196. 'oacute' => 243,
  197. 'Ocirc' => 212,
  198. 'ocirc' => 244,
  199. 'OElig' => 338,
  200. 'oelig' => 339,
  201. 'Ograve' => 210,
  202. 'ograve' => 242,
  203. 'oline' => 8254,
  204. 'Omega' => 937,
  205. 'omega' => 969,
  206. 'Omicron' => 927,
  207. 'omicron' => 959,
  208. 'oplus' => 8853,
  209. 'or' => 8744,
  210. 'ordf' => 170,
  211. 'ordm' => 186,
  212. 'Oslash' => 216,
  213. 'oslash' => 248,
  214. 'Otilde' => 213,
  215. 'otilde' => 245,
  216. 'otimes' => 8855,
  217. 'Ouml' => 214,
  218. 'ouml' => 246,
  219. 'para' => 182,
  220. 'part' => 8706,
  221. 'permil' => 8240,
  222. 'perp' => 8869,
  223. 'Phi' => 934,
  224. 'phi' => 966,
  225. 'Pi' => 928,
  226. 'pi' => 960,
  227. 'piv' => 982,
  228. 'plusmn' => 177,
  229. 'pound' => 163,
  230. 'prime' => 8242,
  231. 'Prime' => 8243,
  232. 'prod' => 8719,
  233. 'prop' => 8733,
  234. 'Psi' => 936,
  235. 'psi' => 968,
  236. 'quot' => 34,
  237. 'radic' => 8730,
  238. 'rang' => 9002,
  239. 'raquo' => 187,
  240. 'rarr' => 8594,
  241. 'rArr' => 8658,
  242. 'rceil' => 8969,
  243. 'rdquo' => 8221,
  244. 'real' => 8476,
  245. 'reg' => 174,
  246. 'rfloor' => 8971,
  247. 'Rho' => 929,
  248. 'rho' => 961,
  249. 'rlm' => 8207,
  250. 'rsaquo' => 8250,
  251. 'rsquo' => 8217,
  252. 'sbquo' => 8218,
  253. 'Scaron' => 352,
  254. 'scaron' => 353,
  255. 'sdot' => 8901,
  256. 'sect' => 167,
  257. 'shy' => 173,
  258. 'Sigma' => 931,
  259. 'sigma' => 963,
  260. 'sigmaf' => 962,
  261. 'sim' => 8764,
  262. 'spades' => 9824,
  263. 'sub' => 8834,
  264. 'sube' => 8838,
  265. 'sum' => 8721,
  266. 'sup' => 8835,
  267. 'sup1' => 185,
  268. 'sup2' => 178,
  269. 'sup3' => 179,
  270. 'supe' => 8839,
  271. 'szlig' => 223,
  272. 'Tau' => 932,
  273. 'tau' => 964,
  274. 'there4' => 8756,
  275. 'Theta' => 920,
  276. 'theta' => 952,
  277. 'thetasym' => 977,
  278. 'thinsp' => 8201,
  279. 'THORN' => 222,
  280. 'thorn' => 254,
  281. 'tilde' => 732,
  282. 'times' => 215,
  283. 'trade' => 8482,
  284. 'Uacute' => 218,
  285. 'uacute' => 250,
  286. 'uarr' => 8593,
  287. 'uArr' => 8657,
  288. 'Ucirc' => 219,
  289. 'ucirc' => 251,
  290. 'Ugrave' => 217,
  291. 'ugrave' => 249,
  292. 'uml' => 168,
  293. 'upsih' => 978,
  294. 'Upsilon' => 933,
  295. 'upsilon' => 965,
  296. 'Uuml' => 220,
  297. 'uuml' => 252,
  298. 'weierp' => 8472,
  299. 'Xi' => 926,
  300. 'xi' => 958,
  301. 'Yacute' => 221,
  302. 'yacute' => 253,
  303. 'yen' => 165,
  304. 'Yuml' => 376,
  305. 'yuml' => 255,
  306. 'Zeta' => 918,
  307. 'zeta' => 950,
  308. 'zwj' => 8205,
  309. 'zwnj' => 8204
  310. );
  311. /**
  312. * Character entity aliases accepted by MediaWiki
  313. */
  314. static $htmlEntityAliases = array(
  315. 'רלמ' => 'rlm',
  316. 'رلم' => 'rlm',
  317. );
  318. /**
  319. * Lazy-initialised attributes regex, see getAttribsRegex()
  320. */
  321. static $attribsRegex;
  322. /**
  323. * Regular expression to match HTML/XML attribute pairs within a tag.
  324. * Allows some... latitude.
  325. * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  326. */
  static function getAttribsRegex() {
      // Serve the memoised pattern when an earlier call has already built it.
      if ( self::$attribsRegex !== null ) {
          return self::$attribsRegex;
      }

      // Building blocks: first character of an attribute name, any later
      // character of the name, and the whitespace that may separate pairs.
      $nameStart = '[:A-Z_a-z0-9]';
      $nameChar = '[:A-Z_a-z-.0-9]';
      $ws = '[\x09\x0a\x0d\x20]';

      // The pattern uses the x modifier, so the layout whitespace and the
      // "#" remarks inside the string are ignored by PCRE.
      self::$attribsRegex =
          "/(?:^|$ws)({$nameStart}{$nameChar}*)
          ($ws*=$ws*
          (?:
          # The attribute value: quoted or alone
          \"([^<\"]*)\"
          | '([^<']*)'
          | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
          | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
          # colors are specified like this.
          # We'll be normalizing it.
          )
          )?(?=$ws|\$)/sx";

      return self::$attribsRegex;
  }
  348. /**
  349. * Cleans up HTML, removes dangerous tags and attributes, and
  350. * removes HTML comments
  351. * @private
  352. * @param $text String
  353. * @param $processCallback Callback to do any variable or parameter replacements in HTML attribute values
  354. * @param $args Array for the processing callback
  355. * @param $extratags Array for any extra tags to include
  356. * @param $removetags Array for any tags (default or extra) to exclude
  357. * @return string
  358. */
  static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
      global $wgUseTidy;

      // Tag lookup tables; built on the first call and kept in function statics.
      static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
          $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

      wfProfileIn( __METHOD__ );

      if ( !$staticInitialised ) {
          $htmlpairsStatic = array( # Tags that must be closed
              'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
              'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
              'strike', 'strong', 'tt', 'var', 'div', 'center',
              'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
              'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'abbr', 'dfn',
              'kbd', 'samp'
          );
          $htmlsingle = array(
              'br', 'hr', 'li', 'dt', 'dd'
          );
          $htmlsingleonly = array( # Elements that cannot have close tags
              'br', 'hr'
          );
          $htmlnest = array( # Tags that can be nested--??
              'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
              'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
          );
          $tabletags = array( # Can only appear inside table, we will close them
              'td', 'th', 'tr',
          );
          $htmllist = array( # Tags used by list
              'ul','ol',
          );
          $listtags = array( # Tags that can appear in a list
              'li',
          );

          global $wgAllowImageTag;
          if ( $wgAllowImageTag ) {
              $htmlsingle[] = 'img';
              $htmlsingleonly[] = 'img';
          }

          $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
          $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

          # Convert them all to hashtables for faster lookup
          $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
              'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
          foreach ( $vars as $var ) {
              $$var = array_flip( $$var );
          }
          $staticInitialised = true;
      }

      # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
      $extratags = array_flip( $extratags );
      $removetags = array_flip( $removetags );
      $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
      $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

      # Remove HTML comments
      $text = Sanitizer::removeHTMLcomments( $text );
      $bits = explode( '<', $text );
      $text = str_replace( '>', '&gt;', array_shift( $bits ) );

      # Captures, in order:
      # $slash: Does the current element start with a '/'?
      # $t: Current element name
      # $params: String between element name and >
      # $brace: Ending '>' or '/>'
      # $rest: Everything until the next element of $bits
      $tagRegex = '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!';

      if ( !$wgUseTidy ) {
          $tagstack = $tablestack = array();
          foreach ( $bits as $x ) {
              $regs = array();
              if ( !preg_match( $tagRegex, $x, $regs ) ) {
                  # Not tag-shaped at all: emit the fragment as escaped text
                  $text .= '&lt;' . str_replace( '>', '&gt;', $x );
                  continue;
              }
              list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
              $t = strtolower( $t );

              $badtag = false;
              if ( isset( $htmlelements[$t] ) ) {
                  # Check our stack
                  if ( $slash && isset( $htmlsingleonly[$t] ) ) {
                      $badtag = true;
                  } elseif ( $slash ) {
                      # Closing a tag... is it the one we just opened?
                      # array_pop() yields null on an empty stack
                      $ot = array_pop( $tagstack );
                      if ( $ot !== $t ) {
                          if ( $ot !== null && isset( $htmlsingleallowed[$ot] ) ) {
                              # Pop all elements with an optional close tag
                              # and see if we find a match below them
                              $optstack = array( $ot );
                              $ot = array_pop( $tagstack );
                              while ( $ot !== null && $ot !== $t && isset( $htmlsingleallowed[$ot] ) ) {
                                  array_push( $optstack, $ot );
                                  $ot = array_pop( $tagstack );
                              }
                              if ( $ot !== $t ) {
                                  # No match. First put back the element that stopped
                                  # the scan (it used to be dropped, so it was never
                                  # closed), then the optional elements above it.
                                  $badtag = true;
                                  if ( $ot !== null ) {
                                      array_push( $tagstack, $ot );
                                  }
                                  while ( $optstack ) {
                                      array_push( $tagstack, array_pop( $optstack ) );
                                  }
                              }
                          } else {
                              # Restore the element we peeked at; never push null
                              if ( $ot !== null ) {
                                  array_push( $tagstack, $ot );
                              }
                              # <li> can be nested in <ul> or <ol>, skip those cases:
                              if ( $ot === null || !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
                                  $badtag = true;
                              }
                          }
                      }
                      if ( !$badtag && $ot === $t && $t === 'table' ) {
                          # The table is closed, directly or past implicitly closed
                          # cells/rows: resume the stack saved when it was opened.
                          $tagstack = array_pop( $tablestack );
                          if ( !is_array( $tagstack ) ) {
                              $tagstack = array();
                          }
                      }
                      $newparams = '';
                  } else {
                      # Keep track for later
                      if ( isset( $tabletags[$t] ) &&
                          !in_array( 'table', $tagstack, true ) ) {
                          $badtag = true;
                      } elseif ( in_array( $t, $tagstack, true ) &&
                          !isset( $htmlnest[$t] ) ) {
                          $badtag = true;
                      # Is it a self closed htmlpair ? (bug 5487)
                      } elseif ( $brace == '/>' &&
                          isset( $htmlpairs[$t] ) ) {
                          $badtag = true;
                      } elseif ( isset( $htmlsingleonly[$t] ) ) {
                          # Hack to force empty tag for uncloseable elements
                          $brace = '/>';
                      } elseif ( isset( $htmlsingle[$t] ) ) {
                          # Hack to not close $htmlsingle tags
                          $brace = null;
                      } elseif ( isset( $tabletags[$t] )
                          && in_array( $t, $tagstack, true ) ) {
                          // New table tag but forgot to close the previous one
                          $text .= "</$t>";
                      } else {
                          if ( $t == 'table' ) {
                              # Tables get a fresh stack; the outer one is parked
                              array_push( $tablestack, $tagstack );
                              $tagstack = array();
                          }
                          array_push( $tagstack, $t );
                      }

                      # Replace any variables or template parameters with
                      # plaintext results.
                      if ( is_callable( $processCallback ) ) {
                          call_user_func_array( $processCallback, array( &$params, $args ) );
                      }

                      # Strip non-approved attributes from the tag
                      $newparams = Sanitizer::fixTagAttributes( $params, $t );
                  }
                  if ( !$badtag ) {
                      $rest = str_replace( '>', '&gt;', $rest );
                      $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
                      $text .= "<$slash$t$newparams$close>$rest";
                      continue;
                  }
              }
              # Unknown or rejected tag: output it as escaped text
              $text .= '&lt;' . str_replace( '>', '&gt;', $x );
          }
          # Close off any remaining tags
          while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) !== null ) {
              $text .= "</$t>\n";
              if ( $t == 'table' ) {
                  $tagstack = array_pop( $tablestack );
              }
          }
      } else {
          # this might be possible using tidy itself
          foreach ( $bits as $x ) {
              $regs = array();
              if ( preg_match( $tagRegex, $x, $regs ) ) {
                  list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
                  $t = strtolower( $t );
                  if ( isset( $htmlelements[$t] ) ) {
                      if ( is_callable( $processCallback ) ) {
                          call_user_func_array( $processCallback, array( &$params, $args ) );
                      }
                      $newparams = Sanitizer::fixTagAttributes( $params, $t );
                      $rest = str_replace( '>', '&gt;', $rest );
                      $text .= "<$slash$t$newparams$brace$rest";
                      continue;
                  }
              }
              $text .= '&lt;' . str_replace( '>', '&gt;', $x );
          }
      }
      wfProfileOut( __METHOD__ );
      return $text;
  }
  551. /**
  552. * Remove '<!--', '-->', and everything between.
  553. * To avoid leaving blank lines, when a comment is both preceded
  554. * and followed by a newline (ignoring spaces), trim leading and
  555. * trailing spaces and one of the newlines.
  556. *
  557. * @private
  558. * @param $text String
  559. * @return string
  560. */
  static function removeHTMLcomments( $text ) {
      wfProfileIn( __METHOD__ );
      for ( ;; ) {
          $open = strpos( $text, '<!--' );
          if ( $open === false ) {
              break;
          }
          $close = strpos( $text, '-->', $open + 4 );
          if ( $close === false ) {
              # Unterminated comment; leave the rest of the text alone
              break;
          }
          $close += 3;

          # Grow a window around the comment that also covers the spaces
          # directly before and after it.
          $from = max( $open - 1, 0 );
          $span = $close - $from;
          while ( $from > 0 && substr( $text, $from, 1 ) === ' ' ) {
              $from--;
              $span++;
          }
          while ( substr( $text, $from + $span, 1 ) === ' ' ) {
              $span++;
          }

          $ownLine = substr( $text, $from, 1 ) === "\n"
              && substr( $text, $from + $span, 1 ) === "\n";
          if ( $ownLine ) {
              # Comment sits between two newlines: drop it together with
              # the surrounding spaces and keep a single newline.
              $text = substr_replace( $text, "\n", $from, $span + 1 );
          } else {
              # Otherwise cut out only the comment itself.
              $text = substr_replace( $text, '', $open, $close - $open );
          }
      }
      wfProfileOut( __METHOD__ );
      return $text;
  }
  593. /**
  594. * Take an array of attribute names and values and fix some deprecated values
  595. * for the given element type.
  596. * This does not validate properties, so you should ensure that you call
  597. * validateTagAttributes AFTER this to ensure that the resulting style rule
  598. * this may add is safe.
  599. *
  600. * - Converts most presentational attributes like align into inline css
  601. *
  602. * @param $attribs Array
  603. * @param $element String
  604. * @return Array
  605. */
  static function fixDeprecatedAttributes( $attribs, $element ) {
      global $wgHtml5, $wgCleanupPresentationalAttributes;

      // presentational attributes were removed from html5, we can leave them
      // in when html5 is turned off
      if ( !$wgHtml5 || !$wgCleanupPresentationalAttributes ) {
          return $attribs;
      }

      $table = array( 'table' );
      $cells = array( 'td', 'th' );
      $colls = array( 'col', 'colgroup' );
      $tblocks = array( 'tbody', 'tfoot', 'thead' );
      $h = array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' );

      // attribute name => array( CSS property, elements the attribute applies to )
      $presentationalAttribs = array(
          'align' => array( 'text-align', array_merge( array( 'caption', 'hr', 'div', 'p', 'tr' ), $table, $cells, $colls, $tblocks, $h ) ),
          'clear' => array( 'clear', array( 'br' ) ),
          'height' => array( 'height', $cells ),
          'nowrap' => array( 'white-space', $cells ),
          'size' => array( 'height', array( 'hr' ) ),
          'type' => array( 'list-style-type', array( 'li', 'ol', 'ul' ) ),
          'valign' => array( 'vertical-align', array_merge( $cells, $colls, $tblocks ) ),
          'width' => array( 'width', array_merge( array( 'hr', 'pre' ), $table, $cells, $colls ) ),
      );
      $sizeAttribs = array( 'height', 'width', 'size' );

      // Ensure that any upper case or mixed case attributes are converted to lowercase
      foreach ( $attribs as $attribute => $value ) {
          $lower = strtolower( $attribute );
          if ( $attribute !== $lower && array_key_exists( $lower, $presentationalAttribs ) ) {
              $attribs[$lower] = $value;
              unset( $attribs[$attribute] );
          }
      }

      $style = '';
      foreach ( $presentationalAttribs as $attribute => $info ) {
          list( $property, $elements ) = $info;

          // Skip attributes that are not relevant to this element or not used
          if ( !in_array( $element, $elements, true ) || !array_key_exists( $attribute, $attribs ) ) {
              continue;
          }

          $value = $attribs[$attribute];

          // For nowrap the value should be nowrap instead of whatever text is in the value
          if ( $attribute === 'nowrap' ) {
              $value = 'nowrap';
          }

          // clear="all" is clear: both; in css
          if ( $attribute === 'clear' && strtolower( $value ) === 'all' ) {
              $value = 'both';
          }

          // Size based properties should have px applied to them if they have no unit.
          // The D modifier keeps "$" from matching before a trailing newline, so a
          // value such as "10\n" is not turned into "10\npx".
          if ( in_array( $attribute, $sizeAttribs, true ) && preg_match( '/^[\d.]+$/D', $value ) ) {
              $value = "{$value}px";
          }

          $style .= " $property: $value;";
          unset( $attribs[$attribute] );
      }

      if ( $style ) {
          // Prepend our style rules so that they can be overridden by user css
          if ( isset( $attribs['style'] ) ) {
              $style .= " " . $attribs['style'];
          }
          $attribs['style'] = trim( $style );
      }

      return $attribs;
  }
  673. /**
  674. * Take an array of attribute names and values and normalize or discard
  675. * illegal values for the given element type.
  676. *
  677. * - Discards attributes not on a whitelist for the given element
  678. * - Unsafe style attributes are discarded
  679. * - Invalid id attributes are reencoded
  680. *
  681. * @param $attribs Array
  682. * @param $element String
  683. * @return Array
  684. *
  685. * @todo Check for legal values where the DTD limits things.
  686. * @todo Check for unique id attribute :P
  687. */
  static function validateTagAttributes( $attribs, $element ) {
      // Look up the attribute names permitted on this element, then let the
      // generic validator filter and normalise against that list.
      $allowedNames = Sanitizer::attributeWhitelist( $element );
      return Sanitizer::validateAttributes( $attribs, $allowedNames );
  }
  692. /**
  693. * Take an array of attribute names and values and normalize or discard
  694. * illegal values for the given whitelist.
  695. *
  696. * - Discards attributes not on the given whitelist
  697. * - Unsafe style attributes are discarded
  698. * - Invalid id attributes are reencoded
  699. *
  700. * @param $attribs Array
  701. * @param $whitelist Array: list of allowed attribute names
  702. * @return Array
  703. *
  704. * @todo Check for legal values where the DTD limits things.
  705. * @todo Check for unique id attribute :P
  706. */
  static function validateAttributes( $attribs, $whitelist ) {
      global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes, $wgHtml5;

      // Name => index map so membership tests are a single isset()
      $allowed = array_flip( $whitelist );
      $urlPattern = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

      // RDFa and microdata properties allow URLs, URIs and/or CURIs,
      // so their values are screened for script URIs below.
      $uriLike = array(
          'rel', 'rev',
          'about', 'property', 'resource', 'datatype', 'typeof', # RDFa
          'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype', # HTML5 microdata
      );

      $result = array();
      foreach ( $attribs as $name => $val ) {
          # Allow XML namespace declaration if RDFa is enabled
          if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $name ) ) {
              if ( !preg_match( self::EVIL_URI_PATTERN, $val ) ) {
                  $result[$name] = $val;
              }
              continue;
          }

          # Allow any attribute beginning with "data-", if in HTML5 mode
          $isDataAttribute = $wgHtml5 && preg_match( '/^data-/i', $name );
          if ( !$isDataAttribute && !isset( $allowed[$name] ) ) {
              continue;
          }

          # Strip javascript "expression" from stylesheets.
          # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
          if ( $name == 'style' ) {
              $val = Sanitizer::checkCss( $val );
          }

          if ( $name === 'id' ) {
              $val = Sanitizer::escapeId( $val, 'noninitial' );
          }

          // Paranoia. Allow "simple" values but suppress javascript
          if ( in_array( $name, $uriLike, true ) && preg_match( self::EVIL_URI_PATTERN, $val ) ) {
              continue;
          }

          # NOTE: even though elements using href/src are not allowed directly, supply
          # validation code that can be used by tag hook handlers, etc
          # Values not using an allowed protocol are dropped; this also drops
          # all relative URLs.
          if ( ( $name === 'href' || $name === 'src' ) && !preg_match( $urlPattern, $val ) ) {
              continue;
          }

          // A later occurrence of the same name replaces the earlier one,
          // so the output has one attribute of each name.
          $result[$name] = $val;
      }

      # itemtype, itemid, itemref don't make sense without itemscope
      # TODO: Strip itemprop if we aren't descendants of an itemscope.
      if ( $wgAllowMicrodataAttributes && !array_key_exists( 'itemscope', $result ) ) {
          unset( $result['itemtype'], $result['itemid'], $result['itemref'] );
      }

      return $result;
  }
  766. /**
  767. * Merge two sets of HTML attributes. Conflicting items in the second set
  768. * will override those in the first, except for 'class' attributes which
  769. * will be combined (if they're both strings).
  770. *
  771. * @todo implement merging for other attributes such as style
  772. * @param $a Array
  773. * @param $b Array
  774. * @return array
  775. */
  static function mergeAttributes( $a, $b ) {
      // Plain merge first: on a key clash the value from $b wins.
      $merged = array_merge( $a, $b );

      // Class lists are only combined when both sides supply one.
      if ( !isset( $a['class'] ) || !isset( $b['class'] ) ) {
          return $merged;
      }

      $first = $a['class'];
      $second = $b['class'];
      // Non-string values and identical strings keep the plain-merge result.
      if ( !is_string( $first ) || !is_string( $second ) || $first === $second ) {
          return $merged;
      }

      // Split both lists on whitespace, drop repeats, and join them again.
      $tokens = preg_split( '/\s+/', "{$first} {$second}", -1, PREG_SPLIT_NO_EMPTY );
      $merged['class'] = implode( ' ', array_unique( $tokens ) );

      return $merged;
  }
  787. /**
  788. * Pick apart some CSS and check it for forbidden or unsafe structures.
  789. * Returns a sanitized string. This sanitized string will have
  790. * character references and escape sequences decoded, and comments
  791. * stripped. If the input is just too evil, only a comment complaining
  792. * about evilness will be returned.
  793. *
  794. * Currently url() references, 'expression', 'filter:' and 'accelerator:' are forbidden.
  795. *
  796. * NOTE: Despite the fact that character references are decoded, the
  797. * returned string may contain character references given certain
  798. * clever input strings. These character references must
  799. * be escaped before the return value is embedded in HTML.
  800. *
  801. * @param $value String
  802. * @return String
  803. */
  static function checkCss( $value ) {
      // Compiled once and reused on later calls.
      static $decodeRegex;

      // Step 1: decode character references like &#123;
      $value = Sanitizer::decodeCharReferences( $value );

      // Step 2: decode CSS escape sequences and line continuations (see the
      // grammar in the CSS 2 spec, appendix D). This has to happen AFTER
      // step 1. Input can be crafted so that this step yields character
      // references again; that is acceptable because the caller is expected
      // to escape the return value anyway.
      if ( !$decodeRegex ) {
          $ws = '[\\x20\\t\\r\\n\\f]';
          $newline = '(?:\\n|\\r\\n|\\r|\\f)';
          $bs = '\\\\';
          $decodeRegex = "/ $bs
              (?:
              ($newline) | # 1. Line continuation
              ([0-9A-Fa-f]{1,6})$ws? | # 2. character number
              (.) | # 3. backslash cancelling special meaning
              () | # 4. backslash at end of string
              )/xu";
      }
      $value = preg_replace_callback(
          $decodeRegex,
          array( __CLASS__, 'cssDecodeCallback' ),
          $value
      );

      // Step 3: replace comments with a space; IE gets token splitting wrong.
      // Done AFTER the decoding steps because those can introduce comments.
      // Replacing with a space rather than nothing means this step cannot
      // introduce new character references or escape sequences.
      $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

      // Step 4: cut everything from a remaining comment-start token onwards,
      // to guard against incorrect client implementations.
      $unterminated = strpos( $value, '/*' );
      if ( $unterminated !== false ) {
          $value = substr( $value, 0, $unterminated );
      }

      // Step 5: reject control characters and problematic keywords
      if ( preg_match( '/[\000-\010\016-\037\177]/', $value ) ) {
          return '/* invalid control char */';
      }
      if ( preg_match( '! expression | filter\s*: | accelerator\s*: | url\s*\( !ix', $value ) ) {
          return '/* insecure input */';
      }

      return $value;
  }
  852. /**
  853. * @param $matches array
  854. * @return String
  855. */
  static function cssDecodeCallback( $matches ) {
      // Group 1: backslash + newline is a line continuation and vanishes.
      if ( $matches[1] !== '' ) {
          return '';
      }

      if ( $matches[2] !== '' ) {
          // Group 2: hexadecimal code point
          $decoded = codepointToUtf8( hexdec( $matches[2] ) );
      } elseif ( $matches[3] !== '' ) {
          // Group 3: a single escaped character
          $decoded = $matches[3];
      } else {
          // Lone backslash at the end of the string
          $decoded = '\\';
      }

      // These characters need to stay escaped inside strings; re-emit them
      // as a hex escape to avoid parsing errors by clients.
      if ( in_array( $decoded, array( "\n", '"', "'", '\\' ) ) ) {
          return '\\' . dechex( ord( $decoded ) ) . ' ';
      }

      // Any other escape was unnecessary: return the bare character.
      return $decoded;
  }
  /**
   * Take a tag soup fragment listing an HTML element's attributes
   * and normalize it to well-formed XML, discarding unwanted attributes.
   * Output is safe for further wikitext processing, with escaping of
   * values that could trigger problems.
   *
   * - Normalizes attribute names to lowercase
   * - Discards attributes not on a whitelist for the given element
   * - Turns broken or invalid entities into plaintext
   * - Double-quotes all attribute values
   * - Attributes without values are given the name as attribute
   * - Double attributes are discarded
   * - Unsafe style attributes are discarded
   * - Prepends space if there are attributes.
   *
   * @param $text String
   * @param $element String
   * @return String
   */
  static function fixTagAttributes( $text, $element ) {
      if ( trim( $text ) == '' ) {
          return '';
      }

      // decode -> rewrite deprecated attributes -> validate for this element
      $stripped = Sanitizer::validateTagAttributes(
          Sanitizer::fixDeprecatedAttributes(
              Sanitizer::decodeTagAttributes( $text ),
              $element
          ),
          $element
      );

      // Every surviving attribute is emitted as ' name="value"', so the
      // result is either empty or starts with a single space.
      $out = '';
      foreach ( $stripped as $name => $rawValue ) {
          $out .= ' ' . htmlspecialchars( $name )
              . '="' . Sanitizer::safeEncodeAttribute( $rawValue ) . '"';
      }
      return $out;
  }
  /**
   * Encode an attribute value for HTML output.
   *
   * @param $text String
   * @return HTML-encoded text fragment
   */
  static function encodeAttribute( $text ) {
      $escaped = htmlspecialchars( $text, ENT_QUOTES );

      // Whitespace is normalized during attribute decoding, so newlines,
      // carriage returns and tabs have to be turned into numeric
      // references up front or they won't be preserved.
      return str_replace(
          array( "\n", "\r", "\t" ),
          array( '&#10;', '&#13;', '&#9;' ),
          $escaped
      );
  }
  /**
   * Encode an attribute value for HTML tags, with extra armoring
   * against further wiki processing.
   *
   * @param $text String
   * @return HTML-encoded text fragment
   */
  static function safeEncodeAttribute( $text ) {
      # Templates and links may be expanded in later parsing,
      # creating invalid or dangerous output. Suppress this.
      $armorTable = array(
          # The first three should never occur: they indicate invalid
          # input that should already have been escaped.
          '<'    => '&lt;',
          '>'    => '&gt;',
          '"'    => '&quot;',
          '{'    => '&#123;',
          '['    => '&#91;',
          "''"   => '&#39;&#39;',
          'ISBN' => '&#73;SBN',
          'RFC'  => '&#82;FC',
          'PMID' => '&#80;MID',
          '|'    => '&#124;',
          '__'   => '&#95;_',
      );
      $armored = strtr( Sanitizer::encodeAttribute( $text ), $armorTable );

      # Stupid hack: break up anything matching a URL protocol
      $protocolPattern = '/(' . wfUrlProtocols() . ')/';
      return preg_replace_callback(
          $protocolPattern,
          array( 'Sanitizer', 'armorLinksCallback' ),
          $armored
      );
  }
  /**
   * Given a value, escape it so that it can be used in an id attribute and
   * return it. This will use HTML5 validation if $wgExperimentalHtmlIds is
   * true (together with $wgHtml5), allowing anything but ASCII whitespace.
   * Otherwise it will use HTML 4 rules: the value is URL-encoded and every
   * '%' is turned into a dot, leaving a narrow subset of ASCII.
   *
   * To ensure we don't have to bother escaping anything, we also strip ', ",
   * & even if $wgExperimentalHtmlIds is true. TODO: Is this the best tactic?
   * We also strip # because it upsets IE, and % because it could be
   * ambiguous if it's part of something that looks like a percent escape
   * (which don't work reliably in fragments cross-browser).
   *
   * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
   *                                                          in the id and
   *                                                          name attributes
   * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
   * @see http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#the-id-attribute
   *   HTML5 definition of id attribute
   *
   * @param $id String: id to escape
   * @param $options Mixed: string or array of strings (default is array()):
   *   'noninitial': This is a non-initial fragment of an id, not a full id,
   *       so don't pay attention if the first character isn't valid at the
   *       beginning of an id.  Only matters if $wgExperimentalHtmlIds is
   *       false.
   *   'legacy': Behave the way the old HTML 4-based ID escaping worked even
   *       if $wgExperimentalHtmlIds is used, so we can generate extra
   *       anchors and links won't break.
   * @return String
   */
  static function escapeId( $id, $options = array() ) {
      global $wgHtml5, $wgExperimentalHtmlIds;

      $options = (array)$options;

      if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
          # HTML5-style: collapse runs of forbidden characters to '_'
          $id = trim(
              preg_replace(
                  '/[ \t\n\r\f_\'"&#%]+/',
                  '_',
                  Sanitizer::decodeCharReferences( $id )
              ),
              '_'
          );
          # An empty result means nothing but stripped characters was given
          return $id === '' ? '_' : $id;
      }

      # HTML4-style escaping: URL-encode, keep ':' readable, '%' becomes '.'
      $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
      $id = str_replace( array( '%3A', '%' ), array( ':', '.' ), $id );

      if ( !preg_match( '/^[a-zA-Z]/', $id ) && !in_array( 'noninitial', $options ) ) {
          // Initial character must be a letter!
          $id = 'x' . $id;
      }
      return $id;
  }
  /**
   * Given a value, escape it so that it can be used as a CSS class and
   * return it.
   *
   * @todo For extra validity, input should be validated UTF-8.
   *
   * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
   *
   * @param $class String
   * @return String
   */
  static function escapeClass( $class ) {
      // Convert ugly stuff (a leading digit or hyphen, ASCII controls and
      // punctuation, non-breaking space) to underscores...
      $underscored = preg_replace(
          '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
          '_',
          $class
      );
      // ...then squeeze runs of underscores and drop any trailing one
      $squeezed = preg_replace( '/_+/', '_', $underscored );
      return rtrim( $squeezed, '_' );
  }
  /**
   * Given HTML input, escape with htmlspecialchars but un-escape entities.
   * This allows (generally harmless) entities like &#160; to survive.
   *
   * @param $html String to escape
   * @return String: escaped input
   */
  static function escapeHtmlAllowEntities( $html ) {
      $decoded = Sanitizer::decodeCharReferences( $html );
      # ENT_QUOTES: it seems wise to escape ' as well as ", as a matter of
      # course. Can't hurt.
      return htmlspecialchars( $decoded, ENT_QUOTES );
  }
  /**
   * Regex replace callback for armoring links against further processing.
   * Every colon in the first capture group is replaced by its numeric
   * character reference.
   *
   * @param $matches Array
   * @return string
   */
  private static function armorLinksCallback( $matches ) {
      $protocol = $matches[1];
      return strtr( $protocol, array( ':' => '&#58;' ) );
  }
  /**
   * Return an associative array of attribute names and values from
   * a partial tag string. Attribute names are forced to lowercase,
   * character references are decoded to UTF-8 text.
   *
   * @param $text String
   * @return Array
   */
  public static function decodeTagAttributes( $text ) {
      $attribs = array();
      if ( trim( $text ) == '' ) {
          return $attribs;
      }

      $pairs = array();
      $found = preg_match_all(
          self::getAttribsRegex(),
          $text,
          $pairs,
          PREG_SET_ORDER
      );
      if ( !$found ) {
          return $attribs;
      }

      foreach ( $pairs as $set ) {
          $name = strtolower( $set[1] );
          $raw = Sanitizer::getTagAttributeCallback( $set );

          // Normalize whitespace: collapse runs to one space, trim the ends
          $collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

          // Decode character references; a repeated name overwrites the
          // earlier entry
          $attribs[$name] = Sanitizer::decodeCharReferences( $collapsed );
      }
      return $attribs;
  }
  /**
   * Pick the appropriate attribute value from a match set from the
   * attribs regex matches.
   *
   * @param $set Array
   * @return String
   */
  private static function getTagAttributeCallback( $set ) {
      # Value-carrying groups, highest index first:
      #   6 - illegal #XXXXXX color with no quotes
      #   5 - no quotes
      #   4 - single-quoted
      #   3 - double-quoted
      foreach ( array( 6, 5, 4, 3 ) as $group ) {
          if ( isset( $set[$group] ) ) {
              return $set[$group];
          }
      }

      if ( !isset( $set[2] ) ) {
          # In XHTML, attributes must have a value.
          # For 'reduced' form, return explicitly the attribute name here.
          return $set[1];
      }

      throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
  }
  /**
   * Normalize whitespace and character references in an XML source-
   * encoded text for an attribute value.
   *
   * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
   * but note that we're not returning the value, but are returning
   * XML source fragments that will be slapped into output.
   *
   * @param $text String
   * @return String
   */
  private static function normalizeAttributeValue( $text ) {
      $normalizedRefs = Sanitizer::normalizeCharReferences( $text );
      $normalizedSpace = self::normalizeWhitespace( $normalizedRefs );
      // Double quotes are escaped last so the result can sit inside "..."
      return str_replace( '"', '&quot;', $normalizedSpace );
  }
  /**
   * Replace each CR+LF pair, and each individual space, carriage return,
   * line feed or tab, with a single space. Runs are not collapsed.
   *
   * @param $text string
   * @return String
   */
  private static function normalizeWhitespace( $text ) {
      $whitespacePattern = '/\r\n|[\x20\x0d\x0a\x09]/';
      return preg_replace( $whitespacePattern, ' ', $text );
  }
  /**
   * Normalizes whitespace in a section name, such as might be returned
   * by Parser::stripSectionName(), for use in the id's that are used for
   * section links. Runs of spaces and underscores become a single space
   * and the result is trimmed.
   *
   * @param $section String
   * @return String
   */
  static function normalizeSectionNameWhitespace( $section ) {
      $collapsed = preg_replace( '/[ _]+/', ' ', $section );
      return trim( $collapsed );
  }
  /**
   * Ensure that any entities and character references are legal
   * for XML and XHTML specifically. Any stray bits will be
   * &amp;-escaped to result in a valid text fragment.
   *
   * a. named char refs can only be &lt; &gt; &amp; &quot;, others are
   *   numericized (this way we're well-formed even without a DTD)
   * b. any numeric char refs must be legal chars, not invalid or forbidden
   * c. use &#x, not &#X
   * d. fix or reject non-valid attributes
   *
   * @param $text String
   * @return String
   * @private
   */
  static function normalizeCharReferences( $text ) {
      $callback = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
      return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
  }
  /**
   * Regex replace callback for normalizeCharReferences(). Dispatches on
   * which capture group matched (1: named entity, 2: decimal reference,
   * 3: hexadecimal reference); anything that could not be normalized is
   * returned HTML-escaped.
   *
   * @param $matches Array
   * @return String
   */
  static function normalizeCharReferencesCallback( $matches ) {
      if ( $matches[1] != '' ) {
          $normalized = Sanitizer::normalizeEntity( $matches[1] );
      } elseif ( $matches[2] != '' ) {
          $normalized = Sanitizer::decCharReference( $matches[2] );
      } elseif ( $matches[3] != '' ) {
          $normalized = Sanitizer::hexCharReference( $matches[3] );
      } else {
          $normalized = null;
      }

      // null means "no valid reference": escape the raw source instead
      return $normalized === null
          ? htmlspecialchars( $matches[0] )
          : $normalized;
  }
  /**
   * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
   * return the equivalent numeric entity reference (except for the core &lt;
   * &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias, returns
   * the HTML equivalent. Otherwise, returns HTML-escaped text of
   * pseudo-entity source (eg &amp;foo;)
   *
   * @param $name String
   * @return String
   */
  static function normalizeEntity( $name ) {
      // Aliases take precedence over everything else
      if ( isset( self::$htmlEntityAliases[$name] ) ) {
          return '&' . self::$htmlEntityAliases[$name] . ';';
      }

      // The core entities are passed through by name
      $coreEntities = array( 'lt', 'gt', 'amp', 'quot' );
      if ( in_array( $name, $coreEntities ) ) {
          return '&' . $name . ';';
      }

      // Other known entities become numeric references
      if ( isset( self::$htmlEntities[$name] ) ) {
          return '&#' . self::$htmlEntities[$name] . ';';
      }

      // Unknown name: escape the ampersand so it shows up as plain text
      return '&amp;' . $name . ';';
  }
  /**
   * Normalize a decimal character reference.
   *
   * @param $codepoint String: the digits of the reference
   * @return null|string The reference in '&#N;' form, or null if the
   *   codepoint fails validateCodepoint()
   */
  static function decCharReference( $codepoint ) {
      $point = intval( $codepoint );
      return Sanitizer::validateCodepoint( $point )
          ? sprintf( '&#%d;', $point )
          : null;
  }
  /**
   * Normalize a hexadecimal character reference.
   *
   * @param $codepoint String: the hex digits of the reference
   * @return null|string The reference in lowercase '&#xN;' form, or null
   *   if the codepoint fails validateCodepoint()
   */
  static function hexCharReference( $codepoint ) {
      $point = hexdec( $codepoint );
      return Sanitizer::validateCodepoint( $point )
          ? sprintf( '&#x%x;', $point )
          : null;
  }
  /**
   * Returns true if a given Unicode codepoint is a valid character in XML.
   *
   * @param $codepoint Integer
   * @return Boolean
   */
  private static function validateCodepoint( $codepoint ) {
      // Tab, line feed and carriage return are the only accepted values
      // below U+0020
      if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
          return true;
      }

      // Remaining accepted values, as inclusive [low, high] ranges
      static $validRanges = array(
          array( 0x20, 0xd7ff ),
          array( 0xe000, 0xfffd ),
          array( 0x10000, 0x10ffff ),
      );
      foreach ( $validRanges as $range ) {
          if ( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
              return true;
          }
      }
      return false;
  }
  /**
   * Decode any character references, numeric or named entities,
   * in the text and return a UTF-8 string.
   *
   * @param $text String
   * @return String
   */
  public static function decodeCharReferences( $text ) {
      $callback = array( 'Sanitizer', 'decodeCharReferencesCallback' );
      return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
  }
  /**
   * Decode any character references, numeric or named entities,
   * in the text and normalize the resulting string. (bug 14952)
   *
   * This is useful for page titles, not for text to be displayed,
   * MediaWiki allows HTML entities to escape normalization as a feature.
   *
   * @param $text String (already normalized, containing entities)
   * @return String (still normalized, without entities)
   */
  public static function decodeCharReferencesAndNormalize( $text ) {
      global $wgContLang;

      $callback = array( 'Sanitizer', 'decodeCharReferencesCallback' );
      $text = preg_replace_callback(
          self::CHAR_REFS_REGEX, $callback, $text, /* limit */ -1, $count );

      // Only re-normalize when at least one replacement was made
      return $count ? $wgContLang->normalize( $text ) : $text;
  }
  /**
   * Regex replace callback for decodeCharReferences(). Dispatches on
   * which capture group matched (1: named entity, 2: decimal reference,
   * 3: hexadecimal reference).
   *
   * @param $matches Array
   * @return String
   */
  static function decodeCharReferencesCallback( $matches ) {
      if ( $matches[1] != '' ) {
          return Sanitizer::decodeEntity( $matches[1] );
      }
      if ( $matches[2] != '' ) {
          return Sanitizer::decodeChar( intval( $matches[2] ) );
      }
      if ( $matches[3] != '' ) {
          return Sanitizer::decodeChar( hexdec( $matches[3] ) );
      }
      # Last case should be an ampersand by itself; keep it untouched
      return $matches[0];
  }
  /**
   * Return UTF-8 string for a codepoint if that is a valid
   * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
   *
   * @param $codepoint Integer
   * @return String
   * @private
   */
  static function decodeChar( $codepoint ) {
      if ( !Sanitizer::validateCodepoint( $codepoint ) ) {
          return UTF8_REPLACEMENT;
      }
      return codepointToUtf8( $codepoint );
  }
  /**
   * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
   * return the UTF-8 encoding of that character. Otherwise, returns
   * pseudo-entity source (eg &foo;)
   *
   * @param $name String
   * @return String
   */
  static function decodeEntity( $name ) {
      // Resolve a MediaWiki-specific alias to the entity name it stands for
      $resolved = isset( self::$htmlEntityAliases[$name] )
          ? self::$htmlEntityAliases[$name]
          : $name;

      if ( !isset( self::$htmlEntities[$resolved] ) ) {
          // Unknown entity: hand back its (alias-resolved) source form
          return '&' . $resolved . ';';
      }
      return codepointToUtf8( self::$htmlEntities[$resolved] );
  }
  /**
   * Fetch the whitelist of acceptable attributes for a given element name.
   * The full whitelist is built on first use and then kept in a static
   * variable for later calls.
   *
   * @param $element String
   * @return Array Allowed attribute names; empty for an unlisted element
   */
  static function attributeWhitelist( $element ) {
      static $list;
      if ( !isset( $list ) ) {
          $list = Sanitizer::setupAttributeWhitelist();
      }

      if ( !isset( $list[$element] ) ) {
          return array();
      }
      return $list[$element];
  }
  /**
   * Foreach array key (an allowed HTML element), return an array
   * of allowed attributes
   *
   * @return Array
   */
  static function setupAttributeWhitelist() {
      global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;

      $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
      if ( $wgAllowRdfaAttributes ) {
          # RDFa attributes as specified in section 9 of
          # http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
          $common = array_merge( $common, array(
              'about', 'property', 'resource', 'datatype', 'typeof',
          ) );
      }
      if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
          # HTML5 microdata attributes as specified by
          # http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
          $common = array_merge( $common, array(
              'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
          ) );
      }

      # Attribute lists shared by several elements
      $block = array_merge( $common, array( 'align' ) );
      $tablealign = array( 'align', 'char', 'charoff', 'valign' );
      $tablecell = array(
          'abbr',
          'axis',
          'headers',
          'scope',
          'rowspan',
          'colspan',
          'nowrap', # deprecated
          'width', # deprecated
          'height', # deprecated
          'bgcolor' # deprecated
      );
      $edit = array_merge( $common, array( 'cite', 'datetime' ) );
      $rowgroup = array_merge( $common, $tablealign );
      $column = array_merge( $common, array( 'span', 'width' ), $tablealign );
      $cell = array_merge( $common, $tablecell, $tablealign );

      # Numbers refer to sections in HTML 4.01 standard describing the element.
      # See: http://www.w3.org/TR/html4/
      # Entries are added in the same order as the sections below.
      $whitelist = array();

      # 7.5.4
      $whitelist['div'] = $block;
      $whitelist['center'] = $common; # deprecated
      $whitelist['span'] = $block; # ??

      # 7.5.5
      foreach ( array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ) as $tag ) {
          $whitelist[$tag] = $block;
      }

      # 7.5.6
      # address

      # 8.2.4
      # bdo

      # 9.2.1 (acronym is not listed)
      $phrase = array( 'em', 'strong', 'cite', 'dfn', 'code', 'samp', 'kbd', 'var', 'abbr' );
      foreach ( $phrase as $tag ) {
          $whitelist[$tag] = $common;
      }

      # 9.2.2 (q is not listed)
      $whitelist['blockquote'] = array_merge( $common, array( 'cite' ) );

      # 9.2.3
      $whitelist['sub'] = $common;
      $whitelist['sup'] = $common;

      # 9.3.1
      $whitelist['p'] = $block;

      # 9.3.2
      $whitelist['br'] = array( 'id', 'class', 'title', 'style', 'clear' );

      # 9.3.4
      $whitelist['pre'] = array_merge( $common, array( 'width' ) );

      # 9.4
      $whitelist['ins'] = $edit;
      $whitelist['del'] = $edit;

      # 10.2
      $whitelist['ul'] = array_merge( $common, array( 'type' ) );
      $whitelist['ol'] = array_merge( $common, array( 'type', 'start' ) );
      $whitelist['li'] = array_merge( $common, array( 'type', 'value' ) );

      # 10.3
      foreach ( array( 'dl', 'dd', 'dt' ) as $tag ) {
          $whitelist[$tag] = $common;
      }

      # 11.2.1
      $whitelist['table'] = array_merge( $common, array(
          'summary', 'width', 'border', 'frame',
          'rules', 'cellspacing', 'cellpadding',
          'align', 'bgcolor',
      ) );

      # 11.2.2 (common attributes plus 'align', i.e. the same list as $block)
      $whitelist['caption'] = $block;

      # 11.2.3
      foreach ( array( 'thead', 'tfoot', 'tbody' ) as $tag ) {
          $whitelist[$tag] = $rowgroup;
      }

      # 11.2.4
      $whitelist['colgroup'] = $column;
      $whitelist['col'] = $column;

      # 11.2.5
      $whitelist['tr'] = array_merge( $common, array( 'bgcolor' ), $tablealign );

      # 11.2.6
      $whitelist['td'] = $cell;
      $whitelist['th'] = $cell;

      # 12.2
      # NOTE: <a> is not allowed directly, but the attrib whitelist is used
      # from the Parser object. rel/rev esp. for RDFa
      $whitelist['a'] = array_merge( $common, array( 'href', 'rel', 'rev' ) );

      # 13.2
      # Not usually allowed, but may be used for extension-style hooks
      # such as <math> when it is rasterized, or if $wgAllowImageTag is
      # true
      $whitelist['img'] = array_merge( $common, array( 'alt', 'src', 'width', 'height' ) );

      # 15.2.1
      $fontStyle = array( 'tt', 'b', 'i', 'big', 'small', 'strike', 's', 'u' );
      foreach ( $fontStyle as $tag ) {
          $whitelist[$tag] = $common;
      }

      # 15.2.2 (basefont is not listed)
      $whitelist['font'] = array_merge( $common, array( 'size', 'color', 'face' ) );

      # 15.3
      $whitelist['hr'] = array_merge( $common, array( 'noshade', 'size', 'width' ) );

      # XHTML Ruby annotation text module, simple ruby only.
      # http://www.w3c.org/TR/ruby/
      # rbc and rtc are not listed; 'rt' does not get 'rbspan'.
      foreach ( array( 'ruby', 'rb', 'rt', 'rp' ) as $tag ) {
          $whitelist[$tag] = $common;
      }

      # MathML root element, where used for extensions
      # 'title' may not be 100% valid here; it's XHTML
      # http://www.w3.org/TR/REC-MathML/
      $whitelist['math'] = array( 'class', 'style', 'id', 'title' );

      return $whitelist;
  }
  /**
   * Take a fragment of (potentially invalid) HTML and return
   * a version with any tags removed, encoded as plain text.
   *
   * Warning: this return value must be further escaped for literal
   * inclusion in HTML output as of 1.10!
   *
   * @param $text String: HTML fragment
   * @return String
   */
  static function stripAllTags( $text ) {
      # Actual <tags>
      $withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

      # Normalize &entities and whitespace
      return self::normalizeWhitespace(
          self::decodeCharReferences( $withoutTags )
      );
  }
  1508. /**
  1509. * Hack up a private DOCTYPE with HTML's standard entity declarations.
  1510. * PHP 4 seemed to know these if you gave it an HTML doctype, but
  1511. * PHP 5.1 doesn't.
  1512. *
  1513. * Use for passing XHTML fragments to PHP's XML parsing functions
  1514. *
  1515. * @return String
  1516. */
  1517. static function hackDocType() {
  1518. $out = "<!DOCTYPE html [\n";
  1519. foreach( self::$htmlEntities as $entity => $codepoint

Large files are truncated, but you can click here to view the full file