PageRenderTime 37ms CodeModel.GetById 11ms RepoModel.GetById 0ms app.codeStats 1ms

/includes/wikiengine/TagSanitizer.php

https://code.google.com/p/enanocms/
PHP | 858 lines | 840 code | 4 blank | 14 comment | 0 complexity | dcef7b31a7103e2836399866cfb3e278 MD5 | raw file
Possible License(s): GPL-2.0
  1. <?php
  2. /*
  3. * Enano - an open-source CMS capable of wiki functions, Drupal-like sidebar blocks, and everything in between
  4. * Copyright (C) 2006-2009 Dan Fuhry
  5. *
  6. * This program is Free Software; you can redistribute and/or modify it under the terms of the GNU General Public License
  7. * as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
  10. * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for details.
  11. *
  12. * This script contains code originally found in MediaWiki (http://www.mediawiki.org). MediaWiki is also licensed under
  13. * the GPLv2 or later; see the file GPL included with this package for details.
  14. */
// Character class matching one character of an attribute name.
$attrib = '[a-zA-Z0-9]';
// Character class matching one whitespace byte: tab, LF, CR or space.
$space = '[\x09\x0a\x0d\x20]';
// Matches character references; used by decodeCharReferences(). Capture groups:
//   1 - named entity (&name;)
//   2 - decimal reference (&#123;)
//   3 - hex reference with lowercase "x" (&#x1F;)
//   4 - hex reference with uppercase "X" (&#X1F;)
//   5 - a lone ampersand
// The pattern uses the /x flag, so the line breaks inside it are not significant.
define( 'MW_CHAR_REFS_REGEX',
'/&([A-Za-z0-9]+);
|&\#([0-9]+);
|&\#x([0-9A-Za-z]+);
|&\#X([0-9A-Za-z]+);
|(&)/x' );
// Matches one attribute inside a tag; used by decodeTagAttributes(). Capture groups:
//   1 - attribute name
//   2 - the whole "= value" part (absent for valueless attributes)
//   3 - double-quoted value
//   4 - single-quoted value
//   5 - unquoted value
//   6 - unquoted "#hex" colour value
// getTagAttributeCallback() relies on this group numbering.
define( 'MW_ATTRIBS_REGEX',
"/(?:^|$space)($attrib+)
($space*=$space*
(?:
# The attribute value: quoted or alone
\"([^<\"]*)\"
| '([^<']*)'
| ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
| (\#[0-9a-fA-F]+) # Technically wrong, but lots of
# colors are specified like this.
# We'll be normalizing it.
)
)?(?=$space|\$)/sx" );
  36. /**
  37. * Take a tag soup fragment listing an HTML element's attributes
  38. * and normalize it to well-formed XML, discarding unwanted attributes.
  39. * Output is safe for further wikitext processing, with escaping of
  40. * values that could trigger problems.
  41. *
  42. * - Normalizes attribute names to lowercase
  43. * - Discards attributes not on a whitelist for the given element
  44. * - Turns broken or invalid entities into plaintext
  45. * - Double-quotes all attribute values
  46. * - Attributes without values are given the name as attribute
  47. * - Double attributes are discarded
  48. * - Unsafe style attributes are discarded
  49. * - Prepends space if there are attributes.
  50. *
  51. * @param string $text
  52. * @param string $element
  53. * @return string
  54. */
  55. function fixTagAttributes( $text, $element ) {
  56. if( trim( $text ) == '' ) {
  57. return '';
  58. }
  59. $stripped = validateTagAttributes(
  60. decodeTagAttributes( $text ), $element );
  61. $attribs = array();
  62. foreach( $stripped as $attribute => $value ) {
  63. $encAttribute = htmlspecialchars( $attribute );
  64. $encValue = safeEncodeAttribute( $value );
  65. $attribs[] = "$encAttribute=".'"'."$encValue".'"'.""; // "
  66. }
  67. return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
  68. }
  69. /**
  70. * Encode an attribute value for HTML tags, with extra armoring
  71. * against further wiki processing.
  72. * @param $text
  73. * @return HTML-encoded text fragment
  74. */
  75. function safeEncodeAttribute( $text ) {
  76. $encValue= encodeAttribute( $text );
  77. # Templates and links may be expanded in later parsing,
  78. # creating invalid or dangerous output. Suppress this.
  79. $encValue = strtr( $encValue, array(
  80. '<' => '&lt;', // This should never happen,
  81. '>' => '&gt;', // we've received invalid input
  82. '"' => '&quot;', // which should have been escaped.
  83. '{' => '&#123;',
  84. '[' => '&#91;',
  85. "''" => '&#39;&#39;',
  86. 'ISBN' => '&#73;SBN',
  87. 'RFC' => '&#82;FC',
  88. 'PMID' => '&#80;MID',
  89. '|' => '&#124;',
  90. '__' => '&#95;_',
  91. ) );
  92. return $encValue;
  93. }
  94. /**
  95. * Encode an attribute value for HTML output.
  96. * @param $text
  97. * @return HTML-encoded text fragment
  98. */
  99. function encodeAttribute( $text ) {
  100. // In Enano 1.0.3, added this cheapo hack to keep ampersands
  101. // from being double-sanitized. Thanks to markybob from #deluge.
  102. // htmlspecialchars() the "manual" way
  103. $encValue = strtr( $text, array(
  104. '&amp;' => '&',
  105. '&quot;' => '"',
  106. '&lt;' => '<',
  107. '&gt;' => '>',
  108. '&#039;' => "'"
  109. ) );
  110. $encValue = strtr( $text, array(
  111. '&' => '&amp;',
  112. '"' => '&quot;',
  113. '<' => '&lt;',
  114. '>' => '&gt;',
  115. "'" => '&#039;'
  116. ) );
  117. // Whitespace is normalized during attribute decoding,
  118. // so if we've been passed non-spaces we must encode them
  119. // ahead of time or they won't be preserved.
  120. $encValue = strtr( $encValue, array(
  121. "\n" => '&#10;',
  122. "\r" => '&#13;',
  123. "\t" => '&#9;',
  124. ) );
  125. return $encValue;
  126. }
  127. function unstripForHTML( $text ) {
  128. global $mStripState;
  129. $text = unstrip( $text, $mStripState );
  130. $text = unstripNoWiki( $text, $mStripState );
  131. return $text;
  132. }
  133. /**
  134. * Always call this after unstrip() to preserve the order
  135. *
  136. * @private
  137. */
  138. function unstripNoWiki( $text, &$state ) {
  139. if ( !isset( $state['nowiki'] ) ) {
  140. return $text;
  141. }
  142. # TODO: good candidate for FSS
  143. $text = strtr( $text, $state['nowiki'] );
  144. return $text;
  145. }
  146. /**
  147. * Take an array of attribute names and values and normalize or discard
  148. * illegal values for the given element type.
  149. *
  150. * - Discards attributes not on a whitelist for the given element
  151. * - Unsafe style attributes are discarded
  152. *
  153. * @param array $attribs
  154. * @param string $element
  155. * @return array
  156. *
  157. * @todo Check for legal values where the DTD limits things.
  158. * @todo Check for unique id attribute :P
  159. */
  160. function validateTagAttributes( $attribs, $element ) {
  161. $whitelist = array_flip( attributeWhitelist( $element ) );
  162. $out = array();
  163. foreach( $attribs as $attribute => $value ) {
  164. if( !isset( $whitelist[$attribute] ) ) {
  165. continue;
  166. }
  167. # Strip javascript "expression" from stylesheets.
  168. # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
  169. if( $attribute == 'style' ) {
  170. $value = checkCss( $value );
  171. if( $value === false ) {
  172. # haxx0r
  173. continue;
  174. }
  175. }
  176. if ( $attribute === 'id' )
  177. $value = escapeId( $value );
  178. // If this attribute was previously set, override it.
  179. // Output should only have one attribute of each name.
  180. $out[$attribute] = $value;
  181. }
  182. return $out;
  183. }
  184. /**
  185. * Pick apart some CSS and check it for forbidden or unsafe structures.
  186. * Returns a sanitized string, or false if it was just too evil.
  187. *
  188. * Currently URL references, 'expression', 'tps' are forbidden.
  189. *
  190. * @param string $value
  191. * @return mixed
  192. */
  193. function checkCss( $value ) {
  194. $stripped = decodeCharReferences( $value );
  195. // Remove any comments; IE gets token splitting wrong
  196. $stripped = preg_replace( '!/\\*.*?\\*/!S', '', $stripped );
  197. $value = $stripped;
  198. // ... and continue checks
  199. $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
  200. 'codepointToUtf8(hexdec("$1"))', $stripped );
  201. $stripped = str_replace( '\\', '', $stripped );
  202. if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
  203. $stripped ) ) {
  204. # haxx0r
  205. return false;
  206. }
  207. return $value;
  208. }
  209. /**
  210. * Decode any character references, numeric or named entities,
  211. * in the text and return a UTF-8 string.
  212. *
  213. * @param string $text
  214. * @return string
  215. * @access public
  216. * @static
  217. */
  218. function decodeCharReferences( $text ) {
  219. return preg_replace_callback(
  220. MW_CHAR_REFS_REGEX,
  221. 'decodeCharReferencesCallback',
  222. $text );
  223. }
  224. /**
  225. * Fetch the whitelist of acceptable attributes for a given
  226. * element name.
  227. *
  228. * @param string $element
  229. * @return array
  230. */
  231. function attributeWhitelist( $element ) {
  232. static $list;
  233. if( !isset( $list ) ) {
  234. $list = setupAttributeWhitelist();
  235. }
  236. return isset( $list[$element] )
  237. ? $list[$element]
  238. : array();
  239. }
/**
 * Build the map of element name => list of attribute names accepted on
 * that element.
 *
 * After the built-in table is assembled, every code string returned by the
 * 'html_attribute_whitelist' plugin hook is eval()'d in this function's
 * scope, so plugin code can read and modify $whitelist (and the helper
 * lists) directly before the map is returned.
 *
 * @return array
 */
function setupAttributeWhitelist() {
  // None of these globals is used by the code below.
  // NOTE(review): presumably they are imported so that eval()'d plugin code
  // can reach them - confirm before removing any.
  global $db, $session, $paths, $template, $plugins;
  // Attributes accepted on nearly every element.
  $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
  // Common attributes plus 'align'.
  $block = array_merge( $common, array( 'align' ) );
  // Alignment attributes shared by table sections, rows, columns and cells.
  $tablealign = array( 'align', 'char', 'charoff', 'valign' );
  // Attributes specific to table cells (td/th).
  $tablecell = array( 'abbr',
    'axis',
    'headers',
    'scope',
    'rowspan',
    'colspan',
    'nowrap', # deprecated
    'width', # deprecated
    'height', # deprecated
    'bgcolor' # deprecated
  );
  # Numbers refer to sections in HTML 4.01 standard describing the element.
  # See: http://www.w3.org/TR/html4/
  $whitelist = array (
    # 7.5.4
    'div' => $block,
    'center' => $common, # deprecated
    'span' => $block, # ??
    # 7.5.5
    'h1' => $block,
    'h2' => $block,
    'h3' => $block,
    'h4' => $block,
    'h5' => $block,
    'h6' => $block,
    # 7.5.6
    # address
    # 8.2.4
    # bdo
    # 9.2.1
    'em' => $common,
    'strong' => $common,
    'cite' => $common,
    # dfn
    'code' => $common,
    # samp
    # kbd
    'var' => $common,
    # abbr
    # acronym
    # 9.2.2
    'blockquote' => array_merge( $common, array( 'cite' ) ),
    # q
    # 9.2.3
    'sub' => $common,
    'sup' => $common,
    # 9.3.1
    'p' => $block,
    # 9.3.2
    'br' => array( 'id', 'class', 'title', 'style', 'clear' ),
    # 9.3.4
    'pre' => array_merge( $common, array( 'width' ) ),
    # 9.4
    'ins' => array_merge( $common, array( 'cite', 'datetime' ) ),
    'del' => array_merge( $common, array( 'cite', 'datetime' ) ),
    # 10.2
    'ul' => array_merge( $common, array( 'type' ) ),
    'ol' => array_merge( $common, array( 'type', 'start' ) ),
    'li' => array_merge( $common, array( 'type', 'value' ) ),
    # 10.3
    'dl' => $common,
    'dd' => $common,
    'dt' => $common,
    # 11.2.1
    'table' => array_merge( $common,
      array( 'summary', 'width', 'border', 'frame',
        'rules', 'cellspacing', 'cellpadding',
        'align', 'bgcolor',
      ) ),
    # 11.2.2
    'caption' => array_merge( $common, array( 'align' ) ),
    # 11.2.3
    'thead' => array_merge( $common, $tablealign ),
    'tfoot' => array_merge( $common, $tablealign ),
    'tbody' => array_merge( $common, $tablealign ),
    # 11.2.4
    'colgroup' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
    'col' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
    # 11.2.5
    'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),
    # 11.2.6
    'td' => array_merge( $common, $tablecell, $tablealign ),
    'th' => array_merge( $common, $tablecell, $tablealign ),
    # 12.2
    # added by dan
    'a' => array_merge( $common, array( 'href', 'name' ) ),
    # 13.2
    # added by dan
    'img' => array_merge( $common, array( 'src', 'width', 'height', 'alt' ) ),
    # 15.2.1
    'tt' => $common,
    'b' => $common,
    'i' => $common,
    'big' => $common,
    'small' => $common,
    'strike' => $common,
    's' => $common,
    'u' => $common,
    # 15.2.2
    'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),
    # basefont
    # 15.3
    'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
    # XHTML Ruby annotation text module, simple ruby only.
    # http://www.w3c.org/TR/ruby/
    'ruby' => $common,
    # rbc
    # rtc
    'rb' => $common,
    'rt' => $common, #array_merge( $common, array( 'rbspan' ) ),
    'rp' => $common,
    # For compatibility with the XHTML parser.
    'nowiki' => array(),
    'noinclude' => array(),
    'nodisplay' => array(),
    'lang' => array('code'),
    # XHTML stuff
    'acronym' => $common
  );
  // custom tags can be added by plugins
  // NOTE(review): the hook's code strings run through eval() with full access
  // to this scope; renaming any local above would break plugins relying on it.
  $code = $plugins->setHook('html_attribute_whitelist');
  foreach ( $code as $cmd )
  {
    eval($cmd);
  }
  return $whitelist;
}
  376. /**
  377. * Given a value escape it so that it can be used in an id attribute and
  378. * return it, this does not validate the value however (see first link)
  379. *
  380. * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
  381. * in the id and
  382. * name attributes
  383. * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
  384. *
  385. * @bug 4461
  386. *
  387. * @static
  388. *
  389. * @param string $id
  390. * @return string
  391. */
  392. function escapeId( $id ) {
  393. static $replace = array(
  394. '%3A' => ':',
  395. '%' => '.'
  396. );
  397. $id = urlencode( decodeCharReferences( strtr( $id, ' ', '_' ) ) );
  398. return str_replace( array_keys( $replace ), array_values( $replace ), $id );
  399. }
  400. /**
  401. * More or less "markup-safe" explode()
  402. * Ignores any instances of the separator inside <...>
  403. * @param string $separator
  404. * @param string $text
  405. * @return array
  406. */
  407. function wfExplodeMarkup( $separator, $text ) {
  408. $placeholder = "\x00";
  409. // Just in case...
  410. $text = str_replace( $placeholder, '', $text );
  411. // Trim stuff
  412. $replacer = new ReplacerCallback( $separator, $placeholder );
  413. $cleaned = preg_replace_callback( '/(<.*?>)/', array( $replacer, 'go' ), $text );
  414. $items = explode( $separator, $cleaned );
  415. foreach( $items as $i => $str ) {
  416. $items[$i] = str_replace( $placeholder, $separator, $str );
  417. }
  418. return $items;
  419. }
  420. class ReplacerCallback {
  421. function ReplacerCallback( $from, $to ) {
  422. $this->from = $from;
  423. $this->to = $to;
  424. }
  425. function go( $matches ) {
  426. return str_replace( $this->from, $this->to, $matches[1] );
  427. }
  428. }
  429. /**
  430. * Return an associative array of attribute names and values from
  431. * a partial tag string. Attribute names are forces to lowercase,
  432. * character references are decoded to UTF-8 text.
  433. *
  434. * @param string
  435. * @return array
  436. */
  437. function decodeTagAttributes( $text ) {
  438. $attribs = array();
  439. if( trim( $text ) == '' ) {
  440. return $attribs;
  441. }
  442. $pairs = array();
  443. if( !preg_match_all(
  444. MW_ATTRIBS_REGEX,
  445. $text,
  446. $pairs,
  447. PREG_SET_ORDER ) ) {
  448. return $attribs;
  449. }
  450. foreach( $pairs as $set ) {
  451. $attribute = strtolower( $set[1] );
  452. $value = getTagAttributeCallback( $set );
  453. // Normalize whitespace
  454. $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
  455. $value = trim( $value );
  456. // Decode character references
  457. $attribs[$attribute] = decodeCharReferences( $value );
  458. }
  459. return $attribs;
  460. }
  461. /**
  462. * Pick the appropriate attribute value from a match set from the
  463. * MW_ATTRIBS_REGEX matches.
  464. *
  465. * @param array $set
  466. * @return string
  467. * @access private
  468. */
  469. function getTagAttributeCallback( $set ) {
  470. if( isset( $set[6] ) ) {
  471. # Illegal #XXXXXX color with no quotes.
  472. return $set[6];
  473. } elseif( isset( $set[5] ) ) {
  474. # No quotes.
  475. return $set[5];
  476. } elseif( isset( $set[4] ) ) {
  477. # Single-quoted
  478. return $set[4];
  479. } elseif( isset( $set[3] ) ) {
  480. # Double-quoted
  481. return $set[3];
  482. } elseif( !isset( $set[2] ) ) {
  483. # In XHTML, attributes must have a value.
  484. # For 'reduced' form, return explicitly the attribute name here.
  485. return $set[1];
  486. } else {
  487. die_friendly('Parser error', "<p>Tag conditions not met. This should never happen and is a bug.</p>" );
  488. }
  489. }
  490. /**
  491. * Strips and renders nowiki, pre, math, hiero
  492. * If $render is set, performs necessary rendering operations on plugins
  493. * Returns the text, and fills an array with data needed in unstrip()
  494. * If the $state is already a valid strip state, it adds to the state
  495. *
  496. * @param bool $stripcomments when set, HTML comments <!-- like this -->
  497. * will be stripped in addition to other tags. This is important
  498. * for section editing, where these comments cause confusion when
  499. * counting the sections in the wikisource
  500. *
  501. * @param array dontstrip contains tags which should not be stripped;
  502. * used to prevent stipping of <gallery> when saving (fixes bug 2700)
  503. *
  504. * @access private
  505. */
  506. function mwStrip( $text, &$state, $stripcomments = false , $dontstrip = array () ) {
  507. global $wgRandomKey;
  508. $render = true;
  509. $wgRandomKey = "\x07UNIQ" . dechex(mt_rand(0, 0x7fffffff)) . dechex(mt_rand(0, 0x7fffffff));
  510. $uniq_prefix =& $wgRandomKey;
  511. $commentState = array();
  512. $elements = array( 'nowiki', 'gallery' );
  513. # Removing $dontstrip tags from $elements list (currently only 'gallery', fixing bug 2700)
  514. foreach ( $elements AS $k => $v ) {
  515. if ( !in_array ( $v , $dontstrip ) ) continue;
  516. unset ( $elements[$k] );
  517. }
  518. $matches = array();
  519. $text = extractTagsAndParams( $elements, $text, $matches, $uniq_prefix );
  520. foreach( $matches as $marker => $data ) {
  521. list( $element, $content, $params, $tag ) = $data;
  522. if( $render ) {
  523. $tagName = strtolower( $element );
  524. switch( $tagName ) {
  525. case '!--':
  526. // Comment
  527. if( substr( $tag, -3 ) == '-->' ) {
  528. $output = $tag;
  529. } else {
  530. // Unclosed comment in input.
  531. // Close it so later stripping can remove it
  532. $output = "$tag-->";
  533. }
  534. break;
  535. case 'html':
  536. if( $wgRawHtml ) {
  537. $output = $content;
  538. break;
  539. }
  540. // Shouldn't happen otherwise. :)
  541. case 'nowiki':
  542. $output = wfEscapeHTMLTagsOnly( $content );
  543. break;
  544. default:
  545. }
  546. } else {
  547. // Just stripping tags; keep the source
  548. $output = $tag;
  549. }
  550. // Unstrip the output, because unstrip() is no longer recursive so
  551. // it won't do it itself
  552. $output = unstrip( $output, $state );
  553. if( !$stripcomments && $element == '!--' ) {
  554. $commentState[$marker] = $output;
  555. } elseif ( $element == 'html' || $element == 'nowiki' ) {
  556. $state['nowiki'][$marker] = $output;
  557. } else {
  558. $state['general'][$marker] = $output;
  559. }
  560. }
  561. # Unstrip comments unless explicitly told otherwise.
  562. # (The comments are always stripped prior to this point, so as to
  563. # not invoke any extension tags / parser hooks contained within
  564. # a comment.)
  565. if ( !$stripcomments ) {
  566. // Put them all back and forget them
  567. $text = strtr( $text, $commentState );
  568. }
  569. return $text;
  570. }
  571. /**
  572. * Replaces all occurrences of HTML-style comments and the given tags
  573. * in the text with a random marker and returns teh next text. The output
  574. * parameter $matches will be an associative array filled with data in
  575. * the form:
  576. * 'UNIQ-xxxxx' => array(
  577. * 'element',
  578. * 'tag content',
  579. * array( 'param' => 'x' ),
  580. * '<element param="x">tag content</element>' ) )
  581. *
  582. * @param $elements list of element names. Comments are always extracted.
  583. * @param $text Source text string.
  584. * @param $uniq_prefix
  585. *
  586. * @access private
  587. * @static
  588. */
  589. function extractTagsAndParams($elements, $text, &$matches, $uniq_prefix = ''){
  590. static $n = 1;
  591. $stripped = '';
  592. $matches = array();
  593. $taglist = implode( '|', $elements );
  594. $start = "/<($taglist)(\\s+[^>]*?|\\s*?)(\/?>)|<(!--)/i";
  595. while ( '' != $text ) {
  596. $p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE );
  597. $stripped .= $p[0];
  598. if( count( $p ) < 5 ) {
  599. break;
  600. }
  601. if( count( $p ) > 5 ) {
  602. // comment
  603. $element = $p[4];
  604. $attributes = '';
  605. $close = '';
  606. $inside = $p[5];
  607. } else {
  608. // tag
  609. $element = $p[1];
  610. $attributes = $p[2];
  611. $close = $p[3];
  612. $inside = $p[4];
  613. }
  614. $marker = "$uniq_prefix-$element-" . sprintf('%08X', $n++) . '-QINU';
  615. $stripped .= $marker;
  616. if ( $close === '/>' ) {
  617. // Empty element tag, <tag />
  618. $content = null;
  619. $text = $inside;
  620. $tail = null;
  621. } else {
  622. if( $element == '!--' ) {
  623. $end = '/(-->)/';
  624. } else {
  625. $end = "/(<\\/$element\\s*>)/i";
  626. }
  627. $q = preg_split( $end, $inside, 2, PREG_SPLIT_DELIM_CAPTURE );
  628. $content = $q[0];
  629. if( count( $q ) < 3 ) {
  630. # No end tag -- let it run out to the end of the text.
  631. $tail = '';
  632. $text = '';
  633. } else {
  634. $tail = $q[1];
  635. $text = $q[2];
  636. }
  637. }
  638. $matches[$marker] = array( $element,
  639. $content,
  640. decodeTagAttributes( $attributes ),
  641. "<$element$attributes$close$content$tail" );
  642. }
  643. return $stripped;
  644. }
  645. /**
  646. * Escape html tags
  647. * Basically replacing " > and < with HTML entities ( &quot;, &gt;, &lt;)
  648. *
  649. * @param $in String: text that might contain HTML tags.
  650. * @return string Escaped string
  651. */
  652. function wfEscapeHTMLTagsOnly( $in ) {
  653. return str_replace(
  654. array( '"', '>', '<' ),
  655. array( '&quot;', '&gt;', '&lt;' ),
  656. $in );
  657. }
  658. /**
  659. * Restores pre, math, and other extensions removed by strip()
  660. *
  661. * always call unstripNoWiki() after this one
  662. * @private
  663. */
  664. function unstrip( $text, &$state ) {
  665. if ( !isset( $state['general'] ) ) {
  666. return $text;
  667. }
  668. # TODO: good candidate for FSS
  669. $text = strtr( $text, $state['general'] );
  670. return $text;
  671. }
  672. /**
  673. * Return UTF-8 string for a codepoint if that is a valid
  674. * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
  675. * @param int $codepoint
  676. * @return string
  677. * @private
  678. */
  679. function decodeChar( $codepoint ) {
  680. if( validateCodepoint( $codepoint ) ) {
  681. return codepointToUtf8( $codepoint );
  682. } else {
  683. return UTF8_REPLACEMENT;
  684. }
  685. }
  686. /**
  687. * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
  688. * return the UTF-8 encoding of that character. Otherwise, returns
  689. * pseudo-entity source (eg &foo;)
  690. *
  691. * @param string $name
  692. * @return string
  693. */
  694. function decodeEntity( $name ) {
  695. global $wgHtmlEntities;
  696. if( isset( $wgHtmlEntities[$name] ) ) {
  697. return codepointToUtf8( $wgHtmlEntities[$name] );
  698. } else {
  699. return "&$name;";
  700. }
  701. }
  702. /**
  703. * Returns true if a given Unicode codepoint is a valid character in XML.
  704. * @param int $codepoint
  705. * @return bool
  706. */
  707. function validateCodepoint( $codepoint ) {
  708. return ($codepoint == 0x09)
  709. || ($codepoint == 0x0a)
  710. || ($codepoint == 0x0d)
  711. || ($codepoint >= 0x20 && $codepoint <= 0xd7ff)
  712. || ($codepoint >= 0xe000 && $codepoint <= 0xfffd)
  713. || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
  714. }
  715. /**
  716. * Return UTF-8 sequence for a given Unicode code point.
  717. * May die if fed out of range data.
  718. *
  719. * @param $codepoint Integer:
  720. * @return String
  721. * @public
  722. */
  723. function codepointToUtf8( $codepoint ) {
  724. if($codepoint < 0x80) return chr($codepoint);
  725. if($codepoint < 0x800) return chr($codepoint >> 6 & 0x3f | 0xc0) .
  726. chr($codepoint & 0x3f | 0x80);
  727. if($codepoint < 0x10000) return chr($codepoint >> 12 & 0x0f | 0xe0) .
  728. chr($codepoint >> 6 & 0x3f | 0x80) .
  729. chr($codepoint & 0x3f | 0x80);
  730. if($codepoint < 0x110000) return chr($codepoint >> 18 & 0x07 | 0xf0) .
  731. chr($codepoint >> 12 & 0x3f | 0x80) .
  732. chr($codepoint >> 6 & 0x3f | 0x80) .
  733. chr($codepoint & 0x3f | 0x80);
  734. echo "Asked for code outside of range ($codepoint)\n";
  735. die( -1 );
  736. }
  737. /**
  738. * @param string $matches
  739. * @return string
  740. */
  741. function decodeCharReferencesCallback( $matches ) {
  742. if( $matches[1] != '' ) {
  743. return decodeEntity( $matches[1] );
  744. } elseif( $matches[2] != '' ) {
  745. return decodeChar( intval( $matches[2] ) );
  746. } elseif( $matches[3] != '' ) {
  747. return decodeChar( hexdec( $matches[3] ) );
  748. } elseif( $matches[4] != '' ) {
  749. return decodeChar( hexdec( $matches[4] ) );
  750. }
  751. # Last case should be an ampersand by itself
  752. return $matches[0];
  753. }