PageRenderTime 48ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 1ms

/common/libraries/plugin/wiki/mediawiki/Sanitizer.php

https://bitbucket.org/renaatdemuynck/chamilo
PHP | 1271 lines | 1217 code | 12 blank | 42 comment | 1 complexity | bede5880d4db9185511589f62b2d8b88 MD5 | raw file
Possible License(s): BSD-3-Clause, LGPL-2.1, LGPL-3.0, GPL-3.0, MIT, GPL-2.0
  1. <?php
  2. /**
  3. * XHTML sanitizer for MediaWiki
  4. *
  5. * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
  6. * http://www.mediawiki.org/
  7. *
  8. * This program is free software; you can redistribute it and/or modify
  9. * it under the terms of the GNU General Public License as published by
  10. * the Free Software Foundation; either version 2 of the License, or
  11. * (at your option) any later version.
  12. *
  13. * This program is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. * GNU General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU General Public License along
  19. * with this program; if not, write to the Free Software Foundation, Inc.,
  20. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21. * http://www.gnu.org/copyleft/gpl.html
  22. *
  23. * @file
  24. * @ingroup Parser
  25. */
  /**
   * Regular expression to match various types of character references in
   * MediawikiSanitizer::normalizeCharReferences and
   * MediawikiSanitizer::decodeCharReferences.
   *
   * Capture groups:
   *  1. name of a named entity            (&name;)
   *  2. digits of a decimal reference     (&#123;)
   *  3. digits of a hex reference         (&#x1f;)
   *  4. digits of an upper-case-X hex ref (&#X1F;)
   *  5. an ampersand that starts none of the above
   *
   * The hex groups accept hexadecimal digits only. Anything else (for
   * example "&#x41g;") is not a character reference, so only its leading
   * ampersand is matched, through group 5.
   */
  define('MW_CHAR_REFS_REGEX', '/&([A-Za-z0-9\x80-\xff]+);
      |&\#([0-9]+);
      |&\#x([0-9A-Fa-f]+);
      |&\#X([0-9A-Fa-f]+);
      |(&)/x');
  /**
   * Regular expression to match HTML/XML attribute pairs within a tag.
   * It is deliberately lenient about what it accepts.
   * Used in MediawikiSanitizer::fixTagAttributes and
   * MediawikiSanitizer::decodeTagAttributes.
   *
   * Capture groups: 1 = attribute name, 2 = the whole "= value" part,
   * 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value,
   * 6 = unquoted "#hex" value.
   *
   * The pattern is compiled with the "x" flag, so the line breaks and the
   * "#" comments below are part of the pattern text but are ignored by PCRE.
   */
  $attrib = '[A-Za-z0-9]';
  $space = '[\x09\x0a\x0d\x20]';
  define('MW_ATTRIBS_REGEX', implode("\n", array(
      '/(?:^|' . $space . ')(' . $attrib . '+)',
      '(' . $space . '*=' . $space . '*',
      '(?:',
      '# The attribute value: quoted or alone',
      '"([^<"]*)"',
      '| \'([^<\']*)\'',
      '| ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)',
      '| (\\#[0-9a-fA-F]+) # Technically wrong, but lots of',
      '# colors are specified like this.',
      '# We\'ll be normalizing it.',
      ')',
      ')?(?=' . $space . '|$)/sx',
  )));
  /**
   * List of all named character entities defined in HTML 4.01
   * http://www.w3.org/TR/html4/sgml/entities.html
   *
   * Maps each entity name (case-sensitive, without the surrounding "&" and
   * ";") to its decimal code point. MediawikiSanitizer::normalizeEntity
   * only tests whether a name is present as a key.
   * @private
   */
  global $wgHtmlEntities;
  $wgHtmlEntities = array('Aacute' => 193, 'aacute' => 225, 'Acirc' => 194, 'acirc' => 226, 'acute' => 180,
  'AElig' => 198, 'aelig' => 230, 'Agrave' => 192, 'agrave' => 224, 'alefsym' => 8501, 'Alpha' => 913,
  'alpha' => 945, 'amp' => 38, 'and' => 8743, 'ang' => 8736, 'Aring' => 197, 'aring' => 229, 'asymp' => 8776,
  'Atilde' => 195, 'atilde' => 227, 'Auml' => 196, 'auml' => 228, 'bdquo' => 8222, 'Beta' => 914, 'beta' => 946,
  'brvbar' => 166, 'bull' => 8226, 'cap' => 8745, 'Ccedil' => 199, 'ccedil' => 231, 'cedil' => 184, 'cent' => 162,
  'Chi' => 935, 'chi' => 967, 'circ' => 710, 'clubs' => 9827, 'cong' => 8773, 'copy' => 169, 'crarr' => 8629,
  'cup' => 8746, 'curren' => 164, 'dagger' => 8224, 'Dagger' => 8225, 'darr' => 8595, 'dArr' => 8659, 'deg' => 176,
  'Delta' => 916, 'delta' => 948, 'diams' => 9830, 'divide' => 247, 'Eacute' => 201, 'eacute' => 233,
  'Ecirc' => 202, 'ecirc' => 234, 'Egrave' => 200, 'egrave' => 232, 'empty' => 8709, 'emsp' => 8195,
  'ensp' => 8194, 'Epsilon' => 917, 'epsilon' => 949, 'equiv' => 8801, 'Eta' => 919, 'eta' => 951, 'ETH' => 208,
  'eth' => 240, 'Euml' => 203, 'euml' => 235, 'euro' => 8364, 'exist' => 8707, 'fnof' => 402, 'forall' => 8704,
  'frac12' => 189, 'frac14' => 188, 'frac34' => 190, 'frasl' => 8260, 'Gamma' => 915, 'gamma' => 947, 'ge' => 8805,
  'gt' => 62, 'harr' => 8596, 'hArr' => 8660, 'hearts' => 9829, 'hellip' => 8230, 'Iacute' => 205, 'iacute' => 237,
  'Icirc' => 206, 'icirc' => 238, 'iexcl' => 161, 'Igrave' => 204, 'igrave' => 236, 'image' => 8465,
  'infin' => 8734, 'int' => 8747, 'Iota' => 921, 'iota' => 953, 'iquest' => 191, 'isin' => 8712, 'Iuml' => 207,
  'iuml' => 239, 'Kappa' => 922, 'kappa' => 954, 'Lambda' => 923, 'lambda' => 955, 'lang' => 9001, 'laquo' => 171,
  'larr' => 8592, 'lArr' => 8656, 'lceil' => 8968, 'ldquo' => 8220, 'le' => 8804, 'lfloor' => 8970,
  'lowast' => 8727, 'loz' => 9674, 'lrm' => 8206, 'lsaquo' => 8249, 'lsquo' => 8216, 'lt' => 60, 'macr' => 175,
  'mdash' => 8212, 'micro' => 181, 'middot' => 183, 'minus' => 8722, 'Mu' => 924, 'mu' => 956, 'nabla' => 8711,
  'nbsp' => 160, 'ndash' => 8211, 'ne' => 8800, 'ni' => 8715, 'not' => 172, 'notin' => 8713, 'nsub' => 8836,
  'Ntilde' => 209, 'ntilde' => 241, 'Nu' => 925, 'nu' => 957, 'Oacute' => 211, 'oacute' => 243, 'Ocirc' => 212,
  'ocirc' => 244, 'OElig' => 338, 'oelig' => 339, 'Ograve' => 210, 'ograve' => 242, 'oline' => 8254,
  'Omega' => 937, 'omega' => 969, 'Omicron' => 927, 'omicron' => 959, 'oplus' => 8853, 'or' => 8744, 'ordf' => 170,
  'ordm' => 186, 'Oslash' => 216, 'oslash' => 248, 'Otilde' => 213, 'otilde' => 245, 'otimes' => 8855,
  'Ouml' => 214, 'ouml' => 246, 'para' => 182, 'part' => 8706, 'permil' => 8240, 'perp' => 8869, 'Phi' => 934,
  'phi' => 966, 'Pi' => 928, 'pi' => 960, 'piv' => 982, 'plusmn' => 177, 'pound' => 163, 'prime' => 8242,
  'Prime' => 8243, 'prod' => 8719, 'prop' => 8733, 'Psi' => 936, 'psi' => 968, 'quot' => 34, 'radic' => 8730,
  'rang' => 9002, 'raquo' => 187, 'rarr' => 8594, 'rArr' => 8658, 'rceil' => 8969, 'rdquo' => 8221, 'real' => 8476,
  'reg' => 174, 'rfloor' => 8971, 'Rho' => 929, 'rho' => 961, 'rlm' => 8207, 'rsaquo' => 8250, 'rsquo' => 8217,
  'sbquo' => 8218, 'Scaron' => 352, 'scaron' => 353, 'sdot' => 8901, 'sect' => 167, 'shy' => 173, 'Sigma' => 931,
  'sigma' => 963, 'sigmaf' => 962, 'sim' => 8764, 'spades' => 9824, 'sub' => 8834, 'sube' => 8838, 'sum' => 8721,
  'sup' => 8835, 'sup1' => 185, 'sup2' => 178, 'sup3' => 179, 'supe' => 8839, 'szlig' => 223, 'Tau' => 932,
  'tau' => 964, 'there4' => 8756, 'Theta' => 920, 'theta' => 952, 'thetasym' => 977, 'thinsp' => 8201,
  'THORN' => 222, 'thorn' => 254, 'tilde' => 732, 'times' => 215, 'trade' => 8482, 'Uacute' => 218,
  'uacute' => 250, 'uarr' => 8593, 'uArr' => 8657, 'Ucirc' => 219, 'ucirc' => 251, 'Ugrave' => 217,
  'ugrave' => 249, 'uml' => 168, 'upsih' => 978, 'Upsilon' => 933, 'upsilon' => 965, 'Uuml' => 220, 'uuml' => 252,
  'weierp' => 8472, 'Xi' => 926, 'xi' => 958, 'Yacute' => 221, 'yacute' => 253, 'yen' => 165, 'Yuml' => 376,
  'yuml' => 255, 'Zeta' => 918, 'zeta' => 950, 'zwj' => 8205, 'zwnj' => 8204);
  /**
   * Character entity aliases accepted by MediaWiki.
   *
   * Each key is an alternative spelling of an entity name; the value is
   * the standard name that MediawikiSanitizer::normalizeEntity emits in
   * its place.
   */
  global $wgHtmlEntityAliases;
  $wgHtmlEntityAliases = array(
      'רלמ' => 'rlm',
      'رلم' => 'rlm',
  );
  103. /**
  104. * XHTML sanitizer for MediaWiki
  105. * @ingroup Parser
  106. */
  107. class MediawikiSanitizer
  108. {
  /**
   * Cleans up HTML, removes dangerous tags and attributes, and
   * removes HTML comments.
   *
   * The text is split on '<'. A fragment that parses as a tag whose name
   * is on the whitelist is re-emitted with its attributes run through
   * fixTagAttributes(); every other '<' is escaped to '&lt;'. Unless
   * $wgUseTidy is set, a tag stack is also kept so that badly nested
   * tags are escaped and tags still open at the end get closed.
   *
   * @private
   * @param string $text
   * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
   * @param array $args for the processing callback
   * @param array $extratags extra tag names to accept, for this call only, as tags that must be closed
   * @return string
   */
  static function removeHTMLtags($text, $processCallback = null, $args = array(), $extratags = array())
  {
      global $wgUseTidy;
      static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags, $htmllist, $listtags,
          $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
      wfProfileIn(__METHOD__);
      if (! $staticInitialised)
      {
          $htmlpairsStatic = array(# Tags that must be closed
              'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub',
              'sup', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', 'strike', 'strong', 'tt',
              'var', 'div', 'center', 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', 'ruby', 'rt',
              'rb', 'rp', 'p', 'span');
          $htmlsingle = array('br', 'hr', 'li', 'dt', 'dd');
          $htmlsingleonly = array(# Elements that cannot have close tags
              'br', 'hr');
          $htmlnest = array(# Tags that can be nested--??
              'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', 'dl', 'font', 'big', 'small',
              'sub', 'sup', 'span');
          $tabletags = array(# Can only appear inside table, we will close them
              'td', 'th', 'tr');
          $htmllist = array(# Tags used by list
              'ul', 'ol');
          $listtags = array(# Tags that can appear in a list
              'li');
          $htmlsingleallowed = array_merge($htmlsingle, $tabletags);
          $htmlelementsStatic = array_merge($htmlsingle, $htmlpairsStatic, $htmlnest);
          # Convert them all to hashtables for faster lookup
          $vars = array('htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags', 'htmllist',
              'listtags', 'htmlsingleallowed', 'htmlelementsStatic');
          foreach ($vars as $var)
          {
              $$var = array_flip($$var);
          }
          $staticInitialised = true;
      }
      # The built-in tables above are cached across calls, but $extratags
      # belongs to this call only. Merge it in afresh each time, otherwise
      # whatever the first caller passed would stick for the whole request.
      $extratags = array_flip($extratags);
      $htmlpairs = $htmlpairsStatic + $extratags;
      $htmlelements = $htmlelementsStatic + $extratags;
      # Remove HTML comments
      $text = self::removeHTMLcomments($text);
      $bits = explode('<', $text);
      $text = str_replace('>', '&gt;', array_shift($bits));
      if (! $wgUseTidy)
      {
          $tagstack = $tablestack = array();
          foreach ($bits as $x)
          {
              $regs = array();
              if (! preg_match('!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs))
              {
                  # Does not look like a tag at all: keep it as literal text.
                  $text .= '&lt;' . str_replace('>', '&gt;', $x);
                  continue;
              }
              list( /* $qbar */, $slash, $t, $params, $brace, $rest) = $regs;
              $t = strtolower($t);
              $badtag = 0;
              if (isset($htmlelements[$t]))
              {
                  # Check our stack
                  if ($slash)
                  {
                      # Closing a tag...
                      if (isset($htmlsingleonly[$t]))
                      {
                          $badtag = 1;
                      }
                      elseif (($ot = @array_pop($tagstack)) != $t)
                      {
                          if (isset($htmlsingleallowed[$ot]))
                          {
                              # Pop all elements with an optional close tag
                              # and see if we find a match below them
                              $optstack = array();
                              array_push($optstack, $ot);
                              while ((($ot = @array_pop($tagstack)) != $t) && isset($htmlsingleallowed[$ot]))
                              {
                                  array_push($optstack, $ot);
                              }
                              if ($t != $ot)
                              {
                                  # No match. Push the optional elements back again
                                  $badtag = 1;
                                  while ($ot = @array_pop($optstack))
                                  {
                                      array_push($tagstack, $ot);
                                  }
                              }
                          }
                          else
                          {
                              @array_push($tagstack, $ot);
                              # <li> can be nested in <ul> or <ol>, skip those cases:
                              if (! (isset($htmllist[$ot]) && isset($listtags[$t])))
                              {
                                  $badtag = 1;
                              }
                          }
                      }
                      else
                      {
                          if ($t == 'table')
                          {
                              # Leaving a table: restore the stack saved when it was opened.
                              $tagstack = array_pop($tablestack);
                          }
                      }
                      $newparams = '';
                  }
                  else
                  {
                      # Keep track for later
                      if (isset($tabletags[$t]) && ! in_array('table', $tagstack))
                      {
                          $badtag = 1;
                      }
                      elseif (in_array($t, $tagstack) && ! isset($htmlnest[$t]))
                      {
                          $badtag = 1;
                      }
                      elseif ($brace == '/>' && isset($htmlpairs[$t]))
                      {
                          # Is it a self closed htmlpair ? (bug 5487)
                          $badtag = 1;
                      }
                      elseif (isset($htmlsingleonly[$t]))
                      {
                          # Hack to force empty tag for uncloseable elements
                          $brace = '/>';
                      }
                      elseif (isset($htmlsingle[$t]))
                      {
                          # Hack to not close $htmlsingle tags
                          $brace = NULL;
                      }
                      elseif (isset($tabletags[$t]) && in_array($t, $tagstack))
                      {
                          // New table tag but forgot to close the previous one
                          $text .= "</$t>";
                      }
                      else
                      {
                          if ($t == 'table')
                          {
                              # A table gets a fresh stack; the enclosing one is parked.
                              array_push($tablestack, $tagstack);
                              $tagstack = array();
                          }
                          array_push($tagstack, $t);
                      }
                      # Replace any variables or template parameters with
                      # plaintext results.
                      if (is_callable($processCallback))
                      {
                          call_user_func_array($processCallback, array(&$params, $args));
                      }
                      # Strip non-approved attributes from the tag
                      $newparams = self::fixTagAttributes($params, $t);
                  }
                  if (! $badtag)
                  {
                      $rest = str_replace('>', '&gt;', $rest);
                      $close = ($brace == '/>' && ! $slash) ? ' /' : '';
                      $text .= "<$slash$t$newparams$close>$rest";
                      continue;
                  }
              }
              $text .= '&lt;' . str_replace('>', '&gt;', $x);
          }
          # Close off any remaining tags
          while (is_array($tagstack) && ($t = array_pop($tagstack)))
          {
              $text .= "</$t>\n";
              if ($t == 'table')
              {
                  $tagstack = array_pop($tablestack);
              }
          }
      }
      else
      {
          # this might be possible using tidy itself
          foreach ($bits as $x)
          {
              $regs = array();
              if (! preg_match('/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', $x, $regs))
              {
                  $text .= '&lt;' . str_replace('>', '&gt;', $x);
                  continue;
              }
              list( /* $qbar */, $slash, $t, $params, $brace, $rest) = $regs;
              $t = strtolower($t);
              if (isset($htmlelements[$t]))
              {
                  if (is_callable($processCallback))
                  {
                      call_user_func_array($processCallback, array(&$params, $args));
                  }
                  $newparams = self::fixTagAttributes($params, $t);
                  $rest = str_replace('>', '&gt;', $rest);
                  $text .= "<$slash$t$newparams$brace$rest";
              }
              else
              {
                  $text .= '&lt;' . str_replace('>', '&gt;', $x);
              }
          }
      }
      wfProfileOut(__METHOD__);
      return $text;
  }
  /**
   * Strip every '<!-- ... -->' comment from the text.
   *
   * When a comment stands alone on its line (only spaces between it and
   * the newlines on either side), the surrounding spaces and one of the
   * two newlines are removed as well, so no blank line is left behind.
   * An unterminated comment, and everything after it, is left untouched.
   *
   * @private
   * @param string $text
   * @return string
   */
  static function removeHTMLcomments($text)
  {
      wfProfileIn(__METHOD__);
      for (;;)
      {
          $open = strpos($text, '<!--');
          if ($open === false)
          {
              break;
          }
          $close = strpos($text, '-->', $open + 4);
          if ($close === false)
          {
              # Unterminated comment; bail out
              break;
          }
          # $after is the offset of the first character following '-->'
          $after = $close + 3;
          # Walk left over the spaces that precede the comment...
          $before = max($open - 1, 0);
          while ($before > 0 && substr($text, $before, 1) === ' ')
          {
              $before --;
          }
          # ...and right over the spaces that follow it.
          while (substr($text, $after, 1) === ' ')
          {
              $after ++;
          }
          $aloneOnLine = substr($text, $before, 1) === "\n" && substr($text, $after, 1) === "\n";
          if ($aloneOnLine)
          {
              # Replace "\n", spaces, comment, spaces, "\n" with one newline.
              $text = substr_replace($text, "\n", $before, $after - $before + 1);
          }
          else
          {
              # Remove just the comment.
              $text = substr_replace($text, '', $open, $close + 3 - $open);
          }
      }
      wfProfileOut(__METHOD__);
      return $text;
  }
  /**
   * Validate an element's attributes against that element's whitelist.
   *
   * Looks up the whitelist for $element with attributeWhitelist() and
   * hands both to validateAttributes(), which does the filtering and
   * value normalization.
   *
   * @param array $attribs attribute name => value
   * @param string $element
   * @return array
   *
   * @todo Check for legal values where the DTD limits things.
   * @todo Check for unique id attribute :P
   */
  static function validateTagAttributes($attribs, $element)
  {
      $allowed = self::attributeWhitelist($element);
      return self::validateAttributes($attribs, $allowed);
  }
  /**
   * Take an array of attribute names and values and normalize or discard
   * illegal values for the given whitelist.
   *
   * - Attributes not on the given whitelist are discarded
   * - 'style' values are passed through checkCss()
   * - 'id' values are re-encoded with escapeId(), in 'noninitial' mode
   *   when $wgEnforceHtmlIds is set and in 'xml' mode otherwise
   *
   * @param array $attribs attribute name => value
   * @param array $whitelist list of allowed attribute names
   * @return array the surviving attributes, name => value
   *
   * @todo Check for legal values where the DTD limits things.
   * @todo Check for unique id attribute :P
   */
  static function validateAttributes($attribs, $whitelist)
  {
      $whitelist = array_flip($whitelist);
      $out = array();
      foreach ($attribs as $attribute => $value)
      {
          if (! isset($whitelist[$attribute]))
          {
              continue;
          }
          # Strip javascript "expression" from stylesheets.
          # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
          # Strict comparison: an integer array key must never be mistaken
          # for 'style' (0 == 'style' is true before PHP 8).
          if ($attribute === 'style')
          {
              $value = self::checkCss($value);
          }
          if ($attribute === 'id')
          {
              global $wgEnforceHtmlIds;
              $value = self::escapeId($value, $wgEnforceHtmlIds ? 'noninitial' : 'xml');
          }
          // If this attribute was previously set, override it.
          // Output should only have one attribute of each name.
          $out[$attribute] = $value;
      }
      return $out;
  }
  /**
   * Merge two sets of HTML attributes.
   *
   * Entries of $b win over same-named entries of $a. The one exception is
   * 'class': when both sides carry a string and the strings differ, the
   * class names of both are combined, duplicates dropped, and joined with
   * single spaces.
   *
   * @todo implement merging for other attributes such as style
   * @param array $a
   * @param array $b
   * @return array
   */
  static function mergeAttributes($a, $b)
  {
      $merged = array_merge($a, $b);
      if (! isset($a['class'], $b['class']))
      {
          return $merged;
      }
      $first = $a['class'];
      $second = $b['class'];
      if (! is_string($first) || ! is_string($second) || $first === $second)
      {
          return $merged;
      }
      $names = preg_split('/\s+/', $first . ' ' . $second, - 1, PREG_SPLIT_NO_EMPTY);
      $merged['class'] = implode(' ', array_unique($names));
      return $merged;
  }
  /**
   * Pick apart some CSS and check it for forbidden or unsafe structures.
   *
   * Character references, comments and CSS backslash escapes are decoded
   * first so that a keyword cannot be hidden behind them. The decoded
   * value is returned unless it is rejected, in which case a short CSS
   * comment naming the reason is returned in its place:
   * - control characters      -> "/* invalid control char *" . "/"
   * - 'expression', 'filter:', 'accelerator:' or 'url('
   *                           -> "/* insecure input *" . "/"
   *
   * @param string $value
   * @return string
   */
  static function checkCss($value)
  {
      $value = self::decodeCharReferences($value);
      // Remove any comments; IE gets token splitting wrong
      $value = MediawikiStringUtils :: delimiterReplace('/*', '*/', ' ', $value);
      // Decode escape sequences and line continuation
      // See the grammar in the CSS 2 spec, appendix D.
      static $decodeRegex;
      if (! $decodeRegex)
      {
          $space = '[\\x20\\t\\r\\n\\f]';
          $nl = '(?:\\n|\\r\\n|\\r|\\f)';
          $backslash = '\\\\';
          $decodeRegex = "/ $backslash
              (?:
              ($nl) | # 1. Line continuation
              ([0-9A-Fa-f]{1,6})$space? | # 2. character number
              (.) | # 3. backslash cancelling special meaning
              () | # 4. backslash at end of string
              )/xu";
      }
      $value = preg_replace_callback($decodeRegex, array(__CLASS__, 'cssDecodeCallback'), $value);
      // Reject problematic keywords and control characters
      if (preg_match('/[\000-\010\016-\037\177]/', $value))
      {
          return '/* invalid control char */';
      }
      elseif (preg_match('! expression | filter\s*: | accelerator\s*: | url\s*\( !ix', $value))
      {
          return '/* insecure input */';
      }
      return $value;
  }
  /**
   * preg_replace_callback() handler for the escape pattern built in
   * checkCss().
   *
   * A line continuation is dropped. Any other escape is decoded to the
   * character it stands for, except that newline, both quote characters
   * and the backslash are re-emitted as a hex escape followed by a space.
   *
   * @param array $matches 1 = line continuation, 2 = hex character number,
   *   3 = single escaped character, 4 = empty (backslash at end of string)
   * @return string
   */
  static function cssDecodeCallback($matches)
  {
      if ($matches[1] !== '')
      {
          // Line continuation
          return '';
      }
      if ($matches[2] !== '')
      {
          $decoded = codepointToUtf8(hexdec($matches[2]));
      }
      else
      {
          // Either a plain escaped character or a lone trailing backslash
          $decoded = ($matches[3] !== '') ? $matches[3] : '\\';
      }
      // These characters need to be escaped in strings, so clean up the
      // escape sequence to avoid parsing errors by clients. Anything else
      // was an unnecessary escape and is returned decoded.
      $keepEscaped = array("\n", '"', "'", '\\');
      return in_array($decoded, $keepEscaped) ? '\\' . dechex(ord($decoded)) . ' ' : $decoded;
  }
  /**
   * Turn the raw attribute portion of a tag into well-formed, sanitized
   * attribute source.
   *
   * The text is parsed with decodeTagAttributes(), filtered with
   * validateTagAttributes() for the given element, and every surviving
   * pair is written back as name="value" with the value armored by
   * safeEncodeAttribute(). Each pair is preceded by one space, so the
   * result can be appended directly after the tag name.
   *
   * @param string $text
   * @param string $element
   * @return string '' when the text is blank or nothing survives
   */
  static function fixTagAttributes($text, $element)
  {
      if (trim($text) == '')
      {
          return '';
      }
      $decoded = self::decodeTagAttributes($text);
      $allowed = self::validateTagAttributes($decoded, $element);
      $source = '';
      foreach ($allowed as $name => $value)
      {
          $source .= ' ' . htmlspecialchars($name) . '="' . self::safeEncodeAttribute($value) . '"';
      }
      return $source;
  }
  /**
   * Encode an attribute value for HTML output.
   *
   * Besides the usual HTML escaping (both quote styles), newline,
   * carriage return and tab are written as numeric references. Whitespace
   * is normalized when attributes are decoded, so these characters would
   * otherwise not be preserved.
   *
   * @param $text
   * @return HTML-encoded text fragment
   */
  static function encodeAttribute($text)
  {
      $escaped = htmlspecialchars($text, ENT_QUOTES);
      return str_replace(array("\n", "\r", "\t"), array('&#10;', '&#13;', '&#9;'), $escaped);
  }
  /**
   * Encode an attribute value for HTML tags, with extra armoring
   * against further wiki processing.
   *
   * After encodeAttribute(), character sequences that later parsing
   * could expand as templates, links or magic links are replaced with
   * numeric references, and the ':' of every URL protocol matched by
   * wfUrlProtocols() is encoded as well.
   *
   * @param $text
   * @return HTML-encoded text fragment
   */
  static function safeEncodeAttribute($text)
  {
      # Templates and links may be expanded in later parsing,
      # creating invalid or dangerous output. Suppress this.
      $armor = array(
          # The first three should never happen: they mean we've received
          # invalid input which should have been escaped.
          '<' => '&lt;',
          '>' => '&gt;',
          '"' => '&quot;',
          '{' => '&#123;',
          '[' => '&#91;',
          "''" => '&#39;&#39;',
          'ISBN' => '&#73;SBN',
          'RFC' => '&#82;FC',
          'PMID' => '&#80;MID',
          '|' => '&#124;',
          '__' => '&#95;_',
      );
      $armored = strtr(self::encodeAttribute($text), $armor);
      # Stupid hack
      $protocols = '/(' . wfUrlProtocols() . ')/';
      return preg_replace_callback($protocols, array('MediawikiSanitizer', 'armorLinksCallback'), $armored);
  }
  /**
   * Given a value escape it so that it can be used in an id attribute and
   * return it, this does not validate the value however (see first link)
   *
   * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
   * in the id and name attributes
   * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
   *
   * @param string $id Id to validate
   * @param mixed $options String or array of strings (default is array()):
   *   'noninitial': This is a non-initial fragment of an id, not a full id,
   *     so don't pay attention if the first character isn't valid at the
   *     beginning of an id.
   *   'xml': Don't restrict the id to be HTML4-compatible. This option
   *     allows any alphabetic character to be used, per the XML standard.
   *     Therefore, it also completely changes the type of escaping: instead
   *     of weird dot-encoding, runs of invalid characters (mostly
   *     whitespace) are just compressed into a single underscore.
   * @return string
   */
  static function escapeId($id, $options = array())
  {
      $options = (array) $options;
      $isFragment = in_array('noninitial', $options);
      if (in_array('xml', $options))
      {
          # XML-style escaping. For the patterns used, see the XML 1.0 standard,
          # 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/>
          $nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}' . '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}' . '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
          $nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}' . '\x{203F}-\x{2040}';
          # Replace _ as well so we don't get multiple consecutive underscores
          $id = trim(preg_replace('/([^' . $nameChar . ']|_)+/u', '_', $id), '_');
          $startsValid = preg_match('/^[' . $nameStartChar . ']/u', $id);
          return ($startsValid || $isFragment) ? $id : "_$id";
      }
      # HTML4-style escaping: URL-encode, then turn '%3A' back into ':'
      # and every remaining '%' into '.'
      $id = urlencode(self::decodeCharReferences(strtr($id, ' ', '_')));
      $id = str_replace(array('%3A', '%'), array(':', '.'), $id);
      if (! $isFragment && ! preg_match('/^[a-zA-Z]/', $id))
      {
          // Initial character must be a letter!
          $id = "x$id";
      }
      return $id;
  }
  /**
   * Given a value, escape it so that it can be used as a CSS class and
   * return it.
   *
   * A leading digit or hyphen, control characters and space, ASCII
   * punctuation and the UTF-8 no-break space each become an underscore;
   * runs of underscores are then collapsed to one and trailing
   * underscores are dropped.
   *
   * @todo For extra validity, input should be validated UTF-8.
   *
   * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
   *
   * @param string $class
   * @return string
   */
  static function escapeClass($class)
  {
      // Convert ugly stuff to underscores...
      $ugly = '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/';
      $underscored = preg_replace($ugly, '_', $class);
      // ...and kill underscores in ugly places
      $collapsed = preg_replace('/_+/', '_', $underscored);
      return rtrim($collapsed, '_');
  }
  /**
   * Given HTML input, escape with htmlspecialchars but un-escape entities.
   * This allows (generally harmless) entities like &nbsp; to survive.
   *
   * The ampersands escaped by htmlspecialchars() are restored, and
   * normalizeCharReferences() then decides which of them stay as they are.
   *
   * @param string $html String to escape
   * @return string Escaped input
   */
  static function escapeHtmlAllowEntities($html)
  {
      # It seems wise to escape ' as well as ", as a matter of course. Can't
      # hurt.
      $ampersandsRestored = str_replace('&amp;', '&', htmlspecialchars($html, ENT_QUOTES));
      return self::normalizeCharReferences($ampersandsRestored);
  }
  /**
   * Regex replace callback for armoring links against further processing:
   * every ':' in the first capture group is written as '&#58;'.
   *
   * @param array $matches
   * @return string
   * @private
   */
  private static function armorLinksCallback($matches)
  {
      return strtr($matches[1], array(':' => '&#58;'));
  }
  /**
   * Return an associative array of attribute names and values from
   * a partial tag string. Attribute names are forced to lowercase,
   * whitespace runs in values are collapsed to a single space and
   * trimmed, and character references are decoded.
   *
   * When a name occurs more than once, the last occurrence wins.
   *
   * @param string $text
   * @return array
   */
  public static function decodeTagAttributes($text)
  {
      if (trim($text) == '')
      {
          return array();
      }
      $pairs = array();
      if (! preg_match_all(MW_ATTRIBS_REGEX, $text, $pairs, PREG_SET_ORDER))
      {
          return array();
      }
      $attribs = array();
      foreach ($pairs as $set)
      {
          $name = strtolower($set[1]);
          // Normalize whitespace
          $raw = trim(preg_replace('/[\t\r\n ]+/', ' ', self::getTagAttributeCallback($set)));
          // Decode character references
          $attribs[$name] = self::decodeCharReferences($raw);
      }
      return $attribs;
  }
  /**
   * Pick the appropriate attribute value from a match set from the
   * MW_ATTRIBS_REGEX matches.
   *
   * Only one of the value groups can be present as the last entry of the
   * set, so the highest-numbered group that is set is the value.
   *
   * @param array $set
   * @return string
   * @throws MWException when a "= value" part matched but no value group is set
   * @private
   */
  private static function getTagAttributeCallback($set)
  {
      # 6: illegal #XXXXXX color with no quotes, 5: no quotes,
      # 4: single-quoted, 3: double-quoted
      for ($group = 6; $group >= 3; $group --)
      {
          if (isset($set[$group]))
          {
              return $set[$group];
          }
      }
      if (! isset($set[2]))
      {
          # In XHTML, attributes must have a value.
          # For 'reduced' form, return explicitly the attribute name here.
          return $set[1];
      }
      throw new MWException("Tag conditions not met. This should never happen and is a bug.");
  }
  /**
   * Normalize whitespace and character references in an XML source-
   * encoded text for an attribute value, and encode double quotes.
   *
   * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
   * but note that we're not returning the value, but are returning
   * XML source fragments that will be slapped into output.
   *
   * @param string $text
   * @return string
   * @private
   */
  private static function normalizeAttributeValue($text)
  {
      $normalized = self::normalizeCharReferences($text);
      $normalized = self::normalizeWhitespace($normalized);
      return str_replace('"', '&quot;', $normalized);
  }
  /**
   * Replace each CR LF pair, and each remaining space, carriage return,
   * line feed or tab, with a single space. Runs are not collapsed.
   *
   * @param string $text
   * @return string
   */
  private static function normalizeWhitespace($text)
  {
      $anyWhitespace = '/\r\n|[\x20\x0d\x0a\x09]/';
      return preg_replace($anyWhitespace, ' ', $text);
  }
  /**
   * Ensure that any entities and character references are legal
   * for XML and XHTML specifically. Any stray bits will be
   * &amp;-escaped to result in a valid text fragment.
   *
   * a. any named char refs must be known in XHTML
   * b. any numeric char refs must be legal chars, not invalid or forbidden
   * c. use &#x, not &#X
   * d. fix or reject non-valid attributes
   *
   * @param string $text
   * @return string
   * @private
   */
  static function normalizeCharReferences($text)
  {
      # The callback must name this class; a hard-coded 'Sanitizer' would
      # point at a class that is not defined in this file.
      return preg_replace_callback(MW_CHAR_REFS_REGEX, array(__CLASS__, 'normalizeCharReferencesCallback'), $text);
  }
  /**
   * preg_replace_callback() handler for normalizeCharReferences().
   *
   * Dispatches on the first non-empty capture group of
   * MW_CHAR_REFS_REGEX. When no group applies, or the chosen helper
   * rejects the reference by returning null, the whole match is
   * HTML-escaped instead.
   *
   * @param array $matches
   * @return string
   */
  static function normalizeCharReferencesCallback($matches)
  {
      switch (true)
      {
          case $matches[1] != '':
              $normalized = self::normalizeEntity($matches[1]);
              break;
          case $matches[2] != '':
              $normalized = self::decCharReference($matches[2]);
              break;
          case $matches[3] != '':
              $normalized = self::hexCharReference($matches[3]);
              break;
          case $matches[4] != '':
              $normalized = self::hexCharReference($matches[4]);
              break;
          default:
              $normalized = null;
      }
      return ($normalized === null) ? htmlspecialchars($matches[0]) : $normalized;
  }
  830. /**
  831. * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
  832. * return the named entity reference as is. If the entity is a
  833. * MediaWiki-specific alias, returns the HTML equivalent. Otherwise,
  834. * returns HTML-escaped text of pseudo-entity source (eg &amp;foo;)
  835. *
  836. * @param string $name
  837. * @return string
  838. * @static
  839. */
  static function normalizeEntity($name)
  {
      global $wgHtmlEntities, $wgHtmlEntityAliases;

      // Aliases win: emit the entity the alias points to.
      if (isset($wgHtmlEntityAliases[$name])) {
          return '&' . $wgHtmlEntityAliases[$name] . ';';
      }

      // Known entities pass through; unknown ones get their ampersand escaped.
      $ampersand = isset($wgHtmlEntities[$name]) ? '&' : '&amp;';
      return $ampersand . $name . ';';
  }
  /**
   * Normalize the digits of a decimal character reference.
   *
   * @param string $codepoint Decimal digits of the reference
   * @return string|null "&#N;" when validateCodepoint() accepts the value, otherwise null
   */
  static function decCharReference($codepoint)
  {
      $value = intval($codepoint);
      return self::validateCodepoint($value) ? sprintf('&#%d;', $value) : null;
  }
  /**
   * Normalize the digits of a hexadecimal character reference.
   *
   * @param string $codepoint Characters captured after "&#x" / "&#X"
   * @return string|null Lower-case "&#x...;" reference when the input is
   *         purely hexadecimal and validateCodepoint() accepts the value,
   *         otherwise null
   */
  static function hexCharReference($codepoint)
  {
      # MW_CHAR_REFS_REGEX accepts any ASCII letter or digit here, and
      # hexdec() ignores characters that are not hex digits. Without this
      # check "&#x41g;" would be rewritten to "&#x41;", changing its meaning.
      if (! preg_match('/^[0-9A-Fa-f]+$/D', $codepoint)) {
          return null;
      }

      $point = hexdec($codepoint);
      if (self::validateCodepoint($point)) {
          return sprintf('&#x%x;', $point);
      }
      return null;
  }
  880. /**
  881. * Returns true if a given Unicode codepoint is a valid character in XML.
  882. * @param int $codepoint
  883. * @return bool
  884. */
  private static function validateCodepoint($codepoint)
  {
      // Tab, line feed and carriage return are the only accepted values below 0x20.
      if ($codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d) {
          return true;
      }
      // 0x20 up to 0xd7ff.
      if ($codepoint >= 0x20 && $codepoint <= 0xd7ff) {
          return true;
      }
      // 0xe000 up to 0xfffd; 0xd800-0xdfff, 0xfffe and 0xffff are excluded.
      if ($codepoint >= 0xe000 && $codepoint <= 0xfffd) {
          return true;
      }
      // 0x10000 up to 0x10ffff.
      return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
  }
  889. /**
  890. * Decode any character references, numeric or named entities,
  891. * in the text and return a UTF-8 string.
  892. *
  893. * @param string $text
  894. * @return string
  895. * @public
  896. * @static
  897. */
  public static function decodeCharReferences($text)
  {
      # Every match of MW_CHAR_REFS_REGEX is handed to
      # decodeCharReferencesCallback(). The callback is resolved through
      # __CLASS__ rather than a hard-coded class name, so it always targets
      # the class that declares this method regardless of what it is called.
      return preg_replace_callback(MW_CHAR_REFS_REGEX, array(__CLASS__, 'decodeCharReferencesCallback'), $text);
  }
  902. /**
  903. * @param array $matches
  904. * @return string
  905. */
  static function decodeCharReferencesCallback($matches)
  {
      // Group 1: named entity.
      if ($matches[1] != '') {
          return self::decodeEntity($matches[1]);
      }
      // Group 2: decimal reference.
      if ($matches[2] != '') {
          return self::decodeChar(intval($matches[2]));
      }
      // Groups 3 and 4: hexadecimal reference written with "x" or "X".
      foreach (array(3, 4) as $group) {
          if ($matches[$group] != '') {
              return self::decodeChar(hexdec($matches[$group]));
          }
      }
      // Anything else should be an ampersand by itself: leave it untouched.
      return $matches[0];
  }
  927. /**
  928. * Return UTF-8 string for a codepoint if that is a valid
  929. * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
  930. * @param int $codepoint
  931. * @return string
  932. * @private
  933. */
  static function decodeChar($codepoint)
  {
      // Codepoints rejected by validateCodepoint() become the replacement character.
      if (! self::validateCodepoint($codepoint)) {
          return UTF8_REPLACEMENT;
      }
      return MediawikiUtilities::codepointToUtf8($codepoint);
  }
  945. /**
  946. * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
  947. * return the UTF-8 encoding of that character. Otherwise, returns
  948. * pseudo-entity source (eg &foo;)
  949. *
  950. * @param string $name
  951. * @return string
  952. */
  static function decodeEntity($name)
  {
      global $wgHtmlEntities, $wgHtmlEntityAliases;

      // Resolve an alias to the entity name it stands for.
      $resolved = isset($wgHtmlEntityAliases[$name]) ? $wgHtmlEntityAliases[$name] : $name;

      // Unknown entity: hand back the pseudo-entity source, built from the resolved name.
      if (! isset($wgHtmlEntities[$resolved])) {
          return '&' . $resolved . ';';
      }

      return MediawikiUtilities::codepointToUtf8($wgHtmlEntities[$resolved]);
  }
  969. /**
  970. * Fetch the whitelist of acceptable attributes for a given
  971. * element name.
  972. *
  973. * @param string $element
  974. * @return array
  975. */
  static function attributeWhitelist($element)
  {
      // The table is built on first use and kept for later calls.
      static $cache = null;
      if ($cache === null) {
          $cache = self::setupAttributeWhitelist();
      }

      // Elements without an entry get an empty attribute list.
      if (! isset($cache[$element])) {
          return array();
      }
      return $cache[$element];
  }
  985. /**
  986. * Foreach array key (an allowed HTML element), return an array
  987. * of allowed attributes
  988. * @return array
  989. */
  static function setupAttributeWhitelist()
  {
      // Attribute groups shared by several elements.
      $common = array('id', 'class', 'lang', 'dir', 'title', 'style');
      $block = array_merge($common, array('align'));
      $tablealign = array('align', 'char', 'charoff', 'valign');
      $tablecell = array(
          'abbr', 'axis', 'headers', 'scope', 'rowspan', 'colspan',
          'nowrap', # deprecated
          'width', # deprecated
          'height', # deprecated
          'bgcolor', # deprecated
      );

      // Derived lists used by more than one element.
      $edit = array_merge($common, array('cite', 'datetime'));
      $rowGroup = array_merge($common, $tablealign);
      $column = array_merge($common, array('span', 'width'), $tablealign);
      $cell = array_merge($common, $tablecell, $tablealign);

      # Numbers refer to sections in HTML 4.01 standard describing the element.
      # See: http://www.w3.org/TR/html4/
      $whitelist = array();

      # 7.5.4
      $whitelist['div'] = $block;
      $whitelist['center'] = $common; # deprecated
      $whitelist['span'] = $block; # ??

      # 7.5.5
      foreach (array('h1', 'h2', 'h3', 'h4', 'h5', 'h6') as $heading) {
          $whitelist[$heading] = $block;
      }

      # 7.5.6 (address) and 8.2.4 (bdo) are not listed.

      # 9.2.1 (dfn, samp, kbd, abbr and acronym are not listed)
      foreach (array('em', 'strong', 'cite', 'code', 'var') as $phrase) {
          $whitelist[$phrase] = $common;
      }

      # 9.2.2 (q is not listed)
      $whitelist['blockquote'] = array_merge($common, array('cite'));

      # 9.2.3
      $whitelist['sub'] = $common;
      $whitelist['sup'] = $common;

      # 9.3.1
      $whitelist['p'] = $block;

      # 9.3.2
      $whitelist['br'] = array('id', 'class', 'title', 'style', 'clear');

      # 9.3.4
      $whitelist['pre'] = array_merge($common, array('width'));

      # 9.4
      $whitelist['ins'] = $edit;
      $whitelist['del'] = $edit;

      # 10.2
      $whitelist['ul'] = array_merge($common, array('type'));
      $whitelist['ol'] = array_merge($common, array('type', 'start'));
      $whitelist['li'] = array_merge($common, array('type', 'value'));

      # 10.3
      $whitelist['dl'] = $common;
      $whitelist['dd'] = $common;
      $whitelist['dt'] = $common;

      # 11.2.1
      $whitelist['table'] = array_merge($common, array(
          'summary', 'width', 'border', 'frame', 'rules',
          'cellspacing', 'cellpadding', 'align', 'bgcolor',
      ));

      # 11.2.2
      $whitelist['caption'] = array_merge($common, array('align'));

      # 11.2.3
      $whitelist['thead'] = $rowGroup;
      $whitelist['tfoot'] = $rowGroup;
      $whitelist['tbody'] = $rowGroup;

      # 11.2.4
      $whitelist['colgroup'] = $column;
      $whitelist['col'] = $column;

      # 11.2.5
      $whitelist['tr'] = array_merge($common, array('bgcolor'), $tablealign);

      # 11.2.6
      $whitelist['td'] = $cell;
      $whitelist['th'] = $cell;

      # 13.2
      # Not usually allowed, but may be used for extension-style hooks
      # such as <math> when it is rasterized
      $whitelist['img'] = array_merge($common, array('alt'));

      # 15.2.1
      foreach (array('tt', 'b', 'i', 'big', 'small', 'strike', 's', 'u') as $fontStyle) {
          $whitelist[$fontStyle] = $common;
      }

      # 15.2.2 (basefont is not listed)
      $whitelist['font'] = array_merge($common, array('size', 'color', 'face'));

      # 15.3
      $whitelist['hr'] = array_merge($common, array('noshade', 'size', 'width'));

      # XHTML Ruby annotation text module, simple ruby only.
      # http://www.w3c.org/TR/ruby/
      # (rbc and rtc are not listed; 'rbspan' is not allowed on rt)
      foreach (array('ruby', 'rb', 'rt', 'rp') as $rubyTag) {
          $whitelist[$rubyTag] = $common;
      }

      # MathML root element, where used for extensions
      # 'title' may not be 100% valid here; it's XHTML
      # http://www.w3.org/TR/REC-MathML/
      $whitelist['math'] = array('class', 'style', 'id', 'title');

      return $whitelist;
  }
  1075. /**
  1076. * Take a fragment of (potentially invalid) HTML and return
  1077. * a version with any tags removed, encoded as plain text.
  1078. *
  1079. * Warning: this return value must be further escaped for literal
  1080. * inclusion in HTML output as of 1.10!
  1081. *
  1082. * @param string $text HTML fragment
  1083. * @return string
  1084. */
  static function stripAllTags($text)
  {
      # Drop everything between '<' and '>' delimiters (actual <tags>)
      $withoutTags = StringUtils::delimiterReplace('<', '>', '', $text);

      # Decode &entities, then normalize whitespace
      return self::normalizeWhitespace(self::decodeCharReferences($withoutTags));
  }
  1094. /**
  1095. * Hack up a private DOCTYPE with HTML's standard entity declarations.
  1096. * PHP 4 seemed to know these if you gave it an HTML doctype, but
  1097. * PHP 5.1 doesn't.
  1098. *
  1099. * Use for passing XHTML fragments to PHP's XML parsing functions
  1100. *
  1101. * @return string
  1102. * @static
  1103. */
  static function hackDocType()
  {
      global $wgHtmlEntities;

      // One <!ENTITY name "&#codepoint;"> declaration per known entity,
      // concatenated without separators.
      $declarations = '';
      foreach ($wgHtmlEntities as $entity => $codepoint) {
          $declarations .= '<!ENTITY ' . $entity . ' "&#' . $codepoint . ';">';
      }

      return "<!DOCTYPE html [\n" . $declarations . "]>\n";
  }
  /**
   * Clean up a URL before it is turned into a link.
   *
   * Character references are decoded, then square and angle brackets,
   * double quotes, bytes 0x00-0x20 and 0x7F are percent-encoded, and
   * finally characters that are ignored in IDNs are stripped from the
   * host part.
   *
   * @param string $url
   * @return string
   */
  static function cleanUrl($url)
  {
      # Normalize any HTML entities in input. They will be
      # re-escaped by makeExternalLink().
      $url = self::decodeCharReferences($url);

      # Escape any control characters introduced by the above step.
      # A callback is used instead of the /e modifier: /e no longer exists
      # in PHP 7, and on older versions it backslash-escaped the match
      # before evaluating it, so '"' came out as %5C%22 instead of %22.
      $url = preg_replace_callback('/[\][<>"\\x00-\\x20\\x7F]/', array(__CLASS__, 'cleanUrlCallback'), $url);

      # Validate hostname portion
      $matches = array();
      if (! preg_match('!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches)) {
          return $url;
      }
      list( /* $whole */, $protocol, $host, $rest) = $matches;

      // Characters that will be ignored in IDNs.
      // http://tools.ietf.org/html/rfc3454#section-3.1
      // Strip them before further processing so blacklists and such work.
      $strip = "/
          \\s| # general whitespace
          \xc2\xad| # 00ad SOFT HYPHEN
          \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
          \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
          \xe2\x81\xa0| # 2060 WORD JOINER
          \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
          \xcd\x8f| # 034f COMBINING GRAPHEME JOINER
          \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
          \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
          \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
          \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
          \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
          [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
          /xuD";

      // With the /u modifier preg_replace() returns null when the subject
      // is not valid UTF-8. Keep the host unchanged in that case instead
      // of silently dropping it from the result.
      $stripped = preg_replace($strip, '', $host);
      if ($stripped !== null) {
          $host = $stripped;
      }

      // @fixme: validate hostnames here
      return $protocol . $host . $rest;
  }

  /**
   * preg_replace_callback() helper for cleanUrl(): percent-encode the
   * matched character.
   *
   * @param array $matches
   * @return string
   */
  private static function cleanUrlCallback($matches)
  {
      return urlencode($matches[0]);
  }
  1154. }