PageRenderTime 48ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/moodle/lib/htmlpurifier/HTMLPurifier/Encoder.php

https://bitbucket.org/geek745/moodle-db2
PHP | 467 lines | 276 code | 29 blank | 162 comment | 77 complexity | ca6f651a517a6c7d56604e1605fbb838 MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.1, BSD-3-Clause, LGPL-2.0
  1. <?php
// Register the Core.Encoding directive: default value 'utf-8', declared type
// 'istring'. The long string is the directive's description text.
// NOTE(review): the arguments appear to be namespace, directive name, default,
// type and description -- confirm against HTMLPurifier_ConfigSchema::define().
HTMLPurifier_ConfigSchema::define(
    'Core', 'Encoding', 'utf-8', 'istring',
    'If for some reason you are unable to convert all webpages to UTF-8, '.
    'you can use this directive as a stop-gap compatibility change to '.
    'let HTML Purifier deal with non UTF-8 input. This technique has '.
    'notable deficiencies: absolutely no characters outside of the selected '.
    'character encoding will be preserved, not even the ones that have '.
    'been ampersand escaped (this is due to a UTF-8 specific <em>feature</em> '.
    'that automatically resolves all entities), making it pretty useless '.
    'for anything except the most I18N-blind applications, although '.
    '%Core.EscapeNonASCIICharacters offers fixes this trouble with '.
    'another tradeoff. This directive '.
    'only accepts ISO-8859-1 if iconv is not enabled.'
);

// Register the Core.EscapeNonASCIICharacters directive: default false, type
// 'bool'. HTMLPurifier_Encoder::convertFromUTF8() reads this directive to
// decide whether to entity-ize non-ASCII characters before converting.
HTMLPurifier_ConfigSchema::define(
    'Core', 'EscapeNonASCIICharacters', false, 'bool',
    'This directive overcomes a deficiency in %Core.Encoding by blindly '.
    'converting all non-ASCII characters into decimal numeric entities before '.
    'converting it to its native encoding. This means that even '.
    'characters that can be expressed in the non-UTF-8 encoding will '.
    'be entity-ized, which can be a real downer for encodings like Big5. '.
    'It also assumes that the ASCII repetoire is available, although '.
    'this is the case for almost all encodings. Anyway, use UTF-8! This '.
    'directive has been available since 1.4.0.'
);

// Without the iconv extension the encoder can only fall back on
// utf8_encode()/utf8_decode(), so Core.Encoding is restricted to the two
// encodings those functions cover, plus one spelling alias.
if ( !function_exists('iconv') ) {
    // only encodings with native PHP support
    HTMLPurifier_ConfigSchema::defineAllowedValues(
        'Core', 'Encoding', array(
            'utf-8',
            'iso-8859-1'
        )
    );
    // Accept the hyphen-less spelling as an alias of the canonical name.
    HTMLPurifier_ConfigSchema::defineValueAliases(
        'Core', 'Encoding', array(
            'iso8859-1' => 'iso-8859-1'
        )
    );
}

// Register the Test.ForceNoIconv directive: default false, type 'bool'.
// The convertToUTF8()/convertFromUTF8() methods consult it before using iconv.
HTMLPurifier_ConfigSchema::define(
    'Test', 'ForceNoIconv', false, 'bool',
    'When set to true, HTMLPurifier_Encoder will act as if iconv does not '.
    'exist and use only pure PHP implementations.'
);
  46. /**
  47. * A UTF-8 specific character encoder that handles cleaning and transforming.
  48. * @note All functions in this class should be static.
  49. */
  50. class HTMLPurifier_Encoder
  51. {
  52. /**
  53. * Constructor throws fatal error if you attempt to instantiate class
  54. */
  55. function HTMLPurifier_Encoder() {
  56. trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
  57. }
    /**
     * Error handler that silently discards whatever it is given. It is
     * installed with set_error_handler() around calls whose warnings should
     * not surface, as an alternative to the shut-up operator (@).
     * The body is intentionally empty.
     */
    function muteErrorHandler() {}
  62. /**
  63. /**
  64. * Cleans a UTF-8 string for well-formedness and SGML validity
  65. *
  66. * It will parse according to UTF-8 and return a valid UTF8 string, with
  67. * non-SGML codepoints excluded.
  68. *
  69. * @static
  70. * @note Just for reference, the non-SGML code points are 0 to 31 and
  71. * 127 to 159, inclusive. However, we allow code points 9, 10
  72. * and 13, which are the tab, line feed and carriage return
  73. * respectively. 128 and above the code points map to multibyte
  74. * UTF-8 representations.
  75. *
  76. * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
  77. * hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
  78. * LGPL license. Notes on what changed are inside, but in general,
  79. * the original code transformed UTF-8 text into an array of integer
  80. * Unicode codepoints. Understandably, transforming that back to
  81. * a string would be somewhat expensive, so the function was modded to
  82. * directly operate on the string. However, this discourages code
  83. * reuse, and the logic enumerated here would be useful for any
  84. * function that needs to be able to understand UTF-8 characters.
  85. * As of right now, only smart lossless character encoding converters
  86. * would need that, and I'm probably not going to implement them.
  87. * Once again, PHP 6 should solve all our problems.
  88. */
  89. function cleanUTF8($str, $force_php = false) {
  90. // UTF-8 validity is checked since PHP 4.3.5
  91. // This is an optimization: if the string is already valid UTF-8, no
  92. // need to do PHP stuff. 99% of the time, this will be the case.
  93. // The regexp matches the XML char production, as well as well as excluding
  94. // non-SGML codepoints U+007F to U+009F
  95. if (preg_match('/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du', $str)) {
  96. return $str;
  97. }
  98. $mState = 0; // cached expected number of octets after the current octet
  99. // until the beginning of the next UTF8 character sequence
  100. $mUcs4 = 0; // cached Unicode character
  101. $mBytes = 1; // cached expected number of octets in the current sequence
  102. // original code involved an $out that was an array of Unicode
  103. // codepoints. Instead of having to convert back into UTF-8, we've
  104. // decided to directly append valid UTF-8 characters onto a string
  105. // $out once they're done. $char accumulates raw bytes, while $mUcs4
  106. // turns into the Unicode code point, so there's some redundancy.
  107. $out = '';
  108. $char = '';
  109. $len = strlen($str);
  110. for($i = 0; $i < $len; $i++) {
  111. $in = ord($str{$i});
  112. $char .= $str[$i]; // append byte to char
  113. if (0 == $mState) {
  114. // When mState is zero we expect either a US-ASCII character
  115. // or a multi-octet sequence.
  116. if (0 == (0x80 & ($in))) {
  117. // US-ASCII, pass straight through.
  118. if (($in <= 31 || $in == 127) &&
  119. !($in == 9 || $in == 13 || $in == 10) // save \r\t\n
  120. ) {
  121. // control characters, remove
  122. } else {
  123. $out .= $char;
  124. }
  125. // reset
  126. $char = '';
  127. $mBytes = 1;
  128. } elseif (0xC0 == (0xE0 & ($in))) {
  129. // First octet of 2 octet sequence
  130. $mUcs4 = ($in);
  131. $mUcs4 = ($mUcs4 & 0x1F) << 6;
  132. $mState = 1;
  133. $mBytes = 2;
  134. } elseif (0xE0 == (0xF0 & ($in))) {
  135. // First octet of 3 octet sequence
  136. $mUcs4 = ($in);
  137. $mUcs4 = ($mUcs4 & 0x0F) << 12;
  138. $mState = 2;
  139. $mBytes = 3;
  140. } elseif (0xF0 == (0xF8 & ($in))) {
  141. // First octet of 4 octet sequence
  142. $mUcs4 = ($in);
  143. $mUcs4 = ($mUcs4 & 0x07) << 18;
  144. $mState = 3;
  145. $mBytes = 4;
  146. } elseif (0xF8 == (0xFC & ($in))) {
  147. // First octet of 5 octet sequence.
  148. //
  149. // This is illegal because the encoded codepoint must be
  150. // either:
  151. // (a) not the shortest form or
  152. // (b) outside the Unicode range of 0-0x10FFFF.
  153. // Rather than trying to resynchronize, we will carry on
  154. // until the end of the sequence and let the later error
  155. // handling code catch it.
  156. $mUcs4 = ($in);
  157. $mUcs4 = ($mUcs4 & 0x03) << 24;
  158. $mState = 4;
  159. $mBytes = 5;
  160. } elseif (0xFC == (0xFE & ($in))) {
  161. // First octet of 6 octet sequence, see comments for 5
  162. // octet sequence.
  163. $mUcs4 = ($in);
  164. $mUcs4 = ($mUcs4 & 1) << 30;
  165. $mState = 5;
  166. $mBytes = 6;
  167. } else {
  168. // Current octet is neither in the US-ASCII range nor a
  169. // legal first octet of a multi-octet sequence.
  170. $mState = 0;
  171. $mUcs4 = 0;
  172. $mBytes = 1;
  173. $char = '';
  174. }
  175. } else {
  176. // When mState is non-zero, we expect a continuation of the
  177. // multi-octet sequence
  178. if (0x80 == (0xC0 & ($in))) {
  179. // Legal continuation.
  180. $shift = ($mState - 1) * 6;
  181. $tmp = $in;
  182. $tmp = ($tmp & 0x0000003F) << $shift;
  183. $mUcs4 |= $tmp;
  184. if (0 == --$mState) {
  185. // End of the multi-octet sequence. mUcs4 now contains
  186. // the final Unicode codepoint to be output
  187. // Check for illegal sequences and codepoints.
  188. // From Unicode 3.1, non-shortest form is illegal
  189. if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
  190. ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
  191. ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
  192. (4 < $mBytes) ||
  193. // From Unicode 3.2, surrogate characters = illegal
  194. (($mUcs4 & 0xFFFFF800) == 0xD800) ||
  195. // Codepoints outside the Unicode range are illegal
  196. ($mUcs4 > 0x10FFFF)
  197. ) {
  198. } elseif (0xFEFF != $mUcs4 && // omit BOM
  199. // check for valid Char unicode codepoints
  200. (
  201. 0x9 == $mUcs4 ||
  202. 0xA == $mUcs4 ||
  203. 0xD == $mUcs4 ||
  204. (0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
  205. // 7F-9F is not strictly prohibited by XML,
  206. // but it is non-SGML, and thus we don't allow it
  207. (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
  208. (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
  209. )
  210. ) {
  211. $out .= $char;
  212. }
  213. // initialize UTF8 cache (reset)
  214. $mState = 0;
  215. $mUcs4 = 0;
  216. $mBytes = 1;
  217. $char = '';
  218. }
  219. } else {
  220. // ((0xC0 & (*in) != 0x80) && (mState != 0))
  221. // Incomplete multi-octet sequence.
  222. // used to result in complete fail, but we'll reset
  223. $mState = 0;
  224. $mUcs4 = 0;
  225. $mBytes = 1;
  226. $char ='';
  227. }
  228. }
  229. }
  230. return $out;
  231. }
  232. /**
  233. * Translates a Unicode codepoint into its corresponding UTF-8 character.
  234. * @static
  235. * @note Based on Feyd's function at
  236. * <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
  237. * which is in public domain.
  238. * @note While we're going to do code point parsing anyway, a good
  239. * optimization would be to refuse to translate code points that
  240. * are non-SGML characters. However, this could lead to duplication.
  241. * @note This is very similar to the unichr function in
  242. * maintenance/generate-entity-file.php (although this is superior,
  243. * due to its sanity checks).
  244. */
  245. // +----------+----------+----------+----------+
  246. // | 33222222 | 22221111 | 111111 | |
  247. // | 10987654 | 32109876 | 54321098 | 76543210 | bit
  248. // +----------+----------+----------+----------+
  249. // | | | | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
  250. // | | | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
  251. // | | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
  252. // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
  253. // +----------+----------+----------+----------+
  254. // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
  255. // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
  256. // +----------+----------+----------+----------+
  257. function unichr($code) {
  258. if($code > 1114111 or $code < 0 or
  259. ($code >= 55296 and $code <= 57343) ) {
  260. // bits are set outside the "valid" range as defined
  261. // by UNICODE 4.1.0
  262. return '';
  263. }
  264. $x = $y = $z = $w = 0;
  265. if ($code < 128) {
  266. // regular ASCII character
  267. $x = $code;
  268. } else {
  269. // set up bits for UTF-8
  270. $x = ($code & 63) | 128;
  271. if ($code < 2048) {
  272. $y = (($code & 2047) >> 6) | 192;
  273. } else {
  274. $y = (($code & 4032) >> 6) | 128;
  275. if($code < 65536) {
  276. $z = (($code >> 12) & 15) | 224;
  277. } else {
  278. $z = (($code >> 12) & 63) | 128;
  279. $w = (($code >> 18) & 7) | 240;
  280. }
  281. }
  282. }
  283. // set up the actual character
  284. $ret = '';
  285. if($w) $ret .= chr($w);
  286. if($z) $ret .= chr($z);
  287. if($y) $ret .= chr($y);
  288. $ret .= chr($x);
  289. return $ret;
  290. }
  291. /**
  292. * Converts a string to UTF-8 based on configuration.
  293. * @static
  294. */
  295. function convertToUTF8($str, $config, &$context) {
  296. $encoding = $config->get('Core', 'Encoding');
  297. if ($encoding === 'utf-8') return $str;
  298. static $iconv = null;
  299. if ($iconv === null) $iconv = function_exists('iconv');
  300. set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
  301. if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
  302. $str = iconv($encoding, 'utf-8//IGNORE', $str);
  303. // If the string is bjorked by Shift_JIS or a similar encoding
  304. // that doesn't support all of ASCII, convert the naughty
  305. // characters to their true byte-wise ASCII/UTF-8 equivalents.
  306. $str = strtr($str, HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding));
  307. restore_error_handler();
  308. return $str;
  309. } elseif ($encoding === 'iso-8859-1') {
  310. $str = utf8_encode($str);
  311. restore_error_handler();
  312. return $str;
  313. }
  314. trigger_error('Encoding not supported', E_USER_ERROR);
  315. }
  316. /**
  317. * Converts a string from UTF-8 based on configuration.
  318. * @static
  319. * @note Currently, this is a lossy conversion, with unexpressable
  320. * characters being omitted.
  321. */
  322. function convertFromUTF8($str, $config, &$context) {
  323. $encoding = $config->get('Core', 'Encoding');
  324. if ($encoding === 'utf-8') return $str;
  325. static $iconv = null;
  326. if ($iconv === null) $iconv = function_exists('iconv');
  327. if ($escape = $config->get('Core', 'EscapeNonASCIICharacters')) {
  328. $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
  329. }
  330. set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
  331. if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
  332. // Undo our previous fix in convertToUTF8, otherwise iconv will barf
  333. $ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
  334. if (!$escape && !empty($ascii_fix)) {
  335. $clear_fix = array();
  336. foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = '';
  337. $str = strtr($str, $clear_fix);
  338. }
  339. $str = strtr($str, array_flip($ascii_fix));
  340. // Normal stuff
  341. $str = iconv('utf-8', $encoding . '//IGNORE', $str);
  342. restore_error_handler();
  343. return $str;
  344. } elseif ($encoding === 'iso-8859-1') {
  345. $str = utf8_decode($str);
  346. restore_error_handler();
  347. return $str;
  348. }
  349. trigger_error('Encoding not supported', E_USER_ERROR);
  350. }
  351. /**
  352. * Lossless (character-wise) conversion of HTML to ASCII
  353. * @static
  354. * @param $str UTF-8 string to be converted to ASCII
  355. * @returns ASCII encoded string with non-ASCII character entity-ized
  356. * @warning Adapted from MediaWiki, claiming fair use: this is a common
  357. * algorithm. If you disagree with this license fudgery,
  358. * implement it yourself.
  359. * @note Uses decimal numeric entities since they are best supported.
  360. * @note This is a DUMB function: it has no concept of keeping
  361. * character entities that the projected character encoding
  362. * can allow. We could possibly implement a smart version
  363. * but that would require it to also know which Unicode
  364. * codepoints the charset supported (not an easy task).
  365. * @note Sort of with cleanUTF8() but it assumes that $str is
  366. * well-formed UTF-8
  367. */
  368. function convertToASCIIDumbLossless($str) {
  369. $bytesleft = 0;
  370. $result = '';
  371. $working = 0;
  372. $len = strlen($str);
  373. for( $i = 0; $i < $len; $i++ ) {
  374. $bytevalue = ord( $str[$i] );
  375. if( $bytevalue <= 0x7F ) { //0xxx xxxx
  376. $result .= chr( $bytevalue );
  377. $bytesleft = 0;
  378. } elseif( $bytevalue <= 0xBF ) { //10xx xxxx
  379. $working = $working << 6;
  380. $working += ($bytevalue & 0x3F);
  381. $bytesleft--;
  382. if( $bytesleft <= 0 ) {
  383. $result .= "&#" . $working . ";";
  384. }
  385. } elseif( $bytevalue <= 0xDF ) { //110x xxxx
  386. $working = $bytevalue & 0x1F;
  387. $bytesleft = 1;
  388. } elseif( $bytevalue <= 0xEF ) { //1110 xxxx
  389. $working = $bytevalue & 0x0F;
  390. $bytesleft = 2;
  391. } else { //1111 0xxx
  392. $working = $bytevalue & 0x07;
  393. $bytesleft = 3;
  394. }
  395. }
  396. return $result;
  397. }
  398. /**
  399. * This expensive function tests whether or not a given character
  400. * encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
  401. * fail this test, and require special processing. Variable width
  402. * encodings shouldn't ever fail.
  403. *
  404. * @param string $encoding Encoding name to test, as per iconv format
  405. * @param bool $bypass Whether or not to bypass the precompiled arrays.
  406. * @return Array of UTF-8 characters to their corresponding ASCII,
  407. * which can be used to "undo" any overzealous iconv action.
  408. */
  409. function testEncodingSupportsASCII($encoding, $bypass = false) {
  410. static $encodings = array();
  411. if (!$bypass) {
  412. if (isset($encodings[$encoding])) return $encodings[$encoding];
  413. $lenc = strtolower($encoding);
  414. switch ($lenc) {
  415. case 'shift_jis':
  416. return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
  417. case 'johab':
  418. return array("\xE2\x82\xA9" => '\\');
  419. }
  420. if (strpos($lenc, 'iso-8859-') === 0) return array();
  421. }
  422. $ret = array();
  423. set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
  424. if (iconv('UTF-8', $encoding, 'a') === false) return false;
  425. for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars
  426. $c = chr($i);
  427. if (iconv('UTF-8', "$encoding//IGNORE", $c) === '') {
  428. // Reverse engineer: what's the UTF-8 equiv of this byte
  429. // sequence? This assumes that there's no variable width
  430. // encoding that doesn't support ASCII.
  431. $ret[iconv($encoding, 'UTF-8//IGNORE', $c)] = $c;
  432. }
  433. }
  434. restore_error_handler();
  435. $encodings[$encoding] = $ret;
  436. return $ret;
  437. }
  438. }