PageRenderTime 48ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/phpBB/includes/utf/utf_tools.php

http://github.com/phpbb/phpbb3
PHP | 1469 lines | 1124 code | 92 blank | 253 comment | 78 complexity | 0fc00f7f8430bae340e74168df1e1a29 MD5 | raw file
Possible License(s): AGPL-1.0
  1. <?php
  2. /**
  3. *
  4. * This file is part of the phpBB Forum Software package.
  5. *
  6. * @copyright (c) phpBB Limited <https://www.phpbb.com>
  7. * @license GNU General Public License, version 2 (GPL-2.0)
  8. *
  9. * For full copyright and license information, please see
  10. * the docs/CREDITS.txt file.
  11. *
  12. */
  13. /**
  14. */
// Stop execution unless the IN_PHPBB constant has been defined
if (!defined('IN_PHPBB'))
{
	exit;
}

// Enforce ASCII only string handling: the LC_CTYPE category is switched to the "C" locale
setlocale(LC_CTYPE, 'C');

/**
* Setup the UTF-8 portability layer
*
* Runs the three Patchwork\Utf8\Bootup initialisers below, in this order.
*/
Patchwork\Utf8\Bootup::initUtf8Encode();
Patchwork\Utf8\Bootup::initMbstring();
Patchwork\Utf8\Bootup::initIntl();
  27. /**
  28. * UTF-8 tools
  29. *
  30. * Whenever possible, these functions will try to use PHP's built-in functions or
  31. * extensions, otherwise they will default to custom routines.
  32. *
  33. */
  34. /**
  35. * UTF-8 aware alternative to strrpos
  36. * @ignore
  37. */
  38. function utf8_strrpos($str, $needle, $offset = null)
  39. {
  40. // Emulate behaviour of strrpos rather than raising warning
  41. if (empty($str))
  42. {
  43. return false;
  44. }
  45. if (is_null($offset))
  46. {
  47. return mb_strrpos($str, $needle);
  48. }
  49. else
  50. {
  51. return mb_strrpos($str, $needle, $offset);
  52. }
  53. }
  54. /**
  55. * UTF-8 aware alternative to strpos
  56. * @ignore
  57. */
  58. function utf8_strpos($str, $needle, $offset = null)
  59. {
  60. if (is_null($offset))
  61. {
  62. return mb_strpos($str, $needle);
  63. }
  64. else
  65. {
  66. return mb_strpos($str, $needle, $offset);
  67. }
  68. }
  69. /**
  70. * UTF-8 aware alternative to strtolower
  71. * @ignore
  72. */
  73. function utf8_strtolower($str)
  74. {
  75. return mb_strtolower($str);
  76. }
  77. /**
  78. * UTF-8 aware alternative to strtoupper
  79. * @ignore
  80. */
  81. function utf8_strtoupper($str)
  82. {
  83. return mb_strtoupper($str);
  84. }
  85. /**
  86. * UTF-8 aware alternative to substr
  87. * @ignore
  88. */
  89. function utf8_substr($str, $offset, $length = null)
  90. {
  91. if (is_null($length))
  92. {
  93. return mb_substr($str, $offset);
  94. }
  95. else
  96. {
  97. return mb_substr($str, $offset, $length);
  98. }
  99. }
  100. /**
  101. * Return the length (in characters) of a UTF-8 string
  102. * @ignore
  103. */
  104. function utf8_strlen($text)
  105. {
  106. return mb_strlen($text, 'utf-8');
  107. }
  108. /**
  109. * UTF-8 aware alternative to str_split
  110. * Convert a string to an array
  111. *
  112. * @author Harry Fuecks
  113. * @param string $str UTF-8 encoded
  114. * @param int $split_len number to characters to split string by
  115. * @return array characters in string reverses
  116. */
  117. function utf8_str_split($str, $split_len = 1)
  118. {
  119. if (!is_int($split_len) || $split_len < 1)
  120. {
  121. return false;
  122. }
  123. $len = utf8_strlen($str);
  124. if ($len <= $split_len)
  125. {
  126. return array($str);
  127. }
  128. preg_match_all('/.{' . $split_len . '}|[^\x00]{1,' . $split_len . '}$/us', $str, $ar);
  129. return $ar[0];
  130. }
  131. /**
  132. * UTF-8 aware alternative to strspn
  133. * Find length of initial segment matching the mask
  134. *
  135. * @author Harry Fuecks
  136. */
  137. function utf8_strspn($str, $mask, $start = null, $length = null)
  138. {
  139. if ($start !== null || $length !== null)
  140. {
  141. $str = utf8_substr($str, $start, $length);
  142. }
  143. preg_match('/^[' . $mask . ']+/u', $str, $matches);
  144. if (isset($matches[0]))
  145. {
  146. return utf8_strlen($matches[0]);
  147. }
  148. return 0;
  149. }
  150. /**
  151. * UTF-8 aware alternative to ucfirst
  152. * Make a string's first character uppercase
  153. *
  154. * @author Harry Fuecks
  155. * @param string
  156. * @return string with first character as upper case (if applicable)
  157. */
  158. function utf8_ucfirst($str)
  159. {
  160. switch (utf8_strlen($str))
  161. {
  162. case 0:
  163. return '';
  164. break;
  165. case 1:
  166. return utf8_strtoupper($str);
  167. break;
  168. default:
  169. preg_match('/^(.{1})(.*)$/us', $str, $matches);
  170. return utf8_strtoupper($matches[1]) . $matches[2];
  171. break;
  172. }
  173. }
  174. /**
  175. * Recode a string to UTF-8
  176. *
  177. * If the encoding is not supported, the string is returned as-is
  178. *
  179. * @param string $string Original string
  180. * @param string $encoding Original encoding (lowered)
  181. * @return string The string, encoded in UTF-8
  182. */
  183. function utf8_recode($string, $encoding)
  184. {
  185. $encoding = strtolower($encoding);
  186. if ($encoding == 'utf-8' || !is_string($string) || empty($string))
  187. {
  188. return $string;
  189. }
  190. // we force iso-8859-1 to be cp1252
  191. if ($encoding == 'iso-8859-1')
  192. {
  193. $encoding = 'cp1252';
  194. }
  195. // convert iso-8859-8-i to iso-8859-8
  196. else if ($encoding == 'iso-8859-8-i')
  197. {
  198. $encoding = 'iso-8859-8';
  199. $string = hebrev($string);
  200. }
  201. // First, try iconv()
  202. if (function_exists('iconv'))
  203. {
  204. $ret = @iconv($encoding, 'utf-8', $string);
  205. if (!empty($ret))
  206. {
  207. return $ret;
  208. }
  209. }
  210. // Try the mb_string extension
  211. if (function_exists('mb_convert_encoding'))
  212. {
  213. // mbstring is nasty on PHP4, we must make *sure* that we send a good encoding
  214. switch ($encoding)
  215. {
  216. case 'iso-8859-1':
  217. case 'iso-8859-2':
  218. case 'iso-8859-4':
  219. case 'iso-8859-7':
  220. case 'iso-8859-9':
  221. case 'iso-8859-15':
  222. case 'windows-1251':
  223. case 'windows-1252':
  224. case 'cp1252':
  225. case 'shift_jis':
  226. case 'euc-kr':
  227. case 'big5':
  228. case 'gb2312':
  229. $ret = @mb_convert_encoding($string, 'utf-8', $encoding);
  230. if (!empty($ret))
  231. {
  232. return $ret;
  233. }
  234. }
  235. }
  236. // Try the recode extension
  237. if (function_exists('recode_string'))
  238. {
  239. $ret = @recode_string($encoding . '..utf-8', $string);
  240. if (!empty($ret))
  241. {
  242. return $ret;
  243. }
  244. }
  245. // If nothing works, check if we have a custom transcoder available
  246. if (!preg_match('#^[a-z0-9_ \\-]+$#', $encoding))
  247. {
  248. // Make sure the encoding name is alphanumeric, we don't want it to be abused into loading arbitrary files
  249. trigger_error('Unknown encoding: ' . $encoding, E_USER_ERROR);
  250. }
  251. global $phpbb_root_path, $phpEx;
  252. // iso-8859-* character encoding
  253. if (preg_match('/iso[_ -]?8859[_ -]?(\\d+)/', $encoding, $array))
  254. {
  255. switch ($array[1])
  256. {
  257. case '1':
  258. case '2':
  259. case '4':
  260. case '7':
  261. case '8':
  262. case '9':
  263. case '15':
  264. if (!function_exists('iso_8859_' . $array[1]))
  265. {
  266. if (!file_exists($phpbb_root_path . 'includes/utf/data/recode_basic.' . $phpEx))
  267. {
  268. trigger_error('Basic reencoder file is missing', E_USER_ERROR);
  269. }
  270. include($phpbb_root_path . 'includes/utf/data/recode_basic.' . $phpEx);
  271. }
  272. return call_user_func('iso_8859_' . $array[1], $string);
  273. break;
  274. default:
  275. trigger_error('Unknown encoding: ' . $encoding, E_USER_ERROR);
  276. break;
  277. }
  278. }
  279. // CP/WIN character encoding
  280. if (preg_match('/(?:cp|windows)[_\- ]?(\\d+)/', $encoding, $array))
  281. {
  282. switch ($array[1])
  283. {
  284. case '932':
  285. break;
  286. case '1250':
  287. case '1251':
  288. case '1252':
  289. case '1254':
  290. case '1255':
  291. case '1256':
  292. case '1257':
  293. case '874':
  294. if (!function_exists('cp' . $array[1]))
  295. {
  296. if (!file_exists($phpbb_root_path . 'includes/utf/data/recode_basic.' . $phpEx))
  297. {
  298. trigger_error('Basic reencoder file is missing', E_USER_ERROR);
  299. }
  300. include($phpbb_root_path . 'includes/utf/data/recode_basic.' . $phpEx);
  301. }
  302. return call_user_func('cp' . $array[1], $string);
  303. break;
  304. default:
  305. trigger_error('Unknown encoding: ' . $encoding, E_USER_ERROR);
  306. break;
  307. }
  308. }
  309. // TIS-620
  310. if (preg_match('/tis[_ -]?620/', $encoding))
  311. {
  312. if (!function_exists('tis_620'))
  313. {
  314. if (!file_exists($phpbb_root_path . 'includes/utf/data/recode_basic.' . $phpEx))
  315. {
  316. trigger_error('Basic reencoder file is missing', E_USER_ERROR);
  317. }
  318. include($phpbb_root_path . 'includes/utf/data/recode_basic.' . $phpEx);
  319. }
  320. return tis_620($string);
  321. }
  322. // SJIS
  323. if (preg_match('/sjis(?:[_ -]?win)?|(?:cp|ibm)[_ -]?932|shift[_ -]?jis/', $encoding))
  324. {
  325. if (!function_exists('sjis'))
  326. {
  327. if (!file_exists($phpbb_root_path . 'includes/utf/data/recode_cjk.' . $phpEx))
  328. {
  329. trigger_error('CJK reencoder file is missing', E_USER_ERROR);
  330. }
  331. include($phpbb_root_path . 'includes/utf/data/recode_cjk.' . $phpEx);
  332. }
  333. return sjis($string);
  334. }
  335. // EUC_KR
  336. if (preg_match('/euc[_ -]?kr/', $encoding))
  337. {
  338. if (!function_exists('euc_kr'))
  339. {
  340. if (!file_exists($phpbb_root_path . 'includes/utf/data/recode_cjk.' . $phpEx))
  341. {
  342. trigger_error('CJK reencoder file is missing', E_USER_ERROR);
  343. }
  344. include($phpbb_root_path . 'includes/utf/data/recode_cjk.' . $phpEx);
  345. }
  346. return euc_kr($string);
  347. }
  348. // BIG-5
  349. if (preg_match('/big[_ -]?5/', $encoding))
  350. {
  351. if (!function_exists('big5'))
  352. {
  353. if (!file_exists($phpbb_root_path . 'includes/utf/data/recode_cjk.' . $phpEx))
  354. {
  355. trigger_error('CJK reencoder file is missing', E_USER_ERROR);
  356. }
  357. include($phpbb_root_path . 'includes/utf/data/recode_cjk.' . $phpEx);
  358. }
  359. return big5($string);
  360. }
  361. // GB2312
  362. if (preg_match('/gb[_ -]?2312/', $encoding))
  363. {
  364. if (!function_exists('gb2312'))
  365. {
  366. if (!file_exists($phpbb_root_path . 'includes/utf/data/recode_cjk.' . $phpEx))
  367. {
  368. trigger_error('CJK reencoder file is missing', E_USER_ERROR);
  369. }
  370. include($phpbb_root_path . 'includes/utf/data/recode_cjk.' . $phpEx);
  371. }
  372. return gb2312($string);
  373. }
  374. // Trigger an error?! Fow now just give bad data :-(
  375. trigger_error('Unknown encoding: ' . $encoding, E_USER_ERROR);
  376. }
  377. /**
  378. * Replace some special UTF-8 chars that are not in ASCII with their UCR.
  379. * using their Numeric Character Reference's Hexadecimal notation.
  380. *
  381. * Doesn't interfere with Japanese or Cyrillic etc.
  382. * Unicode character visualization will depend on the character support
  383. * of your web browser and the fonts installed on your system.
  384. *
  385. * @see https://en.wikibooks.org/wiki/Unicode/Character_reference/1F000-1FFFF
  386. *
  387. * @param string $text UTF-8 string in NFC
  388. * @return string ASCII string using NCR for non-ASCII chars
  389. */
  390. function utf8_encode_ucr($text)
  391. {
  392. return preg_replace_callback('/[\\xF0-\\xF4].../', 'utf8_encode_ncr_callback', $text);
  393. }
  394. /**
  395. * Replace all UTF-8 chars that are not in ASCII with their NCR
  396. * using their Numeric Character Reference's Hexadecimal notation.
  397. *
  398. * @param string $text UTF-8 string in NFC
  399. * @return string ASCII string using NCRs for non-ASCII chars
  400. */
  401. function utf8_encode_ncr($text)
  402. {
  403. return preg_replace_callback('#[\\xC2-\\xF4][\\x80-\\xBF]{1,3}#', 'utf8_encode_ncr_callback', $text);
  404. }
  405. /**
  406. * Callback used in utf8_encode_ncr() and utf8_encode_ucr()
  407. *
  408. * Takes a UTF-8 char and replaces it with its NCR. Attention, $m is an array
  409. *
  410. * @param array $m 0-based numerically indexed array passed by preg_replace_callback()
  411. * @return string A HTML NCR if the character is valid, or the original string otherwise
  412. */
  413. function utf8_encode_ncr_callback($m)
  414. {
  415. return '&#' . utf8_ord($m[0]) . ';';
  416. }
  417. /**
  418. * Converts a UTF-8 char to an NCR
  419. *
  420. * @param string $chr UTF-8 char
  421. * @return integer UNICODE code point
  422. */
  423. function utf8_ord($chr)
  424. {
  425. switch (strlen($chr))
  426. {
  427. case 1:
  428. return ord($chr);
  429. break;
  430. case 2:
  431. return ((ord($chr[0]) & 0x1F) << 6) | (ord($chr[1]) & 0x3F);
  432. break;
  433. case 3:
  434. return ((ord($chr[0]) & 0x0F) << 12) | ((ord($chr[1]) & 0x3F) << 6) | (ord($chr[2]) & 0x3F);
  435. break;
  436. case 4:
  437. return ((ord($chr[0]) & 0x07) << 18) | ((ord($chr[1]) & 0x3F) << 12) | ((ord($chr[2]) & 0x3F) << 6) | (ord($chr[3]) & 0x3F);
  438. break;
  439. default:
  440. return $chr;
  441. }
  442. }
  443. /**
  444. * Converts an NCR to a UTF-8 char
  445. *
  446. * @param int $cp UNICODE code point
  447. * @return string UTF-8 char
  448. */
  449. function utf8_chr($cp)
  450. {
  451. if ($cp > 0xFFFF)
  452. {
  453. return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
  454. }
  455. else if ($cp > 0x7FF)
  456. {
  457. return chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
  458. }
  459. else if ($cp > 0x7F)
  460. {
  461. return chr(0xC0 | ($cp >> 6)) . chr(0x80 | ($cp & 0x3F));
  462. }
  463. else
  464. {
  465. return chr($cp);
  466. }
  467. }
  468. /**
  469. * Convert Numeric Character References to UTF-8 chars
  470. *
  471. * Notes:
  472. * - we do not convert NCRs recursively, if you pass &#38;#38; it will return &#38;
  473. * - we DO NOT check for the existence of the Unicode characters, therefore an entity may be converted to an inexistent codepoint
  474. *
  475. * @param string $text String to convert, encoded in UTF-8 (no normal form required)
  476. * @return string UTF-8 string where NCRs have been replaced with the actual chars
  477. */
  478. function utf8_decode_ncr($text)
  479. {
  480. return preg_replace_callback('/&#([0-9]{1,6}|x[0-9A-F]{1,5});/i', 'utf8_decode_ncr_callback', $text);
  481. }
  482. /**
  483. * Callback used in decode_ncr()
  484. *
  485. * Takes a NCR (in decimal or hexadecimal) and returns a UTF-8 char. Attention, $m is an array.
  486. * It will ignore most of invalid NCRs, but not all!
  487. *
  488. * @param array $m 0-based numerically indexed array passed by preg_replace_callback()
  489. * @return string UTF-8 char
  490. */
  491. function utf8_decode_ncr_callback($m)
  492. {
  493. $cp = (strncasecmp($m[1], 'x', 1)) ? $m[1] : hexdec(substr($m[1], 1));
  494. return utf8_chr($cp);
  495. }
  496. /**
  497. * Case folds a unicode string as per Unicode 5.0, section 3.13
  498. *
  499. * @param string $text text to be case folded
  500. * @param string $option determines how we will fold the cases
  501. * @return string case folded text
  502. */
  503. function utf8_case_fold($text, $option = 'full')
  504. {
  505. static $uniarray = array();
  506. global $phpbb_root_path, $phpEx;
  507. // common is always set
  508. if (!isset($uniarray['c']))
  509. {
  510. $uniarray['c'] = include($phpbb_root_path . 'includes/utf/data/case_fold_c.' . $phpEx);
  511. }
  512. // only set full if we need to
  513. if ($option === 'full' && !isset($uniarray['f']))
  514. {
  515. $uniarray['f'] = include($phpbb_root_path . 'includes/utf/data/case_fold_f.' . $phpEx);
  516. }
  517. // only set simple if we need to
  518. if ($option !== 'full' && !isset($uniarray['s']))
  519. {
  520. $uniarray['s'] = include($phpbb_root_path . 'includes/utf/data/case_fold_s.' . $phpEx);
  521. }
  522. // common is always replaced
  523. $text = strtr($text, $uniarray['c']);
  524. if ($option === 'full')
  525. {
  526. // full replaces a character with multiple characters
  527. $text = strtr($text, $uniarray['f']);
  528. }
  529. else
  530. {
  531. // simple replaces a character with another character
  532. $text = strtr($text, $uniarray['s']);
  533. }
  534. return $text;
  535. }
  536. /**
  537. * Takes the input and does a "special" case fold. It does minor normalization
  538. * and returns NFKC compatable text
  539. *
  540. * @param string $text text to be case folded
  541. * @param string $option determines how we will fold the cases
  542. * @return string case folded text
  543. */
  544. function utf8_case_fold_nfkc($text, $option = 'full')
  545. {
  546. static $fc_nfkc_closure = array(
  547. "\xCD\xBA" => "\x20\xCE\xB9",
  548. "\xCF\x92" => "\xCF\x85",
  549. "\xCF\x93" => "\xCF\x8D",
  550. "\xCF\x94" => "\xCF\x8B",
  551. "\xCF\xB2" => "\xCF\x83",
  552. "\xCF\xB9" => "\xCF\x83",
  553. "\xE1\xB4\xAC" => "\x61",
  554. "\xE1\xB4\xAD" => "\xC3\xA6",
  555. "\xE1\xB4\xAE" => "\x62",
  556. "\xE1\xB4\xB0" => "\x64",
  557. "\xE1\xB4\xB1" => "\x65",
  558. "\xE1\xB4\xB2" => "\xC7\x9D",
  559. "\xE1\xB4\xB3" => "\x67",
  560. "\xE1\xB4\xB4" => "\x68",
  561. "\xE1\xB4\xB5" => "\x69",
  562. "\xE1\xB4\xB6" => "\x6A",
  563. "\xE1\xB4\xB7" => "\x6B",
  564. "\xE1\xB4\xB8" => "\x6C",
  565. "\xE1\xB4\xB9" => "\x6D",
  566. "\xE1\xB4\xBA" => "\x6E",
  567. "\xE1\xB4\xBC" => "\x6F",
  568. "\xE1\xB4\xBD" => "\xC8\xA3",
  569. "\xE1\xB4\xBE" => "\x70",
  570. "\xE1\xB4\xBF" => "\x72",
  571. "\xE1\xB5\x80" => "\x74",
  572. "\xE1\xB5\x81" => "\x75",
  573. "\xE1\xB5\x82" => "\x77",
  574. "\xE2\x82\xA8" => "\x72\x73",
  575. "\xE2\x84\x82" => "\x63",
  576. "\xE2\x84\x83" => "\xC2\xB0\x63",
  577. "\xE2\x84\x87" => "\xC9\x9B",
  578. "\xE2\x84\x89" => "\xC2\xB0\x66",
  579. "\xE2\x84\x8B" => "\x68",
  580. "\xE2\x84\x8C" => "\x68",
  581. "\xE2\x84\x8D" => "\x68",
  582. "\xE2\x84\x90" => "\x69",
  583. "\xE2\x84\x91" => "\x69",
  584. "\xE2\x84\x92" => "\x6C",
  585. "\xE2\x84\x95" => "\x6E",
  586. "\xE2\x84\x96" => "\x6E\x6F",
  587. "\xE2\x84\x99" => "\x70",
  588. "\xE2\x84\x9A" => "\x71",
  589. "\xE2\x84\x9B" => "\x72",
  590. "\xE2\x84\x9C" => "\x72",
  591. "\xE2\x84\x9D" => "\x72",
  592. "\xE2\x84\xA0" => "\x73\x6D",
  593. "\xE2\x84\xA1" => "\x74\x65\x6C",
  594. "\xE2\x84\xA2" => "\x74\x6D",
  595. "\xE2\x84\xA4" => "\x7A",
  596. "\xE2\x84\xA8" => "\x7A",
  597. "\xE2\x84\xAC" => "\x62",
  598. "\xE2\x84\xAD" => "\x63",
  599. "\xE2\x84\xB0" => "\x65",
  600. "\xE2\x84\xB1" => "\x66",
  601. "\xE2\x84\xB3" => "\x6D",
  602. "\xE2\x84\xBB" => "\x66\x61\x78",
  603. "\xE2\x84\xBE" => "\xCE\xB3",
  604. "\xE2\x84\xBF" => "\xCF\x80",
  605. "\xE2\x85\x85" => "\x64",
  606. "\xE3\x89\x90" => "\x70\x74\x65",
  607. "\xE3\x8B\x8C" => "\x68\x67",
  608. "\xE3\x8B\x8E" => "\x65\x76",
  609. "\xE3\x8B\x8F" => "\x6C\x74\x64",
  610. "\xE3\x8D\xB1" => "\x68\x70\x61",
  611. "\xE3\x8D\xB3" => "\x61\x75",
  612. "\xE3\x8D\xB5" => "\x6F\x76",
  613. "\xE3\x8D\xBA" => "\x69\x75",
  614. "\xE3\x8E\x80" => "\x70\x61",
  615. "\xE3\x8E\x81" => "\x6E\x61",
  616. "\xE3\x8E\x82" => "\xCE\xBC\x61",
  617. "\xE3\x8E\x83" => "\x6D\x61",
  618. "\xE3\x8E\x84" => "\x6B\x61",
  619. "\xE3\x8E\x85" => "\x6B\x62",
  620. "\xE3\x8E\x86" => "\x6D\x62",
  621. "\xE3\x8E\x87" => "\x67\x62",
  622. "\xE3\x8E\x8A" => "\x70\x66",
  623. "\xE3\x8E\x8B" => "\x6E\x66",
  624. "\xE3\x8E\x8C" => "\xCE\xBC\x66",
  625. "\xE3\x8E\x90" => "\x68\x7A",
  626. "\xE3\x8E\x91" => "\x6B\x68\x7A",
  627. "\xE3\x8E\x92" => "\x6D\x68\x7A",
  628. "\xE3\x8E\x93" => "\x67\x68\x7A",
  629. "\xE3\x8E\x94" => "\x74\x68\x7A",
  630. "\xE3\x8E\xA9" => "\x70\x61",
  631. "\xE3\x8E\xAA" => "\x6B\x70\x61",
  632. "\xE3\x8E\xAB" => "\x6D\x70\x61",
  633. "\xE3\x8E\xAC" => "\x67\x70\x61",
  634. "\xE3\x8E\xB4" => "\x70\x76",
  635. "\xE3\x8E\xB5" => "\x6E\x76",
  636. "\xE3\x8E\xB6" => "\xCE\xBC\x76",
  637. "\xE3\x8E\xB7" => "\x6D\x76",
  638. "\xE3\x8E\xB8" => "\x6B\x76",
  639. "\xE3\x8E\xB9" => "\x6D\x76",
  640. "\xE3\x8E\xBA" => "\x70\x77",
  641. "\xE3\x8E\xBB" => "\x6E\x77",
  642. "\xE3\x8E\xBC" => "\xCE\xBC\x77",
  643. "\xE3\x8E\xBD" => "\x6D\x77",
  644. "\xE3\x8E\xBE" => "\x6B\x77",
  645. "\xE3\x8E\xBF" => "\x6D\x77",
  646. "\xE3\x8F\x80" => "\x6B\xCF\x89",
  647. "\xE3\x8F\x81" => "\x6D\xCF\x89",
  648. "\xE3\x8F\x83" => "\x62\x71",
  649. "\xE3\x8F\x86" => "\x63\xE2\x88\x95\x6B\x67",
  650. "\xE3\x8F\x87" => "\x63\x6F\x2E",
  651. "\xE3\x8F\x88" => "\x64\x62",
  652. "\xE3\x8F\x89" => "\x67\x79",
  653. "\xE3\x8F\x8B" => "\x68\x70",
  654. "\xE3\x8F\x8D" => "\x6B\x6B",
  655. "\xE3\x8F\x8E" => "\x6B\x6D",
  656. "\xE3\x8F\x97" => "\x70\x68",
  657. "\xE3\x8F\x99" => "\x70\x70\x6D",
  658. "\xE3\x8F\x9A" => "\x70\x72",
  659. "\xE3\x8F\x9C" => "\x73\x76",
  660. "\xE3\x8F\x9D" => "\x77\x62",
  661. "\xE3\x8F\x9E" => "\x76\xE2\x88\x95\x6D",
  662. "\xE3\x8F\x9F" => "\x61\xE2\x88\x95\x6D",
  663. "\xF0\x9D\x90\x80" => "\x61",
  664. "\xF0\x9D\x90\x81" => "\x62",
  665. "\xF0\x9D\x90\x82" => "\x63",
  666. "\xF0\x9D\x90\x83" => "\x64",
  667. "\xF0\x9D\x90\x84" => "\x65",
  668. "\xF0\x9D\x90\x85" => "\x66",
  669. "\xF0\x9D\x90\x86" => "\x67",
  670. "\xF0\x9D\x90\x87" => "\x68",
  671. "\xF0\x9D\x90\x88" => "\x69",
  672. "\xF0\x9D\x90\x89" => "\x6A",
  673. "\xF0\x9D\x90\x8A" => "\x6B",
  674. "\xF0\x9D\x90\x8B" => "\x6C",
  675. "\xF0\x9D\x90\x8C" => "\x6D",
  676. "\xF0\x9D\x90\x8D" => "\x6E",
  677. "\xF0\x9D\x90\x8E" => "\x6F",
  678. "\xF0\x9D\x90\x8F" => "\x70",
  679. "\xF0\x9D\x90\x90" => "\x71",
  680. "\xF0\x9D\x90\x91" => "\x72",
  681. "\xF0\x9D\x90\x92" => "\x73",
  682. "\xF0\x9D\x90\x93" => "\x74",
  683. "\xF0\x9D\x90\x94" => "\x75",
  684. "\xF0\x9D\x90\x95" => "\x76",
  685. "\xF0\x9D\x90\x96" => "\x77",
  686. "\xF0\x9D\x90\x97" => "\x78",
  687. "\xF0\x9D\x90\x98" => "\x79",
  688. "\xF0\x9D\x90\x99" => "\x7A",
  689. "\xF0\x9D\x90\xB4" => "\x61",
  690. "\xF0\x9D\x90\xB5" => "\x62",
  691. "\xF0\x9D\x90\xB6" => "\x63",
  692. "\xF0\x9D\x90\xB7" => "\x64",
  693. "\xF0\x9D\x90\xB8" => "\x65",
  694. "\xF0\x9D\x90\xB9" => "\x66",
  695. "\xF0\x9D\x90\xBA" => "\x67",
  696. "\xF0\x9D\x90\xBB" => "\x68",
  697. "\xF0\x9D\x90\xBC" => "\x69",
  698. "\xF0\x9D\x90\xBD" => "\x6A",
  699. "\xF0\x9D\x90\xBE" => "\x6B",
  700. "\xF0\x9D\x90\xBF" => "\x6C",
  701. "\xF0\x9D\x91\x80" => "\x6D",
  702. "\xF0\x9D\x91\x81" => "\x6E",
  703. "\xF0\x9D\x91\x82" => "\x6F",
  704. "\xF0\x9D\x91\x83" => "\x70",
  705. "\xF0\x9D\x91\x84" => "\x71",
  706. "\xF0\x9D\x91\x85" => "\x72",
  707. "\xF0\x9D\x91\x86" => "\x73",
  708. "\xF0\x9D\x91\x87" => "\x74",
  709. "\xF0\x9D\x91\x88" => "\x75",
  710. "\xF0\x9D\x91\x89" => "\x76",
  711. "\xF0\x9D\x91\x8A" => "\x77",
  712. "\xF0\x9D\x91\x8B" => "\x78",
  713. "\xF0\x9D\x91\x8C" => "\x79",
  714. "\xF0\x9D\x91\x8D" => "\x7A",
  715. "\xF0\x9D\x91\xA8" => "\x61",
  716. "\xF0\x9D\x91\xA9" => "\x62",
  717. "\xF0\x9D\x91\xAA" => "\x63",
  718. "\xF0\x9D\x91\xAB" => "\x64",
  719. "\xF0\x9D\x91\xAC" => "\x65",
  720. "\xF0\x9D\x91\xAD" => "\x66",
  721. "\xF0\x9D\x91\xAE" => "\x67",
  722. "\xF0\x9D\x91\xAF" => "\x68",
  723. "\xF0\x9D\x91\xB0" => "\x69",
  724. "\xF0\x9D\x91\xB1" => "\x6A",
  725. "\xF0\x9D\x91\xB2" => "\x6B",
  726. "\xF0\x9D\x91\xB3" => "\x6C",
  727. "\xF0\x9D\x91\xB4" => "\x6D",
  728. "\xF0\x9D\x91\xB5" => "\x6E",
  729. "\xF0\x9D\x91\xB6" => "\x6F",
  730. "\xF0\x9D\x91\xB7" => "\x70",
  731. "\xF0\x9D\x91\xB8" => "\x71",
  732. "\xF0\x9D\x91\xB9" => "\x72",
  733. "\xF0\x9D\x91\xBA" => "\x73",
  734. "\xF0\x9D\x91\xBB" => "\x74",
  735. "\xF0\x9D\x91\xBC" => "\x75",
  736. "\xF0\x9D\x91\xBD" => "\x76",
  737. "\xF0\x9D\x91\xBE" => "\x77",
  738. "\xF0\x9D\x91\xBF" => "\x78",
  739. "\xF0\x9D\x92\x80" => "\x79",
  740. "\xF0\x9D\x92\x81" => "\x7A",
  741. "\xF0\x9D\x92\x9C" => "\x61",
  742. "\xF0\x9D\x92\x9E" => "\x63",
  743. "\xF0\x9D\x92\x9F" => "\x64",
  744. "\xF0\x9D\x92\xA2" => "\x67",
  745. "\xF0\x9D\x92\xA5" => "\x6A",
  746. "\xF0\x9D\x92\xA6" => "\x6B",
  747. "\xF0\x9D\x92\xA9" => "\x6E",
  748. "\xF0\x9D\x92\xAA" => "\x6F",
  749. "\xF0\x9D\x92\xAB" => "\x70",
  750. "\xF0\x9D\x92\xAC" => "\x71",
  751. "\xF0\x9D\x92\xAE" => "\x73",
  752. "\xF0\x9D\x92\xAF" => "\x74",
  753. "\xF0\x9D\x92\xB0" => "\x75",
  754. "\xF0\x9D\x92\xB1" => "\x76",
  755. "\xF0\x9D\x92\xB2" => "\x77",
  756. "\xF0\x9D\x92\xB3" => "\x78",
  757. "\xF0\x9D\x92\xB4" => "\x79",
  758. "\xF0\x9D\x92\xB5" => "\x7A",
  759. "\xF0\x9D\x93\x90" => "\x61",
  760. "\xF0\x9D\x93\x91" => "\x62",
  761. "\xF0\x9D\x93\x92" => "\x63",
  762. "\xF0\x9D\x93\x93" => "\x64",
  763. "\xF0\x9D\x93\x94" => "\x65",
  764. "\xF0\x9D\x93\x95" => "\x66",
  765. "\xF0\x9D\x93\x96" => "\x67",
  766. "\xF0\x9D\x93\x97" => "\x68",
  767. "\xF0\x9D\x93\x98" => "\x69",
  768. "\xF0\x9D\x93\x99" => "\x6A",
  769. "\xF0\x9D\x93\x9A" => "\x6B",
  770. "\xF0\x9D\x93\x9B" => "\x6C",
  771. "\xF0\x9D\x93\x9C" => "\x6D",
  772. "\xF0\x9D\x93\x9D" => "\x6E",
  773. "\xF0\x9D\x93\x9E" => "\x6F",
  774. "\xF0\x9D\x93\x9F" => "\x70",
  775. "\xF0\x9D\x93\xA0" => "\x71",
  776. "\xF0\x9D\x93\xA1" => "\x72",
  777. "\xF0\x9D\x93\xA2" => "\x73",
  778. "\xF0\x9D\x93\xA3" => "\x74",
  779. "\xF0\x9D\x93\xA4" => "\x75",
  780. "\xF0\x9D\x93\xA5" => "\x76",
  781. "\xF0\x9D\x93\xA6" => "\x77",
  782. "\xF0\x9D\x93\xA7" => "\x78",
  783. "\xF0\x9D\x93\xA8" => "\x79",
  784. "\xF0\x9D\x93\xA9" => "\x7A",
  785. "\xF0\x9D\x94\x84" => "\x61",
  786. "\xF0\x9D\x94\x85" => "\x62",
  787. "\xF0\x9D\x94\x87" => "\x64",
  788. "\xF0\x9D\x94\x88" => "\x65",
  789. "\xF0\x9D\x94\x89" => "\x66",
  790. "\xF0\x9D\x94\x8A" => "\x67",
  791. "\xF0\x9D\x94\x8D" => "\x6A",
  792. "\xF0\x9D\x94\x8E" => "\x6B",
  793. "\xF0\x9D\x94\x8F" => "\x6C",
  794. "\xF0\x9D\x94\x90" => "\x6D",
  795. "\xF0\x9D\x94\x91" => "\x6E",
  796. "\xF0\x9D\x94\x92" => "\x6F",
  797. "\xF0\x9D\x94\x93" => "\x70",
  798. "\xF0\x9D\x94\x94" => "\x71",
  799. "\xF0\x9D\x94\x96" => "\x73",
  800. "\xF0\x9D\x94\x97" => "\x74",
  801. "\xF0\x9D\x94\x98" => "\x75",
  802. "\xF0\x9D\x94\x99" => "\x76",
  803. "\xF0\x9D\x94\x9A" => "\x77",
  804. "\xF0\x9D\x94\x9B" => "\x78",
  805. "\xF0\x9D\x94\x9C" => "\x79",
  806. "\xF0\x9D\x94\xB8" => "\x61",
  807. "\xF0\x9D\x94\xB9" => "\x62",
  808. "\xF0\x9D\x94\xBB" => "\x64",
  809. "\xF0\x9D\x94\xBC" => "\x65",
  810. "\xF0\x9D\x94\xBD" => "\x66",
  811. "\xF0\x9D\x94\xBE" => "\x67",
  812. "\xF0\x9D\x95\x80" => "\x69",
  813. "\xF0\x9D\x95\x81" => "\x6A",
  814. "\xF0\x9D\x95\x82" => "\x6B",
  815. "\xF0\x9D\x95\x83" => "\x6C",
  816. "\xF0\x9D\x95\x84" => "\x6D",
  817. "\xF0\x9D\x95\x86" => "\x6F",
  818. "\xF0\x9D\x95\x8A" => "\x73",
  819. "\xF0\x9D\x95\x8B" => "\x74",
  820. "\xF0\x9D\x95\x8C" => "\x75",
  821. "\xF0\x9D\x95\x8D" => "\x76",
  822. "\xF0\x9D\x95\x8E" => "\x77",
  823. "\xF0\x9D\x95\x8F" => "\x78",
  824. "\xF0\x9D\x95\x90" => "\x79",
  825. "\xF0\x9D\x95\xAC" => "\x61",
  826. "\xF0\x9D\x95\xAD" => "\x62",
  827. "\xF0\x9D\x95\xAE" => "\x63",
  828. "\xF0\x9D\x95\xAF" => "\x64",
  829. "\xF0\x9D\x95\xB0" => "\x65",
  830. "\xF0\x9D\x95\xB1" => "\x66",
  831. "\xF0\x9D\x95\xB2" => "\x67",
  832. "\xF0\x9D\x95\xB3" => "\x68",
  833. "\xF0\x9D\x95\xB4" => "\x69",
  834. "\xF0\x9D\x95\xB5" => "\x6A",
  835. "\xF0\x9D\x95\xB6" => "\x6B",
  836. "\xF0\x9D\x95\xB7" => "\x6C",
  837. "\xF0\x9D\x95\xB8" => "\x6D",
  838. "\xF0\x9D\x95\xB9" => "\x6E",
  839. "\xF0\x9D\x95\xBA" => "\x6F",
  840. "\xF0\x9D\x95\xBB" => "\x70",
  841. "\xF0\x9D\x95\xBC" => "\x71",
  842. "\xF0\x9D\x95\xBD" => "\x72",
  843. "\xF0\x9D\x95\xBE" => "\x73",
  844. "\xF0\x9D\x95\xBF" => "\x74",
  845. "\xF0\x9D\x96\x80" => "\x75",
  846. "\xF0\x9D\x96\x81" => "\x76",
  847. "\xF0\x9D\x96\x82" => "\x77",
  848. "\xF0\x9D\x96\x83" => "\x78",
  849. "\xF0\x9D\x96\x84" => "\x79",
  850. "\xF0\x9D\x96\x85" => "\x7A",
  851. "\xF0\x9D\x96\xA0" => "\x61",
  852. "\xF0\x9D\x96\xA1" => "\x62",
  853. "\xF0\x9D\x96\xA2" => "\x63",
  854. "\xF0\x9D\x96\xA3" => "\x64",
  855. "\xF0\x9D\x96\xA4" => "\x65",
  856. "\xF0\x9D\x96\xA5" => "\x66",
  857. "\xF0\x9D\x96\xA6" => "\x67",
  858. "\xF0\x9D\x96\xA7" => "\x68",
  859. "\xF0\x9D\x96\xA8" => "\x69",
  860. "\xF0\x9D\x96\xA9" => "\x6A",
  861. "\xF0\x9D\x96\xAA" => "\x6B",
  862. "\xF0\x9D\x96\xAB" => "\x6C",
  863. "\xF0\x9D\x96\xAC" => "\x6D",
  864. "\xF0\x9D\x96\xAD" => "\x6E",
  865. "\xF0\x9D\x96\xAE" => "\x6F",
  866. "\xF0\x9D\x96\xAF" => "\x70",
  867. "\xF0\x9D\x96\xB0" => "\x71",
  868. "\xF0\x9D\x96\xB1" => "\x72",
  869. "\xF0\x9D\x96\xB2" => "\x73",
  870. "\xF0\x9D\x96\xB3" => "\x74",
  871. "\xF0\x9D\x96\xB4" => "\x75",
  872. "\xF0\x9D\x96\xB5" => "\x76",
  873. "\xF0\x9D\x96\xB6" => "\x77",
  874. "\xF0\x9D\x96\xB7" => "\x78",
  875. "\xF0\x9D\x96\xB8" => "\x79",
  876. "\xF0\x9D\x96\xB9" => "\x7A",
  877. "\xF0\x9D\x97\x94" => "\x61",
  878. "\xF0\x9D\x97\x95" => "\x62",
  879. "\xF0\x9D\x97\x96" => "\x63",
  880. "\xF0\x9D\x97\x97" => "\x64",
  881. "\xF0\x9D\x97\x98" => "\x65",
  882. "\xF0\x9D\x97\x99" => "\x66",
  883. "\xF0\x9D\x97\x9A" => "\x67",
  884. "\xF0\x9D\x97\x9B" => "\x68",
  885. "\xF0\x9D\x97\x9C" => "\x69",
  886. "\xF0\x9D\x97\x9D" => "\x6A",
  887. "\xF0\x9D\x97\x9E" => "\x6B",
  888. "\xF0\x9D\x97\x9F" => "\x6C",
  889. "\xF0\x9D\x97\xA0" => "\x6D",
  890. "\xF0\x9D\x97\xA1" => "\x6E",
  891. "\xF0\x9D\x97\xA2" => "\x6F",
  892. "\xF0\x9D\x97\xA3" => "\x70",
  893. "\xF0\x9D\x97\xA4" => "\x71",
  894. "\xF0\x9D\x97\xA5" => "\x72",
  895. "\xF0\x9D\x97\xA6" => "\x73",
  896. "\xF0\x9D\x97\xA7" => "\x74",
  897. "\xF0\x9D\x97\xA8" => "\x75",
  898. "\xF0\x9D\x97\xA9" => "\x76",
  899. "\xF0\x9D\x97\xAA" => "\x77",
  900. "\xF0\x9D\x97\xAB" => "\x78",
  901. "\xF0\x9D\x97\xAC" => "\x79",
  902. "\xF0\x9D\x97\xAD" => "\x7A",
  903. "\xF0\x9D\x98\x88" => "\x61",
  904. "\xF0\x9D\x98\x89" => "\x62",
  905. "\xF0\x9D\x98\x8A" => "\x63",
  906. "\xF0\x9D\x98\x8B" => "\x64",
  907. "\xF0\x9D\x98\x8C" => "\x65",
  908. "\xF0\x9D\x98\x8D" => "\x66",
  909. "\xF0\x9D\x98\x8E" => "\x67",
  910. "\xF0\x9D\x98\x8F" => "\x68",
  911. "\xF0\x9D\x98\x90" => "\x69",
  912. "\xF0\x9D\x98\x91" => "\x6A",
  913. "\xF0\x9D\x98\x92" => "\x6B",
  914. "\xF0\x9D\x98\x93" => "\x6C",
  915. "\xF0\x9D\x98\x94" => "\x6D",
  916. "\xF0\x9D\x98\x95" => "\x6E",
  917. "\xF0\x9D\x98\x96" => "\x6F",
  918. "\xF0\x9D\x98\x97" => "\x70",
  919. "\xF0\x9D\x98\x98" => "\x71",
  920. "\xF0\x9D\x98\x99" => "\x72",
  921. "\xF0\x9D\x98\x9A" => "\x73",
  922. "\xF0\x9D\x98\x9B" => "\x74",
  923. "\xF0\x9D\x98\x9C" => "\x75",
  924. "\xF0\x9D\x98\x9D" => "\x76",
  925. "\xF0\x9D\x98\x9E" => "\x77",
  926. "\xF0\x9D\x98\x9F" => "\x78",
  927. "\xF0\x9D\x98\xA0" => "\x79",
  928. "\xF0\x9D\x98\xA1" => "\x7A",
  929. "\xF0\x9D\x98\xBC" => "\x61",
  930. "\xF0\x9D\x98\xBD" => "\x62",
  931. "\xF0\x9D\x98\xBE" => "\x63",
  932. "\xF0\x9D\x98\xBF" => "\x64",
  933. "\xF0\x9D\x99\x80" => "\x65",
  934. "\xF0\x9D\x99\x81" => "\x66",
  935. "\xF0\x9D\x99\x82" => "\x67",
  936. "\xF0\x9D\x99\x83" => "\x68",
  937. "\xF0\x9D\x99\x84" => "\x69",
  938. "\xF0\x9D\x99\x85" => "\x6A",
  939. "\xF0\x9D\x99\x86" => "\x6B",
  940. "\xF0\x9D\x99\x87" => "\x6C",
  941. "\xF0\x9D\x99\x88" => "\x6D",
  942. "\xF0\x9D\x99\x89" => "\x6E",
  943. "\xF0\x9D\x99\x8A" => "\x6F",
  944. "\xF0\x9D\x99\x8B" => "\x70",
  945. "\xF0\x9D\x99\x8C" => "\x71",
  946. "\xF0\x9D\x99\x8D" => "\x72",
  947. "\xF0\x9D\x99\x8E" => "\x73",
  948. "\xF0\x9D\x99\x8F" => "\x74",
  949. "\xF0\x9D\x99\x90" => "\x75",
  950. "\xF0\x9D\x99\x91" => "\x76",
  951. "\xF0\x9D\x99\x92" => "\x77",
  952. "\xF0\x9D\x99\x93" => "\x78",
  953. "\xF0\x9D\x99\x94" => "\x79",
  954. "\xF0\x9D\x99\x95" => "\x7A",
  955. "\xF0\x9D\x99\xB0" => "\x61",
  956. "\xF0\x9D\x99\xB1" => "\x62",
  957. "\xF0\x9D\x99\xB2" => "\x63",
  958. "\xF0\x9D\x99\xB3" => "\x64",
  959. "\xF0\x9D\x99\xB4" => "\x65",
  960. "\xF0\x9D\x99\xB5" => "\x66",
  961. "\xF0\x9D\x99\xB6" => "\x67",
  962. "\xF0\x9D\x99\xB7" => "\x68",
  963. "\xF0\x9D\x99\xB8" => "\x69",
  964. "\xF0\x9D\x99\xB9" => "\x6A",
  965. "\xF0\x9D\x99\xBA" => "\x6B",
  966. "\xF0\x9D\x99\xBB" => "\x6C",
  967. "\xF0\x9D\x99\xBC" => "\x6D",
  968. "\xF0\x9D\x99\xBD" => "\x6E",
  969. "\xF0\x9D\x99\xBE" => "\x6F",
  970. "\xF0\x9D\x99\xBF" => "\x70",
  971. "\xF0\x9D\x9A\x80" => "\x71",
  972. "\xF0\x9D\x9A\x81" => "\x72",
  973. "\xF0\x9D\x9A\x82" => "\x73",
  974. "\xF0\x9D\x9A\x83" => "\x74",
  975. "\xF0\x9D\x9A\x84" => "\x75",
  976. "\xF0\x9D\x9A\x85" => "\x76",
  977. "\xF0\x9D\x9A\x86" => "\x77",
  978. "\xF0\x9D\x9A\x87" => "\x78",
  979. "\xF0\x9D\x9A\x88" => "\x79",
  980. "\xF0\x9D\x9A\x89" => "\x7A",
  981. "\xF0\x9D\x9A\xA8" => "\xCE\xB1",
  982. "\xF0\x9D\x9A\xA9" => "\xCE\xB2",
  983. "\xF0\x9D\x9A\xAA" => "\xCE\xB3",
  984. "\xF0\x9D\x9A\xAB" => "\xCE\xB4",
  985. "\xF0\x9D\x9A\xAC" => "\xCE\xB5",
  986. "\xF0\x9D\x9A\xAD" => "\xCE\xB6",
  987. "\xF0\x9D\x9A\xAE" => "\xCE\xB7",
  988. "\xF0\x9D\x9A\xAF" => "\xCE\xB8",
  989. "\xF0\x9D\x9A\xB0" => "\xCE\xB9",
  990. "\xF0\x9D\x9A\xB1" => "\xCE\xBA",
  991. "\xF0\x9D\x9A\xB2" => "\xCE\xBB",
  992. "\xF0\x9D\x9A\xB3" => "\xCE\xBC",
  993. "\xF0\x9D\x9A\xB4" => "\xCE\xBD",
  994. "\xF0\x9D\x9A\xB5" => "\xCE\xBE",
  995. "\xF0\x9D\x9A\xB6" => "\xCE\xBF",
  996. "\xF0\x9D\x9A\xB7" => "\xCF\x80",
  997. "\xF0\x9D\x9A\xB8" => "\xCF\x81",
  998. "\xF0\x9D\x9A\xB9" => "\xCE\xB8",
  999. "\xF0\x9D\x9A\xBA" => "\xCF\x83",
  1000. "\xF0\x9D\x9A\xBB" => "\xCF\x84",
  1001. "\xF0\x9D\x9A\xBC" => "\xCF\x85",
  1002. "\xF0\x9D\x9A\xBD" => "\xCF\x86",
  1003. "\xF0\x9D\x9A\xBE" => "\xCF\x87",
  1004. "\xF0\x9D\x9A\xBF" => "\xCF\x88",
  1005. "\xF0\x9D\x9B\x80" => "\xCF\x89",
  1006. "\xF0\x9D\x9B\x93" => "\xCF\x83",
  1007. "\xF0\x9D\x9B\xA2" => "\xCE\xB1",
  1008. "\xF0\x9D\x9B\xA3" => "\xCE\xB2",
  1009. "\xF0\x9D\x9B\xA4" => "\xCE\xB3",
  1010. "\xF0\x9D\x9B\xA5" => "\xCE\xB4",
  1011. "\xF0\x9D\x9B\xA6" => "\xCE\xB5",
  1012. "\xF0\x9D\x9B\xA7" => "\xCE\xB6",
  1013. "\xF0\x9D\x9B\xA8" => "\xCE\xB7",
  1014. "\xF0\x9D\x9B\xA9" => "\xCE\xB8",
  1015. "\xF0\x9D\x9B\xAA" => "\xCE\xB9",
  1016. "\xF0\x9D\x9B\xAB" => "\xCE\xBA",
  1017. "\xF0\x9D\x9B\xAC" => "\xCE\xBB",
  1018. "\xF0\x9D\x9B\xAD" => "\xCE\xBC",
  1019. "\xF0\x9D\x9B\xAE" => "\xCE\xBD",
  1020. "\xF0\x9D\x9B\xAF" => "\xCE\xBE",
  1021. "\xF0\x9D\x9B\xB0" => "\xCE\xBF",
  1022. "\xF0\x9D\x9B\xB1" => "\xCF\x80",
  1023. "\xF0\x9D\x9B\xB2" => "\xCF\x81",
  1024. "\xF0\x9D\x9B\xB3" => "\xCE\xB8",
  1025. "\xF0\x9D\x9B\xB4" => "\xCF\x83",
  1026. "\xF0\x9D\x9B\xB5" => "\xCF\x84",
  1027. "\xF0\x9D\x9B\xB6" => "\xCF\x85",
  1028. "\xF0\x9D\x9B\xB7" => "\xCF\x86",
  1029. "\xF0\x9D\x9B\xB8" => "\xCF\x87",
  1030. "\xF0\x9D\x9B\xB9" => "\xCF\x88",
  1031. "\xF0\x9D\x9B\xBA" => "\xCF\x89",
  1032. "\xF0\x9D\x9C\x8D" => "\xCF\x83",
  1033. "\xF0\x9D\x9C\x9C" => "\xCE\xB1",
  1034. "\xF0\x9D\x9C\x9D" => "\xCE\xB2",
  1035. "\xF0\x9D\x9C\x9E" => "\xCE\xB3",
  1036. "\xF0\x9D\x9C\x9F" => "\xCE\xB4",
  1037. "\xF0\x9D\x9C\xA0" => "\xCE\xB5",
  1038. "\xF0\x9D\x9C\xA1" => "\xCE\xB6",
  1039. "\xF0\x9D\x9C\xA2" => "\xCE\xB7",
  1040. "\xF0\x9D\x9C\xA3" => "\xCE\xB8",
  1041. "\xF0\x9D\x9C\xA4" => "\xCE\xB9",
  1042. "\xF0\x9D\x9C\xA5" => "\xCE\xBA",
  1043. "\xF0\x9D\x9C\xA6" => "\xCE\xBB",
  1044. "\xF0\x9D\x9C\xA7" => "\xCE\xBC",
  1045. "\xF0\x9D\x9C\xA8" => "\xCE\xBD",
  1046. "\xF0\x9D\x9C\xA9" => "\xCE\xBE",
  1047. "\xF0\x9D\x9C\xAA" => "\xCE\xBF",
  1048. "\xF0\x9D\x9C\xAB" => "\xCF\x80",
  1049. "\xF0\x9D\x9C\xAC" => "\xCF\x81",
  1050. "\xF0\x9D\x9C\xAD" => "\xCE\xB8",
  1051. "\xF0\x9D\x9C\xAE" => "\xCF\x83",
  1052. "\xF0\x9D\x9C\xAF" => "\xCF\x84",
  1053. "\xF0\x9D\x9C\xB0" => "\xCF\x85",
  1054. "\xF0\x9D\x9C\xB1" => "\xCF\x86",
  1055. "\xF0\x9D\x9C\xB2" => "\xCF\x87",
  1056. "\xF0\x9D\x9C\xB3" => "\xCF\x88",
  1057. "\xF0\x9D\x9C\xB4" => "\xCF\x89",
  1058. "\xF0\x9D\x9D\x87" => "\xCF\x83",
  1059. "\xF0\x9D\x9D\x96" => "\xCE\xB1",
  1060. "\xF0\x9D\x9D\x97" => "\xCE\xB2",
  1061. "\xF0\x9D\x9D\x98" => "\xCE\xB3",
  1062. "\xF0\x9D\x9D\x99" => "\xCE\xB4",
  1063. "\xF0\x9D\x9D\x9A" => "\xCE\xB5",
  1064. "\xF0\x9D\x9D\x9B" => "\xCE\xB6",
  1065. "\xF0\x9D\x9D\x9C" => "\xCE\xB7",
  1066. "\xF0\x9D\x9D\x9D" => "\xCE\xB8",
  1067. "\xF0\x9D\x9D\x9E" => "\xCE\xB9",
  1068. "\xF0\x9D\x9D\x9F" => "\xCE\xBA",
  1069. "\xF0\x9D\x9D\xA0" => "\xCE\xBB",
  1070. "\xF0\x9D\x9D\xA1" => "\xCE\xBC",
  1071. "\xF0\x9D\x9D\xA2" => "\xCE\xBD",
  1072. "\xF0\x9D\x9D\xA3" => "\xCE\xBE",
  1073. "\xF0\x9D\x9D\xA4" => "\xCE\xBF",
  1074. "\xF0\x9D\x9D\xA5" => "\xCF\x80",
  1075. "\xF0\x9D\x9D\xA6" => "\xCF\x81",
  1076. "\xF0\x9D\x9D\xA7" => "\xCE\xB8",
  1077. "\xF0\x9D\x9D\xA8" => "\xCF\x83",
  1078. "\xF0\x9D\x9D\xA9" => "\xCF\x84",
  1079. "\xF0\x9D\x9D\xAA" => "\xCF\x85",
  1080. "\xF0\x9D\x9D\xAB" => "\xCF\x86",
  1081. "\xF0\x9D\x9D\xAC" => "\xCF\x87",
  1082. "\xF0\x9D\x9D\xAD" => "\xCF\x88",
  1083. "\xF0\x9D\x9D\xAE" => "\xCF\x89",
  1084. "\xF0\x9D\x9E\x81" => "\xCF\x83",
  1085. "\xF0\x9D\x9E\x90" => "\xCE\xB1",
  1086. "\xF0\x9D\x9E\x91" => "\xCE\xB2",
  1087. "\xF0\x9D\x9E\x92" => "\xCE\xB3",
  1088. "\xF0\x9D\x9E\x93" => "\xCE\xB4",
  1089. "\xF0\x9D\x9E\x94" => "\xCE\xB5",
  1090. "\xF0\x9D\x9E\x95" => "\xCE\xB6",
  1091. "\xF0\x9D\x9E\x96" => "\xCE\xB7",
  1092. "\xF0\x9D\x9E\x97" => "\xCE\xB8",
  1093. "\xF0\x9D\x9E\x98" => "\xCE\xB9",
  1094. "\xF0\x9D\x9E\x99" => "\xCE\xBA",
  1095. "\xF0\x9D\x9E\x9A" => "\xCE\xBB",
  1096. "\xF0\x9D\x9E\x9B" => "\xCE\xBC",
  1097. "\xF0\x9D\x9E\x9C" => "\xCE\xBD",
  1098. "\xF0\x9D\x9E\x9D" => "\xCE\xBE",
  1099. "\xF0\x9D\x9E\x9E" => "\xCE\xBF",
  1100. "\xF0\x9D\x9E\x9F" => "\xCF\x80",
  1101. "\xF0\x9D\x9E\xA0" => "\xCF\x81",
  1102. "\xF0\x9D\x9E\xA1" => "\xCE\xB8",
  1103. "\xF0\x9D\x9E\xA2" => "\xCF\x83",
  1104. "\xF0\x9D\x9E\xA3" => "\xCF\x84",
  1105. "\xF0\x9D\x9E\xA4" => "\xCF\x85",
  1106. "\xF0\x9D\x9E\xA5" => "\xCF\x86",
  1107. "\xF0\x9D\x9E\xA6" => "\xCF\x87",
  1108. "\xF0\x9D\x9E\xA7" => "\xCF\x88",
  1109. "\xF0\x9D\x9E\xA8" => "\xCF\x89",
  1110. "\xF0\x9D\x9E\xBB" => "\xCF\x83",
  1111. "\xF0\x9D\x9F\x8A" => "\xCF\x9D",
  1112. );
  1113. // do the case fold
  1114. $text = utf8_case_fold($text, $option);
  1115. // convert to NFKC
  1116. Normalizer::normalize($text, Normalizer::NFKC);
  1117. // FC_NFKC_Closure, http://www.unicode.org/Public/5.0.0/ucd/DerivedNormalizationProps.txt
  1118. $text = strtr($text, $fc_nfkc_closure);
  1119. return $text;
  1120. }
  /**
  * Case fold a string that is assumed to already be in NFC.
  *
  * Before folding, every precomposed code point listed in the table below is
  * replaced by its base sequence followed by an explicit U+0345
  * ("\xCD\x85"). This is a small trick to avoid further normalization on
  * composed points that contain U+0345 in their decomposition.
  *
  * @param string $text text to be case folded (assumed to be NFC)
  * @param string $option determines how we will fold the cases; handed through
  *                       unchanged to utf8_case_fold()
  * @return string case folded text
  */
  function utf8_case_fold_nfc($text, $option = 'full')
  {
      // Precomposed character => base sequence + U+0345
      static $ypogegrammeni = array(
          "\xCD\xBA" => "\x20\xCD\x85",
          "\xE1\xBE\x80" => "\xE1\xBC\x80\xCD\x85",
          "\xE1\xBE\x81" => "\xE1\xBC\x81\xCD\x85",
          "\xE1\xBE\x82" => "\xE1\xBC\x82\xCD\x85",
          "\xE1\xBE\x83" => "\xE1\xBC\x83\xCD\x85",
          "\xE1\xBE\x84" => "\xE1\xBC\x84\xCD\x85",
          "\xE1\xBE\x85" => "\xE1\xBC\x85\xCD\x85",
          "\xE1\xBE\x86" => "\xE1\xBC\x86\xCD\x85",
          "\xE1\xBE\x87" => "\xE1\xBC\x87\xCD\x85",
          "\xE1\xBE\x88" => "\xE1\xBC\x88\xCD\x85",
          "\xE1\xBE\x89" => "\xE1\xBC\x89\xCD\x85",
          "\xE1\xBE\x8A" => "\xE1\xBC\x8A\xCD\x85",
          "\xE1\xBE\x8B" => "\xE1\xBC\x8B\xCD\x85",
          "\xE1\xBE\x8C" => "\xE1\xBC\x8C\xCD\x85",
          "\xE1\xBE\x8D" => "\xE1\xBC\x8D\xCD\x85",
          "\xE1\xBE\x8E" => "\xE1\xBC\x8E\xCD\x85",
          "\xE1\xBE\x8F" => "\xE1\xBC\x8F\xCD\x85",
          "\xE1\xBE\x90" => "\xE1\xBC\xA0\xCD\x85",
          "\xE1\xBE\x91" => "\xE1\xBC\xA1\xCD\x85",
          "\xE1\xBE\x92" => "\xE1\xBC\xA2\xCD\x85",
          "\xE1\xBE\x93" => "\xE1\xBC\xA3\xCD\x85",
          "\xE1\xBE\x94" => "\xE1\xBC\xA4\xCD\x85",
          "\xE1\xBE\x95" => "\xE1\xBC\xA5\xCD\x85",
          "\xE1\xBE\x96" => "\xE1\xBC\xA6\xCD\x85",
          "\xE1\xBE\x97" => "\xE1\xBC\xA7\xCD\x85",
          "\xE1\xBE\x98" => "\xE1\xBC\xA8\xCD\x85",
          "\xE1\xBE\x99" => "\xE1\xBC\xA9\xCD\x85",
          "\xE1\xBE\x9A" => "\xE1\xBC\xAA\xCD\x85",
          "\xE1\xBE\x9B" => "\xE1\xBC\xAB\xCD\x85",
          "\xE1\xBE\x9C" => "\xE1\xBC\xAC\xCD\x85",
          "\xE1\xBE\x9D" => "\xE1\xBC\xAD\xCD\x85",
          "\xE1\xBE\x9E" => "\xE1\xBC\xAE\xCD\x85",
          "\xE1\xBE\x9F" => "\xE1\xBC\xAF\xCD\x85",
          "\xE1\xBE\xA0" => "\xE1\xBD\xA0\xCD\x85",
          "\xE1\xBE\xA1" => "\xE1\xBD\xA1\xCD\x85",
          "\xE1\xBE\xA2" => "\xE1\xBD\xA2\xCD\x85",
          "\xE1\xBE\xA3" => "\xE1\xBD\xA3\xCD\x85",
          "\xE1\xBE\xA4" => "\xE1\xBD\xA4\xCD\x85",
          "\xE1\xBE\xA5" => "\xE1\xBD\xA5\xCD\x85",
          "\xE1\xBE\xA6" => "\xE1\xBD\xA6\xCD\x85",
          "\xE1\xBE\xA7" => "\xE1\xBD\xA7\xCD\x85",
          "\xE1\xBE\xA8" => "\xE1\xBD\xA8\xCD\x85",
          "\xE1\xBE\xA9" => "\xE1\xBD\xA9\xCD\x85",
          "\xE1\xBE\xAA" => "\xE1\xBD\xAA\xCD\x85",
          "\xE1\xBE\xAB" => "\xE1\xBD\xAB\xCD\x85",
          "\xE1\xBE\xAC" => "\xE1\xBD\xAC\xCD\x85",
          "\xE1\xBE\xAD" => "\xE1\xBD\xAD\xCD\x85",
          "\xE1\xBE\xAE" => "\xE1\xBD\xAE\xCD\x85",
          "\xE1\xBE\xAF" => "\xE1\xBD\xAF\xCD\x85",
          "\xE1\xBE\xB2" => "\xE1\xBD\xB0\xCD\x85",
          "\xE1\xBE\xB3" => "\xCE\xB1\xCD\x85",
          "\xE1\xBE\xB4" => "\xCE\xAC\xCD\x85",
          "\xE1\xBE\xB7" => "\xE1\xBE\xB6\xCD\x85",
          "\xE1\xBE\xBC" => "\xCE\x91\xCD\x85",
          "\xE1\xBF\x82" => "\xE1\xBD\xB4\xCD\x85",
          "\xE1\xBF\x83" => "\xCE\xB7\xCD\x85",
          "\xE1\xBF\x84" => "\xCE\xAE\xCD\x85",
          "\xE1\xBF\x87" => "\xE1\xBF\x86\xCD\x85",
          "\xE1\xBF\x8C" => "\xCE\x97\xCD\x85",
          "\xE1\xBF\xB2" => "\xE1\xBD\xBC\xCD\x85",
          "\xE1\xBF\xB3" => "\xCF\x89\xCD\x85",
          "\xE1\xBF\xB4" => "\xCF\x8E\xCD\x85",
          "\xE1\xBF\xB7" => "\xE1\xBF\xB6\xCD\x85",
          "\xE1\xBF\xBC" => "\xCE\xA9\xCD\x85",
      );

      // Expand the listed composed characters first, so that the case fold
      // below sees the explicit U+0345 form.
      $text = strtr($text, $ypogegrammeni);

      // do the case fold
      return utf8_case_fold($text, $option);
  }
  /**
  * Wrapper around PHP's native normalizer from intl
  * (previously a PECL extension, included in the core since PHP 5.3.0)
  * http://php.net/manual/en/normalizer.normalize.php
  *
  * Strings are normalized with Normalizer::normalize() using its default
  * form. Arrays are walked recursively to any depth, so every nested value
  * is handled individually and all array keys are preserved.
  *
  * Values that are empty() are returned untouched, and values that
  * Normalizer::isNormalized() already accepts are returned as given (their
  * type is not changed). Anything else is replaced by the string cast of
  * the Normalizer::normalize() result.
  *
  * @param mixed $strings a string or a (possibly nested) array of strings to normalize
  * @return mixed the normalized content, preserving array keys if array given.
  */
  function utf8_normalize_nfc($strings)
  {
      if (empty($strings))
      {
          return $strings;
      }

      if (is_array($strings))
      {
          // Recurse instead of unrolling a fixed number of levels: a nested
          // array must never reach the Normalizer, which only accepts strings.
          foreach ($strings as $key => $value)
          {
              $strings[$key] = utf8_normalize_nfc($value);
          }

          return $strings;
      }

      // Skip the (more expensive) normalization when nothing would change
      if (Normalizer::isNormalized($strings))
      {
          return $strings;
      }

      return (string) Normalizer::normalize($strings);
  }
  /**
  * Generate the "clean" version of a string.
  *
  * The text is case folded and normalized through utf8_case_fold_nfkc(), then
  * homographs are mapped through the confusables table so that look-alike
  * characters collapse into one specific character. Finally control
  * characters are removed, runs of spaces are reduced to a single space and
  * the result is trimmed.
  *
  * Please be aware that if you change something within this function or within
  * functions used here you need to rebuild/update the username_clean column in
  * the users table, and all other columns that store a clean string; otherwise
  * you will break this functionality.
  *
  * @param string $text An unclean string, maybe user input (has to be valid UTF-8!)
  * @return string Cleaned up version of the input string
  */
  function utf8_clean_string($text)
  {
      global $phpbb_root_path, $phpEx;

      // Confusables table, loaded on first use and kept for later calls
      static $homographs = array();

      if (!$homographs)
      {
          $homographs = include($phpbb_root_path . 'includes/utf/data/confusables.' . $phpEx);
      }

      $folded = utf8_case_fold_nfkc($text);
      $mapped = strtr($folded, $homographs);

      // Other control characters
      $without_controls = preg_replace('#(?:[\x00-\x1F\x7F]+|(?:\xC2[\x80-\x9F])+)#', '', $mapped);

      // Multiple spaces are reduced to a single one
      $single_spaced = preg_replace('# {2,}#', ' ', $without_controls);

      // trim() is sufficient here as all the other space characters should
      // have been turned into normal ASCII spaces by now
      return trim($single_spaced);
  }
  /**
  * Escape HTML special characters in a UTF-8 string.
  *
  * Shorthand for htmlspecialchars($value, ENT_COMPAT, 'UTF-8').
  *
  * @param string $value text to escape
  * @return string the escaped text
  */
  function utf8_htmlspecialchars($value)
  {
      $escaped = htmlspecialchars($value, ENT_COMPAT, 'UTF-8');

      return $escaped;
  }
  /**
  * Try to convert a system message to UTF-8 and escape it for HTML output.
  *
  * PHP assumes such messages are ISO-8859-1 so we'll do that too
  * and if it breaks messages we'll blame it on them ;-)
  *
  * @param string $message the message as returned by the system
  * @return string the message passed through utf8_htmlspecialchars(), recoded
  *                from ISO-8859-1 first when it contains bytes above 0x7F
  */
  function utf8_convert_message($message)
  {
      // Pure ASCII is identical in ISO-8859-1 and UTF-8, so recoding is only
      // needed when at least one byte in the range 0x80-0xFF is present.
      if (preg_match('/[\x80-\xFF]/', $message))
      {
          $message = utf8_recode($message, 'ISO-8859-1');
      }

      return utf8_htmlspecialchars($message);
  }
  /**
  * UTF8-compatible wordwrap replacement
  *
  * Existing occurrences of $break in the input are kept; each of the
  * resulting lines is then wrapped on ASCII spaces so that no output line is
  * longer than $width characters (counted with utf8_strlen()), unless it
  * consists of a single word that is longer than $width and $cut is false.
  *
  * @param string $string The input string
  * @param int $width The column width. Defaults to 75.
  * @param string $break The line is broken using the optional break parameter. Defaults to '\n'.
  * @param bool $cut If the cut is set to TRUE, the string is always wrapped at the specified width. So if you have a word that is larger than the given width, it is broken apart. Ignored when $width is smaller than 1, because a word cannot be cut into pieces of less than one character.
  *
  * @return string the given string wrapped at the specified column.
  *
  */
  function utf8_wordwrap($string, $width = 75, $break = "\n", $cut = false)
  {
      // Cutting with a width below 1 could never make progress (the piece
      // cut off would be empty and the same word re-queued forever), so fall
      // back to not cutting at all.
      if ($width < 1)
      {
          $cut = false;
      }

      // We first need to explode on $break, not destroying existing (intended) breaks
      $lines = explode($break, $string);
      $new_lines = array(0 => '');
      $index = 0;

      foreach ($lines as $line)
      {
          $words = explode(' ', $line);

          for ($i = 0, $size = count($words); $i < $size; $i++)
          {
              $word = $words[$i];

              // If cut is true we need to cut the word if it is > width chars.
              // The remainder is stored back in the same slot and $i is
              // decremented so the next iteration processes it.
              if ($cut && utf8_strlen($word) > $width)
              {
                  $words[$i] = utf8_substr($word, $width);
                  $word = utf8_substr($word, 0, $width);
                  $i--;
              }

              // Start a new output line when the word does not fit anymore.
              // Nothing is gained by breaking while the current line is still
              // empty: that would only emit a blank line in front of an
              // over-long word.
              if ($new_lines[$index] !== '' && utf8_strlen($new_lines[$index] . $word) > $width)
              {
                  // Drop the trailing separator space (always a single ASCII byte)
                  $new_lines[$index] = substr($new_lines[$index], 0, -1);
                  $index++;
                  $new_lines[$index] = '';
              }

              $new_lines[$index] .= $word . ' ';
          }

          // End of an input line: drop the trailing separator space and open
          // a fresh output line for the next input line.
          $new_lines[$index] = substr($new_lines[$index], 0, -1);
          $index++;
          $new_lines[$index] = '';
      }

      // The line opened after the last input line is never used
      unset($new_lines[$index]);

      return implode($break, $new_lines);
  }
  /**
  * UTF8-safe basename() function
  *
  * basename() has some limitations and is dependent on the locale setting
  * according to the PHP manual. Therefore we provide our own locale independent
  * basename function.
  *
  * Everything up to and including the last forward slash is removed first;
  * the same is then done for the last backslash in what remains.
  *
  * @param string $filename The filename basename() should be applied to
  * @return string The basenamed filename
  */
  function utf8_basename($filename)
  {
      // Forward slash AND backslash are both always checked, because they
      // could be mixed or "sneaked" in. Never trust user input...
      foreach (array('/', '\\') as $separator)
      {
          if (strpos($filename, $separator) === false)
          {
              continue;
          }

          $last_separator = utf8_strrpos($filename, $separator);
          $filename = utf8_substr($filename, $last_separator + 1);
      }

      return $filename;
  }