PageRenderTime 45ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/common/libraries/plugin/wiki/mediawiki/normal/UtfNormal.php

https://bitbucket.org/renaatdemuynck/chamilo
PHP | 844 lines | 555 code | 51 blank | 238 comment | 89 complexity | 6276573906ef53452e770e7d48635dd6 MD5 | raw file
Possible License(s): BSD-3-Clause, LGPL-2.1, LGPL-3.0, GPL-3.0, MIT, GPL-2.0
  1. <?php
  2. # Copyright (C) 2004 Brion Vibber <brion@pobox.com>
  3. # http://www.mediawiki.org/
  4. #
  5. # This program is free software; you can redistribute it and/or modify
  6. # it under the terms of the GNU General Public License as published by
  7. # the Free Software Foundation; either version 2 of the License, or
  8. # (at your option) any later version.
  9. #
  10. # This program is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License along
  16. # with this program; if not, write to the Free Software Foundation, Inc.,
  17. # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18. # http://www.gnu.org/copyleft/gpl.html
  19. /**
  20. * @defgroup UtfNormal UtfNormal
  21. */
  22. /** */
  require_once dirname(__FILE__) . '/UtfNormalUtil.php';

  # Normalization lookup tables. They start out as null; UtfNormal::loadData()
  # and UtfNormal::NFKD() include the data files the first time they find the
  # corresponding table unset.
  global $utfCombiningClass, $utfCanonicalComp, $utfCanonicalDecomp, $utfCompatibilityDecomp;
  $utfCombiningClass = $utfCanonicalComp = $utfCanonicalDecomp = null;
  # The compatibility decompositions are only loaded on demand.
  $utfCompatibilityDecomp = null;

  /**
   * Normalization mode identifiers handed to utf8_normalize() (the ICU wrapper).
   */
  define('UNORM_NONE', 1);
  define('UNORM_NFD', 2);
  define('UNORM_NFKD', 3);
  define('UNORM_NFC', 4);
  define('UNORM_DEFAULT', UNORM_NFC);
  define('UNORM_NFKC', 5);
  define('UNORM_FCD', 6);

  # True when a utf8_normalize() function exists; the public entry points of
  # UtfNormal then delegate to it instead of the pure-PHP implementation.
  define('NORMALIZE_ICU', function_exists('utf8_normalize'));
  42. /**
  43. * Unicode normalization routines for working with UTF-8 strings.
  44. * Currently assumes that input strings are valid UTF-8!
  45. *
  46. * Not as fast as I'd like, but should be usable for most purposes.
  47. * UtfNormal::toNFC() will bail early if given ASCII text or text
  48. * it can quickly determine is already normalized.
  49. *
  50. * All functions can be called static.
  51. *
  52. * See description of forms at http://www.unicode.org/reports/tr15/
  53. *
  54. * @ingroup UtfNormal
  55. */
  56. class UtfNormal
  57. {
  /**
   * Replace invalid or unwanted byte sequences in a UTF-8 string and return
   * the result in normal form C (canonical composition).
   *
   * With the ICU wrapper available, control characters other than tab, LF
   * and CR as well as U+FFFE / U+FFFF are swapped for UTF8_REPLACEMENT before
   * the string is handed to utf8_normalize(). Without it, quickIsNFCVerify()
   * cleans the string in place and NFC() is only run when that check could
   * not confirm the string is already normalized.
   *
   * @param $string String: a UTF-8 string, possibly containing invalid sequences
   * @return string the cleaned-up string in normal form C
   */
  public static function cleanUp($string)
  {
      if (! NORMALIZE_ICU) {
          # quickIsNFCVerify() takes $string by reference and repairs
          # UTF-8 errors in it as a side effect.
          $alreadyNormal = self::quickIsNFCVerify($string);
          return $alreadyNormal ? $string : self::NFC($string);
      }

      # We exclude a few chars that ICU would not.
      $string = preg_replace('/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string);
      $string = str_replace(array(UTF8_FFFE, UTF8_FFFF), UTF8_REPLACEMENT, $string);

      # UnicodeString constructor fails if the string ends with a head
      # byte, so append a junk char and strip it off again afterwards.
      $normalized = utf8_normalize($string . "\x01", UNORM_NFC);
      return rtrim($normalized, "\x01");
  }
  /**
   * Bring a UTF-8 string to normal form C (canonical composition).
   *
   * Delegates to utf8_normalize() when the ICU wrapper is present. Otherwise
   * the string is returned untouched if quickIsNFC() can vouch for it, and
   * run through the full NFC() routine if not.
   *
   * @param $string String: a valid UTF-8 string. Input is not validated.
   * @return string a UTF-8 string in normal form C
   */
  public static function toNFC($string)
  {
      if (NORMALIZE_ICU) {
          return utf8_normalize($string, UNORM_NFC);
      }
      $alreadyNormal = self::quickIsNFC($string);
      return $alreadyNormal ? $string : self::NFC($string);
  }
  /**
   * Bring a UTF-8 string to normal form D (canonical decomposition).
   *
   * Delegates to utf8_normalize() when the ICU wrapper is present. Otherwise
   * strings without any byte >= 0x80 are returned as-is and everything else
   * goes through NFD().
   *
   * @param $string String: a valid UTF-8 string. Input is not validated.
   * @return string a UTF-8 string in normal form D
   */
  public static function toNFD($string)
  {
      if (NORMALIZE_ICU) {
          return utf8_normalize($string, UNORM_NFD);
      }
      $hasNonAscii = preg_match('/[\x80-\xff]/', $string);
      return $hasNonAscii ? self::NFD($string) : $string;
  }
  /**
   * Bring a UTF-8 string to normal form KC (compatibility composition).
   * Compatibility mappings may lose information irreversibly; use judiciously.
   *
   * Delegates to utf8_normalize() when the ICU wrapper is present. Otherwise
   * strings without any byte >= 0x80 are returned as-is and everything else
   * goes through NFKC().
   *
   * @param $string String: a valid UTF-8 string. Input is not validated.
   * @return string a UTF-8 string in normal form KC
   */
  public static function toNFKC($string)
  {
      if (NORMALIZE_ICU) {
          return utf8_normalize($string, UNORM_NFKC);
      }
      $hasNonAscii = preg_match('/[\x80-\xff]/', $string);
      return $hasNonAscii ? self::NFKC($string) : $string;
  }
  /**
   * Bring a UTF-8 string to normal form KD (compatibility decomposition).
   * Compatibility mappings may lose information irreversibly; use judiciously.
   *
   * Delegates to utf8_normalize() when the ICU wrapper is present. Otherwise
   * strings without any byte >= 0x80 are returned as-is and everything else
   * goes through NFKD().
   *
   * @param $string String: a valid UTF-8 string. Input is not validated.
   * @return string a UTF-8 string in normal form KD
   */
  public static function toNFKD($string)
  {
      if (NORMALIZE_ICU) {
          return utf8_normalize($string, UNORM_NFKD);
      }
      $hasNonAscii = preg_match('/[\x80-\xff]/', $string);
      return $hasNonAscii ? self::NFKD($string) : $string;
  }
  /**
   * Include the basic composition data file unless the global
   * $utfCombiningClass table is already set.
   * @private
   */
  public static function loadData()
  {
      global $utfCombiningClass;
      if (isset($utfCombiningClass)) {
          # Tables are already in place; nothing to do.
          return;
      }
      require_once (dirname(__FILE__) . '/UtfNormalData.inc');
  }
  /**
   * Cheap test for whether a string is already in normal form C.
   *
   * Returns true only if the string is _definitely_ in NFC: either it has
   * no byte >= 0x80, or none of its characters appears in the $utfCheckNFC
   * or $utfCombiningClass tables. Returns false if not or uncertain.
   *
   * @param $string String: a valid UTF-8 string. Input is not validated.
   * @return bool
   */
  public static function quickIsNFC($string)
  {
      # ASCII is always valid NFC; skip loading the tables for it.
      if (! preg_match('/[\x80-\xff]/', $string)) {
          return true;
      }

      self::loadData();
      global $utfCheckNFC, $utfCombiningClass;

      $len = strlen($string);
      for ($i = 0; $i < $len; $i ++) {
          # Bracket offsets: the {} offset syntax no longer parses on PHP 8.
          $c = $string[$i];
          $n = ord($c);
          if ($n < 0x80) {
              continue;
          } elseif ($n >= 0xf0) {
              # Head byte of a four-byte sequence.
              $c = substr($string, $i, 4);
              $i += 3;
          } elseif ($n >= 0xe0) {
              # Head byte of a three-byte sequence.
              $c = substr($string, $i, 3);
              $i += 2;
          } elseif ($n >= 0xc0) {
              # Head byte of a two-byte sequence.
              $c = substr($string, $i, 2);
              $i ++;
          }

          # A NO/MAYBE quick-check entry or a combining character (which
          # might need reordering) means the slow path has to decide.
          if (isset($utfCheckNFC[$c]) || isset($utfCombiningClass[$c])) {
              return false;
          }
      }
      return true;
  }
  /**
   * Validate a UTF-8 string in place and report whether it is already in
   * normal form C.
   *
   * Control characters other than tab, LF and CR, malformed or truncated
   * sequences, overlong encodings, surrogates, U+FFFE / U+FFFF and anything
   * above UTF8_MAX are replaced with UTF8_REPLACEMENT in $string.
   *
   * Returns true if the string is _definitely_ in NFC.
   * Returns false if not or uncertain.
   *
   * @param $string String: a UTF-8 string, altered on output to be valid UTF-8 safe for XML.
   * @return bool
   */
  public static function quickIsNFCVerify(&$string)
  {
      # Screen out some characters that eg won't be allowed in XML
      $string = preg_replace('/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string);

      # ASCII is always valid NFC!
      # If we're only ever given plain ASCII, we can avoid the overhead
      # of initializing the decomposition tables by skipping out early.
      if (! preg_match('/[\x80-\xff]/', $string)) {
          return true;
      }

      # Lookup tables are built once and kept in function statics.
      static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
      if (! isset($checkit)) {
          self::loadData();
          global $utfCheckNFC, $utfCombiningClass;
          $utfCheckOrCombining = array_merge($utfCheckNFC, $utfCombiningClass);

          # Head bytes for sequences which we should do further validity checks
          $checkit = array_flip(array_map('chr', array(0xc0, 0xc1, 0xe0, 0xed, 0xef, 0xf0, 0xf1, 0xf2,
              0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff)));

          # Each UTF-8 head byte is followed by a certain
          # number of tail bytes.
          $tailBytes = array();
          for ($n = 0; $n < 256; $n ++) {
              if ($n < 0xc0) {
                  $remaining = 0;
              } elseif ($n < 0xe0) {
                  $remaining = 1;
              } elseif ($n < 0xf0) {
                  $remaining = 2;
              } elseif ($n < 0xf8) {
                  $remaining = 3;
              } elseif ($n < 0xfc) {
                  $remaining = 4;
              } elseif ($n < 0xfe) {
                  $remaining = 5;
              } else {
                  $remaining = 0;
              }
              $tailBytes[chr($n)] = $remaining;
          }
      }

      # Chop the text into pure-ASCII and non-ASCII areas;
      # large ASCII parts can be handled much more quickly.
      # Don't chop up Unicode areas for punctuation, though,
      # that wastes energy.
      $matches = array();
      preg_match_all('/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/', $string, $matches);

      $looksNormal = true;
      $base = 0; # Byte offset of the current chunk within $string
      $replace = array(); # List of array(replacement, start, length) fix-ups
      foreach ($matches[1] as $str) {
          $chunk = strlen($str);
          # Bracket offsets throughout: the {} offset syntax no longer
          # parses on PHP 8.
          if ($str[0] < "\x80") {
              # ASCII chunk: guaranteed to be valid UTF-8
              # and in normal form C, so skip over it.
              $base += $chunk;
              continue;
          }

          # We'll have to examine the chunk byte by byte to ensure
          # that it consists of valid UTF-8 sequences, and to see
          # if any of them might not be normalized.
          #
          # Since PHP is not the fastest language on earth, some of
          # this code is a little ugly with inner loop optimizations.
          $head = '';
          $len = $chunk + 1; # Counting down is faster. I'm *so* sorry.
          for ($i = - 1; -- $len;) {
              if ($remaining = $tailBytes[$c = $str[++ $i]]) {
                  # UTF-8 head byte!
                  $sequence = $head = $c;
                  do {
                      # Look for the defined number of tail bytes...
                      if (-- $len && ($c = $str[++ $i]) >= "\x80" && $c < "\xc0") {
                          # Legal tail bytes are nice.
                          $sequence .= $c;
                      } else {
                          if (0 == $len) {
                              # Premature end of string!
                              # Drop a replacement character into output to
                              # represent the invalid UTF-8 sequence.
                              $replace[] = array(
                                  UTF8_REPLACEMENT, $base + $i + 1 - strlen($sequence), strlen($sequence));
                              break 2;
                          } else {
                              # Illegal tail byte; abandon the sequence.
                              $replace[] = array(UTF8_REPLACEMENT,
                                  $base + $i - strlen($sequence), strlen($sequence));
                              # Back up and reprocess this byte; it may itself
                              # be a legal ASCII or UTF-8 sequence head.
                              -- $i;
                              ++ $len;
                              continue 2;
                          }
                      }
                  } while (-- $remaining);

                  if (isset($checkit[$head])) {
                      # Do some more detailed validity checks, for
                      # invalid characters and illegal sequences.
                      if ($head == "\xed") {
                          # 0xed is relatively frequent in Korean, which
                          # abuts the surrogate area, so we're doing
                          # this check separately to speed things up.
                          if ($sequence >= UTF8_SURROGATE_FIRST) {
                              # Surrogates are legal only in UTF-16 code.
                              # They are totally forbidden here in UTF-8
                              # utopia.
                              $replace[] = array(
                                  UTF8_REPLACEMENT, $base + $i + 1 - strlen($sequence), strlen($sequence));
                              $head = '';
                              continue;
                          }
                      } else {
                          # Slower, but rarer checks...
                          $n = ord($head);
                          if (# "Overlong sequences" are those that are syntactically
                              # correct but use more UTF-8 bytes than are necessary to
                              # encode a character. Naive string comparisons can be
                              # tricked into failing to see a match for an ASCII
                              # character, for instance, which can be a security hole
                              # if blacklist checks are being used.
                              ($n < 0xc2 && $sequence <= UTF8_OVERLONG_A) ||
                              ($n == 0xe0 && $sequence <= UTF8_OVERLONG_B) ||
                              ($n == 0xf0 && $sequence <= UTF8_OVERLONG_C) ||
                              # U+FFFE and U+FFFF are explicitly forbidden in Unicode.
                              # Grouped so both comparisons sit under the 0xef
                              # head-byte test, as the condition is meant to read.
                              ($n == 0xef && ($sequence == UTF8_FFFE || $sequence == UTF8_FFFF)) ||
                              # Unicode has been limited to 21 bits; longer
                              # sequences are not allowed.
                              ($n >= 0xf0 && $sequence > UTF8_MAX)) {
                              $replace[] = array(UTF8_REPLACEMENT, $base + $i + 1 - strlen($sequence),
                                  strlen($sequence));
                              $head = '';
                              continue;
                          }
                      }
                  }

                  if (isset($utfCheckOrCombining[$sequence])) {
                      # If it's NO or MAYBE, we'll have to rip
                      # the string apart and put it back together.
                      # That's going to be mighty slow.
                      $looksNormal = false;
                  }

                  # The sequence is legal!
                  $head = '';
              } elseif ($c < "\x80") {
                  # ASCII byte.
                  $head = '';
              } elseif ($c < "\xc0") {
                  # Illegal tail bytes
                  if ($head == '') {
                      # Out of the blue!
                      $replace[] = array(UTF8_REPLACEMENT, $base + $i, 1);
                  } else {
                      # Don't add if we're continuing a broken sequence;
                      # we already put a replacement character when we looked
                      # at the broken sequence.
                      $replace[] = array('', $base + $i, 1);
                  }
              } else {
                  # Miscellaneous freaks.
                  $replace[] = array(UTF8_REPLACEMENT, $base + $i, 1);
                  $head = '';
              }
          }
          $base += $chunk;
      }

      if (count($replace)) {
          # There were illegal UTF-8 sequences we need to fix up.
          # Rebuild the string from the untouched stretches between the
          # recorded fix-ups plus the replacement text of each fix-up.
          $out = '';
          $last = 0;
          foreach ($replace as $rep) {
              list($replacement, $start, $length) = $rep;
              if ($last < $start) {
                  $out .= substr($string, $last, $start - $last);
              }
              $out .= $replacement;
              $last = $start + $length;
          }
          if ($last < strlen($string)) {
              $out .= substr($string, $last);
          }
          $string = $out;
      }
      return $looksNormal;
  }
  454. # These take a string and run the normalization on them, without
  455. # checking for validity or any optimization etc. Input must be
  456. # VALID UTF-8!
  /**
   * Canonical composition: decompose with NFD(), then recompose with
   * fastCompose(). No validation or shortcuts are applied.
   *
   * @param $string string
   * @return string
   * @private
   */
  public static function NFC($string)
  {
      $decomposed = self::NFD($string);
      return self::fastCompose($decomposed);
  }
  /**
   * Canonical decomposition: expand the string through the
   * $utfCanonicalDecomp map, then sort its combining characters.
   *
   * @param $string string
   * @return string
   * @private
   */
  public static function NFD($string)
  {
      self::loadData();
      global $utfCanonicalDecomp;
      $decomposed = self::fastDecompose($string, $utfCanonicalDecomp);
      return self::fastCombiningSort($decomposed);
  }
  /**
   * Compatibility composition: decompose with NFKD(), then recompose with
   * fastCompose(). No validation or shortcuts are applied.
   *
   * @param $string string
   * @return string
   * @private
   */
  public static function NFKC($string)
  {
      $decomposed = self::NFKD($string);
      return self::fastCompose($decomposed);
  }
  /**
   * Compatibility decomposition: expand the string through the
   * $utfCompatibilityDecomp map, then sort its combining characters.
   *
   * The compatibility data file is included the first time the global
   * $utfCompatibilityDecomp table is found unset.
   *
   * @param $string string
   * @return string
   * @private
   */
  public static function NFKD($string)
  {
      global $utfCompatibilityDecomp;
      if (! isset($utfCompatibilityDecomp)) {
          # Anchor the path to this file's directory, as loadData() does,
          # so the lookup does not depend on include_path or the cwd.
          require_once (dirname(__FILE__) . '/UtfNormalDataK.inc');
      }
      $decomposed = self::fastDecompose($string, $utfCompatibilityDecomp);
      return self::fastCombiningSort($decomposed);
  }
  /**
   * Perform decomposition of a UTF-8 string into either D or KD form
   * (depending on which decomposition map is passed to us).
   * Input is assumed to be *valid* UTF-8. Invalid code will break.
   *
   * Characters found in $map are replaced by their mapped value; characters
   * between UTF8_HANGUL_FIRST and UTF8_HANGUL_LAST are split into jamo
   * arithmetically; everything else is copied through unchanged.
   *
   * @private
   * @param $string String: valid UTF-8 string
   * @param $map Array: hash of expanded decomposition map
   * @return string a UTF-8 string decomposed, not yet normalized (needs sorting)
   */
  public static function fastDecompose($string, $map)
  {
      self::loadData();
      $len = strlen($string);
      $out = '';
      for ($i = 0; $i < $len; $i ++) {
          # Bracket offsets: the {} offset syntax no longer parses on PHP 8.
          $c = $string[$i];
          $n = ord($c);
          if ($n < 0x80) {
              # ASCII chars never decompose
              $out .= $c;
              continue;
          } elseif ($n >= 0xf0) {
              $c = substr($string, $i, 4);
              $i += 3;
          } elseif ($n >= 0xe0) {
              $c = substr($string, $i, 3);
              $i += 2;
          } elseif ($n >= 0xc0) {
              $c = substr($string, $i, 2);
              $i ++;
          }

          if (isset($map[$c])) {
              $out .= $map[$c];
              continue;
          }

          if ($c >= UTF8_HANGUL_FIRST && $c <= UTF8_HANGUL_LAST) {
              # Decompose a hangul syllable into jamo;
              # hardcoded for three-byte UTF-8 sequence.
              # A lookup table would be slightly faster,
              # but adds a lot of memory & disk needs.
              $codepoint = ((ord($c[0]) & 0x0f) << 12) | ((ord($c[1]) & 0x3f) << 6) | (ord($c[2]) & 0x3f);
              $index = $codepoint - UNICODE_HANGUL_FIRST;
              $l = intval($index / UNICODE_HANGUL_NCOUNT);
              $v = intval(($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT);
              $t = $index % UNICODE_HANGUL_TCOUNT;
              $out .= "\xe1\x84" . chr(0x80 + $l) . "\xe1\x85" . chr(0xa1 + $v);
              # The trailing jamo is emitted under a different second byte
              # once its index reaches 25; index 0 means no trailing jamo.
              if ($t >= 25) {
                  $out .= "\xe1\x87" . chr(0x80 + $t - 25);
              } elseif ($t) {
                  $out .= "\xe1\x86" . chr(0xa7 + $t);
              }
              continue;
          }

          $out .= $c;
      }
      return $out;
  }
  /**
   * Sorts combining characters into canonical order. This is the
   * final step in creating decomposed normal forms D and KD.
   *
   * Consecutive characters listed in $utfCombiningClass are collected per
   * combining class and flushed in ascending class order as soon as a
   * character without a combining class (or the end of input) is reached.
   *
   * @private
   * @param $string String: a valid, decomposed UTF-8 string. Input is not validated.
   * @return string a UTF-8 string with combining characters sorted in canonical order
   */
  public static function fastCombiningSort($string)
  {
      self::loadData();
      global $utfCombiningClass;
      $len = strlen($string);
      $out = '';
      $combiners = array(); # Pending combining characters, keyed by class
      $lastClass = - 1;
      for ($i = 0; $i < $len; $i ++) {
          # Bracket offsets: the {} offset syntax no longer parses on PHP 8.
          $c = $string[$i];
          $n = ord($c);
          if ($n >= 0x80) {
              if ($n >= 0xf0) {
                  $c = substr($string, $i, 4);
                  $i += 3;
              } elseif ($n >= 0xe0) {
                  $c = substr($string, $i, 3);
                  $i += 2;
              } elseif ($n >= 0xc0) {
                  $c = substr($string, $i, 2);
                  $i ++;
              }

              if (isset($utfCombiningClass[$c])) {
                  # Characters of the same class keep their relative order.
                  $lastClass = $utfCombiningClass[$c];
                  if (isset($combiners[$lastClass])) {
                      $combiners[$lastClass] .= $c;
                  } else {
                      $combiners[$lastClass] = $c;
                  }
                  continue;
              }
          }

          # Not a combining character: flush whatever has been collected.
          if ($lastClass) {
              ksort($combiners);
              $out .= implode('', $combiners);
              $combiners = array();
          }
          $out .= $c;
          $lastClass = 0;
      }

      if ($lastClass) {
          ksort($combiners);
          $out .= implode('', $combiners);
      }
      return $out;
  }
  /**
   * Produces canonically composed sequences, i.e. normal form C or KC.
   *
   * Walks the string keeping one pending start character plus the combining
   * characters that could not be merged into it. Pairs found in
   * $utfCanonicalComp are merged via that table; Hangul jamo are merged
   * arithmetically on the UTF-8 bytes.
   *
   * @private
   * @param $string String: a valid UTF-8 string in sorted normal form D or KD. Input is not validated.
   * @return string a UTF-8 string with canonical precomposed characters used where possible
   */
  public static function fastCompose($string)
  {
      self::loadData();
      global $utfCanonicalComp, $utfCombiningClass;
      $len = strlen($string);
      $out = '';
      $lastClass = - 1;
      $lastHangul = 0; # Set once a trailing jamo has been merged
      $startChar = '';
      $combining = '';
      # Head-byte range used as a cheap pre-filter for the Hangul branch.
      $x1 = ord(substr(UTF8_HANGUL_VBASE, 0, 1));
      $x2 = ord(substr(UTF8_HANGUL_TEND, 0, 1));
      for ($i = 0; $i < $len; $i ++) {
          # Bracket offsets throughout: the {} offset syntax no longer
          # parses on PHP 8.
          $c = $string[$i];
          $n = ord($c);
          if ($n < 0x80) {
              # No combining characters here...
              $out .= $startChar;
              $out .= $combining;
              $startChar = $c;
              $combining = '';
              $lastClass = 0;
              continue;
          } elseif ($n >= 0xf0) {
              $c = substr($string, $i, 4);
              $i += 3;
          } elseif ($n >= 0xe0) {
              $c = substr($string, $i, 3);
              $i += 2;
          } elseif ($n >= 0xc0) {
              $c = substr($string, $i, 2);
              $i ++;
          }

          $pair = $startChar . $c;
          if ($n > 0x80 && isset($utfCombiningClass[$c])) {
              # A combining char; see what we can do with it
              $class = $utfCombiningClass[$c];
              if (! empty($startChar) && $lastClass < $class && $class > 0 && isset($utfCanonicalComp[$pair])) {
                  $startChar = $utfCanonicalComp[$pair];
                  $class = 0;
              } else {
                  $combining .= $c;
              }
              $lastClass = $class;
              $lastHangul = 0;
              continue;
          }

          # New start char
          if ($lastClass == 0) {
              if (isset($utfCanonicalComp[$pair])) {
                  $startChar = $utfCanonicalComp[$pair];
                  $lastHangul = 0;
                  continue;
              }

              if ($n >= $x1 && $n <= $x2) {
                  # WARNING: Hangul code is painfully slow.
                  # The arithmetic is inlined here because calling out to
                  # helper functions performs even worse. Lookup tables are
                  # marginally faster, but require a lot of space.
                  if ($c >= UTF8_HANGUL_VBASE && $c <= UTF8_HANGUL_VEND && $startChar >= UTF8_HANGUL_LBASE && $startChar <= UTF8_HANGUL_LEND) {
                      # Leading jamo + vowel jamo: the indexes come straight
                      # from the third UTF-8 byte of each character.
                      $lIndex = ord($startChar[2]) - 0x80;
                      $vIndex = ord($c[2]) - 0xa1;
                      $hangulPoint = UNICODE_HANGUL_FIRST + UNICODE_HANGUL_TCOUNT * (UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex);

                      # Hardcode the limited-range UTF-8 conversion:
                      $startChar = chr((($hangulPoint >> 12) & 0x0f) | 0xe0)
                          . chr((($hangulPoint >> 6) & 0x3f) | 0x80)
                          . chr(($hangulPoint & 0x3f) | 0x80);
                      $lastHangul = 0;
                      continue;
                  } elseif ($c >= UTF8_HANGUL_TBASE && $c <= UTF8_HANGUL_TEND && $startChar >= UTF8_HANGUL_FIRST && $startChar <= UTF8_HANGUL_LAST && ! $lastHangul) {
                      # Syllable + trailing jamo.
                      $tIndex = ord($c[2]) - 0xa7;
                      if ($tIndex < 0) {
                          $tIndex = ord($c[2]) - 0x80 + (0x11c0 - 0x11a7);
                      }

                      # Increment the code point by $tIndex, without
                      # the function overhead of decoding and recoding UTF-8:
                      # add to the last byte and carry into the bytes before it.
                      $tail = ord($startChar[2]) + $tIndex;
                      if ($tail > 0xbf) {
                          $tail -= 0x40;
                          $mid = ord($startChar[1]) + 1;
                          if ($mid > 0xbf) {
                              $startChar[0] = chr(ord($startChar[0]) + 1);
                              $mid -= 0x40;
                          }
                          $startChar[1] = chr($mid);
                      }
                      $startChar[2] = chr($tail);

                      # If there's another jamo char after this, *don't* try to merge it.
                      $lastHangul = 1;
                      continue;
                  }
              }
          }

          # Nothing could be merged: emit what is pending and start over
          # with the current character as the new start character.
          $out .= $startChar;
          $out .= $combining;
          $startChar = $c;
          $combining = '';
          $lastClass = 0;
          $lastHangul = 0;
      }
      $out .= $startChar . $combining;
      return $out;
  }
  /**
   * This is just used for the benchmark, comparing how long it takes to
   * iterate through a string without really doing anything of substance.
   * The input is copied byte by byte into a new string.
   *
   * @param $string string
   * @return string a byte-for-byte copy of $string
   */
  public static function placebo($string)
  {
      $len = strlen($string);
      $out = '';
      for ($i = 0; $i < $len; $i ++) {
          # Bracket offset: the {} offset syntax no longer parses on PHP 8.
          $out .= $string[$i];
      }
      return $out;
  }
  793. }