PageRenderTime 48ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/src/utils/utf8.php

https://github.com/navyuginfo/libphutil
PHP | 790 lines | 438 code | 105 blank | 247 comment | 148 complexity | 61377bdbdfbaf66a708e6625ab92ca52 MD5 | raw file
Possible License(s): Apache-2.0
  1. <?php
  /**
   * Convert a string into valid UTF-8. This function is quite slow.
   *
   * When invalid byte subsequences are encountered, each offending byte is
   * replaced with U+FFFD, the Unicode replacement character. Overlong
   * encodings, UTF-16 surrogates and sequences above U+10FFFF are treated as
   * invalid. Strings which already pass @{function:phutil_is_utf8} are returned
   * unmodified.
   *
   * NOTE: When the input needs repair, NUL bytes are also replaced, because
   * the "valid" alternatives in the pattern below start at 0x01.
   *
   * @param string String to convert to valid UTF-8.
   * @return string String with invalid UTF-8 byte subsequences replaced with
   *                U+FFFD.
   * @group utf8
   */
  function phutil_utf8ize($string) {
    if (phutil_is_utf8($string)) {
      return $string;
    }

    // There is no function in iconv, mbstring or ICU to do this, so do it
    // (very very slowly) in pure PHP.

    // TODO: Provide an optional fast C implementation ala fb_utf8ize() if this
    // ever shows up in profiles?

    // Capture group 1 matches exactly one well-formed UTF-8 sequence. The
    // second-byte ranges are narrowed for the lead bytes 0xE0, 0xED, 0xF0 and
    // 0xF4 so that overlong forms, surrogates and code points beyond U+10FFFF
    // do not survive. Capture group 2 matches any other single byte.
    $regex =
      "/([\x01-\x7F]".
      "|[\xC2-\xDF][\x80-\xBF]".
      "|\xE0[\xA0-\xBF][\x80-\xBF]".
      "|[\xE1-\xEC\xEE\xEF][\x80-\xBF][\x80-\xBF]".
      "|\xED[\x80-\x9F][\x80-\xBF]".
      "|\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]".
      "|[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]".
      "|\xF4[\x80-\x8F][\x80-\xBF][\x80-\xBF])".
      "|(.)/";

    $result = array();
    $offset = 0;
    $matches = null;
    while (preg_match($regex, $string, $matches, 0, $offset)) {
      if (!isset($matches[2])) {
        // A well-formed sequence: keep it as-is.
        $result[] = $matches[1];
      } else {
        // Unicode replacement character, U+FFFD.
        $result[] = "\xEF\xBF\xBD";
      }
      // Advance by the number of bytes consumed by this match.
      $offset += strlen($matches[0]);
    }

    return implode('', $result);
  }
  /**
   * Determine if a string is valid UTF-8, with only basic multilingual plane
   * characters. This is particularly important because MySQL's `utf8` column
   * types silently truncate strings which contain characters outside of this
   * set.
   *
   * Only one, two and three byte sequences are accepted; NUL bytes and any
   * four byte (or longer) sequence cause the check to fail.
   *
   * @param string String to test for being valid UTF-8 with only characters in
   *               the basic multilingual plane.
   * @return bool True if the string is valid UTF-8 with only BMP characters.
   */
  function phutil_is_utf8_with_only_bmp_characters($string) {
    // NOTE: By default, PCRE segfaults on patterns like the one we would need
    // to use here at very small input sizes, at least on some systems (like
    // OS X). This is apparently because the internal implementation is
    // recursive and it blows the stack. See
    // <https://bugs.php.net/bug.php?id=45735> for some discussion. Since the
    // input limit is extremely low, walk the bytes by hand in PHP instead.

    $length = strlen($string);
    $pos = 0;

    while ($pos < $length) {
      $lead = ord($string[$pos]);

      // Plain ASCII (excluding NUL) is a complete character on its own.
      if ($lead >= 0x01 && $lead <= 0x7F) {
        $pos++;
        continue;
      }

      // Work out how many continuation bytes must follow, and the smallest
      // value permitted for the first of them.
      if ($lead >= 0xC2 && $lead <= 0xDF) {
        $trailing = 1;
        $first_min = 0x80;
      } else if ($lead == 0xE0) {
        // NOTE: This range starts at 0xA0, not 0x80. The lower values are
        // "valid", but not minimal representations, and MySQL rejects them.
        $trailing = 2;
        $first_min = 0xA0;
      } else if ($lead > 0xE0 && $lead <= 0xEF) {
        $trailing = 2;
        $first_min = 0x80;
      } else {
        return false;
      }

      // The sequence must not run off the end of the string.
      if ($pos + $trailing >= $length) {
        return false;
      }

      for ($jj = 1; $jj <= $trailing; $jj++) {
        $next = ord($string[$pos + $jj]);
        $min = ($jj == 1) ? $first_min : 0x80;
        if ($next < $min || $next > 0xBF) {
          return false;
        }
      }

      $pos += $trailing + 1;
    }

    return true;
  }
  /**
   * Determine if a string is valid UTF-8.
   *
   * Uses `mb_check_encoding()` when the mbstring extension is loaded and falls
   * back to a regular expression otherwise. The fallback rejects overlong
   * encodings (like "\xE0\x80\x80"), UTF-16 surrogates and sequences above
   * U+10FFFF. Unlike the mbstring path it also rejects NUL bytes, because its
   * ASCII range starts at 0x01.
   *
   * @param string Some string which may or may not be valid UTF-8.
   * @return bool True if the string is valid UTF-8.
   * @group utf8
   */
  function phutil_is_utf8($string) {
    if (function_exists('mb_check_encoding')) {
      // If mbstring is available, this is significantly faster than using PHP
      // regexps.
      return mb_check_encoding($string, 'UTF-8');
    }

    // Each alternative starts with a distinct set of lead bytes, so the
    // possessive quantifiers ("++" and "*+") cannot change which strings match;
    // they only stop PCRE from backtracking into already-consumed characters
    // when the string turns out to be invalid.
    $regex =
      "/^(?:".
      "[\x01-\x7F]++".
      "|[\xC2-\xDF][\x80-\xBF]".
      "|\xE0[\xA0-\xBF][\x80-\xBF]".
      "|[\xE1-\xEC\xEE\xEF][\x80-\xBF][\x80-\xBF]".
      "|\xED[\x80-\x9F][\x80-\xBF]".
      "|\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]".
      "|[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]".
      "|\xF4[\x80-\x8F][\x80-\xBF][\x80-\xBF]".
      ")*+\\z/";

    // preg_match() returns false on an internal PCRE error; that is reported
    // as "not valid UTF-8".
    return (bool)preg_match($regex, $string);
  }
  /**
   * Find the character length of a UTF-8 string.
   *
   * The length is computed by counting every byte which is not a UTF-8
   * continuation byte (0x80-0xBF). For valid UTF-8 this is exactly the number
   * of code points. This avoids `utf8_decode()`, which is deprecated in recent
   * PHP versions.
   *
   * @param string A valid utf-8 string.
   * @return int The character length of the string.
   * @group utf8
   */
  function phutil_utf8_strlen($string) {
    // Mode 1 returns only the byte values which actually occur, keyed by
    // byte value, with their frequency.
    $frequencies = count_chars($string, 1);

    $continuation_bytes = 0;
    foreach ($frequencies as $byte => $count) {
      if ($byte >= 0x80 && $byte <= 0xBF) {
        $continuation_bytes += $count;
      }
    }

    return strlen($string) - $continuation_bytes;
  }
  /**
   * Find the console display length of a UTF-8 string. This may differ from the
   * character length of the string if it contains double-width characters, like
   * many Chinese characters.
   *
   * This method is based on a C implementation here, which is based on the IEEE
   * standards. The source has more discussion and addresses more considerations
   * than this implementation does.
   *
   *   http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
   *
   * NOTE: We currently assume width 1 for East-Asian ambiguous characters.
   *
   * NOTE: This function is VERY slow.
   *
   * @param string A valid UTF-8 string.
   * @return int The console display length of the string.
   * @group utf8
   */
  function phutil_utf8_console_strlen($string) {
    // Formatting and colors don't contribute any width in the console.
    $string = preg_replace("/\x1B\[\d*m/", '', $string);

    // In the common case of an ASCII string, just return the string length.
    if (preg_match('/^[\x01-\x7F]*\z/', $string)) {
      return strlen($string);
    }

    $width = 0;

    // NOTE: To deal with combining characters, the string is split into
    // glyphs first (characters with combiners) and only the first codepoint
    // of each glyph contributes to the width.
    foreach (phutil_utf8v_combined($string) as $glyph) {
      $codepoints = phutil_utf8v_codepoints($glyph);
      if (!$codepoints) {
        continue;
      }

      $first = reset($codepoints);

      // A glyph starting with a NUL codepoint contributes nothing.
      if ($first == 0) {
        continue;
      }

      $is_wide =
        ($first >= 0x1100 &&
          ($first <= 0x115f ||                      /* Hangul Jamo init. consonants */
            $first == 0x2329 || $first == 0x232a ||
            ($first >= 0x2e80 && $first <= 0xa4cf &&
              $first != 0x303f) ||                  /* CJK ... Yi */
            ($first >= 0xac00 && $first <= 0xd7a3) || /* Hangul Syllables */
            ($first >= 0xf900 && $first <= 0xfaff) || /* CJK Compatibility Ideographs */
            ($first >= 0xfe10 && $first <= 0xfe19) || /* Vertical forms */
            ($first >= 0xfe30 && $first <= 0xfe6f) || /* CJK Compatibility Forms */
            ($first >= 0xff00 && $first <= 0xff60) || /* Fullwidth Forms */
            ($first >= 0xffe0 && $first <= 0xffe6) ||
            ($first >= 0x20000 && $first <= 0x2fffd) ||
            ($first >= 0x30000 && $first <= 0x3fffd)));

      // Wide glyphs take two columns, everything else takes one.
      $width += $is_wide ? 2 : 1;
    }

    return $width;
  }
  /**
   * Split a UTF-8 string into an array of characters. Combining characters are
   * also split.
   *
   * The sequence length is derived from each lead byte (one to six bytes), and
   * every following byte of the sequence must be a continuation byte in the
   * range 0x80-0xBF.
   *
   * @param string A valid utf-8 string.
   * @return list A list of characters in the string.
   * @throws Exception If a lead byte is invalid, a sequence is truncated, or a
   *                   byte inside a sequence is not a continuation byte.
   * @group utf8
   */
  function phutil_utf8v($string) {
    $res = array();
    $len = strlen($string);
    $ii = 0;

    while ($ii < $len) {
      $byte = ord($string[$ii]);

      if ($byte <= 0x7F) {
        // Single-byte (ASCII) character.
        $res[] = $string[$ii];
        $ii += 1;
        continue;
      } else if ($byte < 0xC0) {
        // A continuation byte can not start a character.
        throw new Exception('Invalid UTF-8 string passed to phutil_utf8v().');
      } else if ($byte <= 0xDF) {
        $seq_len = 2;
      } else if ($byte <= 0xEF) {
        $seq_len = 3;
      } else if ($byte <= 0xF7) {
        $seq_len = 4;
      } else if ($byte <= 0xFB) {
        $seq_len = 5;
      } else if ($byte <= 0xFD) {
        $seq_len = 6;
      } else {
        throw new Exception('Invalid UTF-8 string passed to phutil_utf8v().');
      }

      // The whole sequence must fit inside the string.
      if ($ii + $seq_len > $len) {
        throw new Exception('Invalid UTF-8 string passed to phutil_utf8v().');
      }

      // Every trailing byte must be a continuation byte. Checking only the
      // upper bound would let an ASCII byte be swallowed into the sequence.
      for ($jj = 1; $jj < $seq_len; ++$jj) {
        $next = ord($string[$ii + $jj]);
        if ($next < 0x80 || $next > 0xBF) {
          throw new Exception('Invalid UTF-8 string passed to phutil_utf8v().');
        }
      }

      $res[] = substr($string, $ii, $seq_len);
      $ii += $seq_len;
    }

    return $res;
  }
  /**
   * Split a UTF-8 string into an array of codepoints (as integers).
   *
   * The string is first split with @{function:phutil_utf8v}; each resulting
   * character is then decoded from its lead byte and continuation bytes.
   *
   * @param string A valid UTF-8 string.
   * @return list A list of codepoints, as integers.
   * @group utf8
   */
  function phutil_utf8v_codepoints($string) {
    $codepoints = array();

    foreach (phutil_utf8v($string) as $index => $character) {
      $lead = ord($character[0]);

      // Determine the payload bits carried by the lead byte and how many
      // continuation bytes follow it.
      if (($lead & 0x80) == 0) {
        $value = $lead;
        $trailing = 0;
      } else if (($lead & 0xE0) == 0xC0) {
        $value = $lead & 0x1F;
        $trailing = 1;
      } else if (($lead & 0xF0) == 0xE0) {
        $value = $lead & 0x0F;
        $trailing = 2;
      } else if (($lead & 0xF8) == 0xF0) {
        $value = $lead & 0x07;
        $trailing = 3;
      } else if (($lead & 0xFC) == 0xF8) {
        $value = $lead & 0x03;
        $trailing = 4;
      } else if (($lead & 0xFE) == 0xFC) {
        $value = $lead & 0x01;
        $trailing = 5;
      } else {
        // Unrecognized lead byte: report codepoint 0.
        $value = 0;
        $trailing = 0;
      }

      // Each continuation byte contributes its low six bits.
      for ($jj = 1; $jj <= $trailing; $jj++) {
        $value = ($value << 6) + (ord($character[$jj]) & 0x3F);
      }

      $codepoints[$index] = $value;
    }

    return $codepoints;
  }
  /**
   * Shorten a string to provide a summary, respecting UTF-8 characters. This
   * function attempts to truncate strings at word boundaries.
   *
   * NOTE: This function makes a best effort to apply some reasonable rules but
   * will not work well for the full range of unicode languages.
   *
   * @param string UTF-8 string to shorten.
   * @param int Maximum length of the result.
   * @param string If the string is shortened, add this at the end. Defaults to
   *               horizontal ellipsis.
   * @return string A string with no more than the specified character length.
   *
   * @group utf8
   */
  function phutil_utf8_shorten($string, $length, $terminal = "\xE2\x80\xA6") {
    // A string with no more bytes than the limit can not have more characters
    // than the limit either, so skip the expensive work.
    if (strlen($string) <= $length) {
      return $string;
    }

    // Work on glyphs (characters together with their combining marks).
    $glyphs = phutil_utf8v_combined($string);
    if (count($glyphs) <= $length) {
      // Already short enough once measured in characters.
      return $string;
    }

    // NOTE: This is not complete, and there are many other word boundary
    // characters and reasonable places to break words in the UTF-8 character
    // space. For now, this gives us reasonable behavior for latin languages.
    // We don't necessarily have access to PCRE+Unicode so there isn't a great
    // way for us to look up character attributes.

    // If we encounter these, prefer to break on them instead of cutting the
    // string off in the middle of a word.
    static $break_characters = array(
      ' '   => true,
      "\n"  => true,
      ';'   => true,
      ':'   => true,
      '['   => true,
      '('   => true,
      ','   => true,
      '-'   => true,
    );

    // If we encounter these, shorten to this character exactly without
    // appending the terminal.
    static $stop_characters = array(
      '.'   => true,
      '!'   => true,
      '?'   => true,
    );

    // If we do a word break with a terminal, we have to look beyond at least
    // the number of characters in the terminal. If the terminal is longer than
    // the required length, this is 0 and the terminal is returned on its own.
    $terminal_len = phutil_utf8_strlen($terminal);
    $terminal_area = $length - min($length, $terminal_len);

    // Search backward in the string, looking for reasonable places to break
    // it.
    $word_boundary = null;
    $stop_boundary = null;

    $pos = $length;
    while ($pos >= 0) {
      $glyph = $glyphs[$pos];

      if (isset($break_characters[$glyph]) && ($pos <= $terminal_area)) {
        // Keep walking back through a run of break characters.
        $word_boundary = $pos;
      } else {
        if (isset($stop_characters[$glyph]) && ($pos < $length)) {
          $stop_boundary = $pos + 1;
          break;
        }
        if ($word_boundary !== null) {
          // We've reached the word preceding the break characters.
          break;
        }
      }

      $pos--;
    }

    if ($stop_boundary !== null) {
      // We found a character like ".". Cut the string there, without
      // appending the terminal.
      return implode('', array_slice($glyphs, 0, $stop_boundary));
    }

    // If we didn't find any boundary characters or we found ONLY boundary
    // characters, just break at the maximum character length.
    if ($word_boundary === null || $word_boundary === 0) {
      $word_boundary = $terminal_area;
    }

    $kept = implode('', array_slice($glyphs, 0, $word_boundary));

    return $kept.$terminal;
  }
  /**
   * Hard-wrap a block of UTF-8 text with embedded HTML tags and entities.
   *
   * An entity ("&...;") counts as a single character and a tag ("<...>")
   * counts as zero characters. An entity or tag which is never terminated is
   * consumed up to the end of the input.
   *
   * @param string An HTML string with tags and entities.
   * @param int Number of visible characters per line.
   * @return list List of hard-wrapped lines.
   * @group utf8
   */
  function phutil_utf8_hard_wrap_html($string, $width) {
    $break_here = array();

    // Convert the UTF-8 string into a list of UTF-8 characters.
    $vector = phutil_utf8v($string);
    $len = count($vector);
    $char_pos = 0;

    for ($ii = 0; $ii < $len; ++$ii) {
      if ($vector[$ii] == '&') {
        // An ampersand indicates an HTML entity; consume the whole thing
        // (until ";") but treat it all as one character. The scan is bounded
        // by the vector length so an unterminated entity can not loop forever.
        do {
          ++$ii;
        } while ($ii < $len && $vector[$ii] != ';');
        ++$char_pos;
      } else if ($vector[$ii] == '<') {
        // An "<" indicates an HTML tag, consume the whole thing but don't
        // treat it as a character. Bounded for the same reason as above.
        do {
          ++$ii;
        } while ($ii < $len && $vector[$ii] != '>');
      } else {
        ++$char_pos;
      }

      // Keep track of where we need to break the string later.
      if ($char_pos == $width) {
        $break_here[$ii] = true;
        $char_pos = 0;
      }
    }

    $result = array();
    $string = '';
    foreach ($vector as $ii => $char) {
      $string .= $char;
      if (isset($break_here[$ii])) {
        $result[] = $string;
        $string = '';
      }
    }

    // Whatever is left over forms the final, possibly shorter, line.
    if (strlen($string)) {
      $result[] = $string;
    }

    return $result;
  }
  /**
   * Hard-wrap a block of UTF-8 text with no embedded HTML tags and entities.
   *
   * The input is split into lines first (line endings are discarded), and each
   * line is then cut into pieces of at most the given number of characters.
   *
   * @param string A non HTML string
   * @param int Width of the hard-wrapped lines
   * @return list List of hard-wrapped lines.
   * @group utf8
   */
  function phutil_utf8_hard_wrap($string, $width) {
    $wrapped = array();

    foreach (phutil_split_lines($string, false) as $line) {
      $piece = '';
      $seen = 0;

      // Walk the line one UTF-8 character at a time, flushing the current
      // piece every time the running character count hits a multiple of the
      // width.
      foreach (phutil_utf8v($line) as $character) {
        $piece .= $character;
        $seen++;
        if (($seen % $width) === 0) {
          $wrapped[] = $piece;
          $piece = '';
        }
      }

      // Flush the trailing partial piece, if any.
      if (strlen($piece)) {
        $wrapped[] = $piece;
      }
    }

    return $wrapped;
  }
  /**
   * Convert a string from one encoding (like ISO-8859-1) to another encoding
   * (like UTF-8).
   *
   * This is primarily a thin wrapper around `mb_convert_encoding()` which checks
   * you have the extension installed, since we try to require the extension
   * only if you actually need it (i.e., you want to work with encodings other
   * than UTF-8).
   *
   * NOTE: This function assumes that the input is in the given source encoding.
   * If it is not, it may not output in the specified target encoding. If you
   * need to perform a hard conversion to UTF-8, use this function in conjunction
   * with @{function:phutil_utf8ize}. We can detect failures caused by invalid
   * encoding names, but `mb_convert_encoding()` fails silently if the
   * encoding name identifies a real encoding but the string is not actually
   * encoded with that encoding.
   *
   * @param string String to re-encode.
   * @param string Target encoding name, like "UTF-8".
   * @param string Source encoding name, like "ISO-8859-1".
   * @return string Input string, with converted character encoding.
   * @throws InvalidArgumentException If either encoding name is empty.
   * @throws Exception If mbstring is unavailable or the conversion fails.
   *
   * @group utf8
   *
   * @phutil-external-symbol function mb_convert_encoding
   */
  function phutil_utf8_convert($string, $to_encoding, $from_encoding) {
    if (!$from_encoding) {
      throw new InvalidArgumentException(
        'Attempting to convert a string encoding, but no source encoding '.
        'was provided. Explicitly provide the source encoding.');
    }
    if (!$to_encoding) {
      throw new InvalidArgumentException(
        'Attempting to convert a string encoding, but no target encoding '.
        'was provided. Explicitly provide the target encoding.');
    }

    // Normalize encoding names so we can no-op the very common case of UTF8
    // to UTF8 (or any other conversion where both encodings are identical).
    $to_upper = strtoupper(str_replace('-', '', $to_encoding));
    $from_upper = strtoupper(str_replace('-', '', $from_encoding));
    if ($from_upper == $to_upper) {
      return $string;
    }

    if (!function_exists('mb_convert_encoding')) {
      throw new Exception(
        "Attempting to convert a string encoding from '{$from_encoding}' ".
        "to '{$to_encoding}', but the 'mbstring' PHP extension is not ".
        "available. Install mbstring to work with encodings other than ".
        "UTF-8.");
    }

    // Older PHP versions signal an unknown encoding with a warning and a
    // `false` return; newer versions throw a ValueError instead. Funnel both
    // into the same Exception so callers see one failure mode.
    $failure = null;
    try {
      $result = @mb_convert_encoding($string, $to_encoding, $from_encoding);
    } catch (ValueError $ex) {
      $result = false;
      $failure = $ex->getMessage();
    }

    if ($result === false) {
      if ($failure === null) {
        // error_get_last() returns null when no error was recorded; fall back
        // to a generic message rather than an empty one.
        $last_error = error_get_last();
        $failure = isset($last_error['message'])
          ? $last_error['message']
          : 'Unknown error.';
      }
      throw new Exception(
        "String conversion from encoding '{$from_encoding}' to encoding ".
        "'{$to_encoding}' failed: {$failure}");
    }

    return $result;
  }
  /**
   * Convert a string to title case in a UTF8-aware way. This function doesn't
   * necessarily do a great job, but the builtin implementation of ucwords() can
   * completely destroy inputs, so it just has to be better than that. Similar to
   * @{function:ucwords}.
   *
   * Only the ASCII letters "a" through "z" are uppercased, and only when they
   * appear at the start of the string or directly after a space.
   *
   * @param string UTF-8 input string.
   * @return string Input, in some semblance of title case.
   *
   * @group utf8
   */
  function phutil_utf8_ucwords($str) {
    // NOTE: mb_convert_case() discards uppercase letters in words when
    // converting to title case. For example, it will convert "AAA" into "Aaa",
    // which is undesirable.

    $lowest = ord('a');
    $highest = ord('z');

    $pieces = array();
    $at_word_start = true;

    foreach (phutil_utf8v($str) as $character) {
      $piece = $character;

      if ($at_word_start) {
        $code = ord($character[0]);
        if ($code >= $lowest && $code <= $highest) {
          $piece = phutil_utf8_strtoupper($character);
        }
      }

      $pieces[] = $piece;

      // The next character starts a word only if this one is a space.
      $at_word_start = ($character === ' ');
    }

    return implode('', $pieces);
  }
  /**
   * Convert a string to lower case in a UTF8-aware way. Similar to
   * @{function:strtolower}.
   *
   * When mbstring is available, `mb_convert_case()` does the work. Otherwise
   * only the ASCII letters "A" through "Z" are lowercased.
   *
   * @param string UTF-8 input string.
   * @return string Input, in some semblance of lower case.
   *
   * @group utf8
   *
   * @phutil-external-symbol function mb_convert_case
   */
  function phutil_utf8_strtolower($str) {
    // Lazily built ASCII translation table, shared between calls.
    static $ascii_map = null;

    if (function_exists('mb_convert_case')) {
      return mb_convert_case($str, MB_CASE_LOWER, 'UTF-8');
    }

    if ($ascii_map === null) {
      $ascii_map = array();
      foreach (range('A', 'Z') as $upper) {
        // Lowercase ASCII letters sit 32 positions after their uppercase form.
        $ascii_map[$upper] = chr(ord($upper) + 32);
      }
    }

    return phutil_utf8_strtr($str, $ascii_map);
  }
  /**
   * Convert a string to upper case in a UTF8-aware way. Similar to
   * @{function:strtoupper}.
   *
   * When mbstring is available, `mb_convert_case()` does the work. Otherwise
   * only the ASCII letters "a" through "z" are uppercased.
   *
   * @param string UTF-8 input string.
   * @return string Input, in some semblance of upper case.
   *
   * @group utf8
   *
   * @phutil-external-symbol function mb_convert_case
   */
  function phutil_utf8_strtoupper($str) {
    // Lazily built ASCII translation table, shared between calls.
    static $ascii_map = null;

    if (function_exists('mb_convert_case')) {
      return mb_convert_case($str, MB_CASE_UPPER, 'UTF-8');
    }

    if ($ascii_map === null) {
      $ascii_map = array();
      foreach (range('a', 'z') as $lower) {
        // Uppercase ASCII letters sit 32 positions before their lowercase form.
        $ascii_map[$lower] = chr(ord($lower) - 32);
      }
    }

    return phutil_utf8_strtr($str, $ascii_map);
  }
  /**
   * Replace characters in a string in a UTF-aware way. Similar to
   * @{function:strtr}.
   *
   * The string is split into UTF-8 characters and each character which has a
   * (non-null) entry in the map is swapped for the mapped value; all other
   * characters pass through untouched.
   *
   * @param string UTF-8 input string.
   * @param map<string, string> Map of characters to replace.
   * @return string Input with translated characters.
   *
   * @group utf8
   */
  function phutil_utf8_strtr($str, array $map) {
    $translated = array();

    foreach (phutil_utf8v($str) as $character) {
      $translated[] = isset($map[$character])
        ? $map[$character]
        : $character;
    }

    return implode('', $translated);
  }
  /**
   * Determine if a given unicode character is a combining character or not.
   *
   * The input is decoded into codepoints, and the result is true as soon as
   * any of them falls inside one of the combining mark blocks listed below.
   *
   * @param string A single unicode character.
   * @return boolean True or false.
   *
   * @group utf8
   */
  function phutil_utf8_is_combining_character($character) {
    // Inclusive codepoint ranges of the recognized combining blocks.
    static $combining_ranges = array(
      // Combining Diacritical Marks.
      array(0x0300, 0x036F),
      // Combining Diacritical Marks Supplement.
      array(0x1DC0, 0x1DFF),
      // Combining Diacritical Marks for Symbols.
      array(0x20D0, 0x20FF),
      // Combining Half Marks.
      array(0xFE20, 0xFE2F),
    );

    foreach (phutil_utf8v_codepoints($character) as $codepoint) {
      foreach ($combining_ranges as $range) {
        list($low, $high) = $range;
        if ($codepoint >= $low && $codepoint <= $high) {
          return true;
        }
      }
    }

    return false;
  }
  /**
   * Split a UTF-8 string into an array of characters. Combining characters
   * are not split.
   *
   * Every combining character is appended to the element before it. If the
   * string starts with a combining character, it is attached to a space which
   * is inserted as the first element.
   *
   * @param string A valid utf-8 string.
   * @return list A list of characters in the string.
   *
   * @group utf8
   */
  function phutil_utf8v_combined($string) {
    $glyphs = array();

    // Index of the most recently added glyph, or -1 while the list is empty.
    $last = -1;

    // Build the list in a single pass rather than removing elements and
    // reindexing the whole array for every combining character.
    foreach (phutil_utf8v($string) as $character) {
      if (phutil_utf8_is_combining_character($character)) {
        if ($last < 0) {
          // A leading combining character has nothing to attach to, so give
          // it a space as its base.
          $glyphs[] = ' ';
          $last = 0;
        }
        $glyphs[$last] .= $character;
      } else {
        $glyphs[] = $character;
        $last++;
      }
    }

    return $glyphs;
  }