PageRenderTime 54ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/src/Joomla/String/String.php

https://github.com/piotr-cz/joomla-framework
PHP | 945 lines | 430 code | 68 blank | 447 comment | 78 complexity | f46256cf7a496f70ad530f0d7bf9c351 MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.1
  1. <?php
  2. /**
  3. * Part of the Joomla Framework String Package
  4. *
  5. * @copyright Copyright (C) 2005 - 2013 Open Source Matters, Inc. All rights reserved.
  6. * @license GNU General Public License version 2 or later; see LICENSE
  7. */
  8. namespace Joomla\String;
  // PHP mbstring and iconv local configuration: make UTF-8 the working encoding.
  if (version_compare(PHP_VERSION, '5.6', '>='))
  {
      // From PHP 5.6 the per-extension settings used below are deprecated
      // (iconv_set_encoding() raises E_DEPRECATED); "default_charset" is the
      // single directive mbstring and iconv fall back to when they are unset.
      // Suppress the warning in case ini_set is disabled.
      @ini_set('default_charset', 'UTF-8');
  }
  else
  {
      // Only configure mbstring when the extension is actually loaded
      // (nothing here attempts to load it).
      if (extension_loaded('mbstring'))
      {
          // Make sure to suppress the output in case ini_set is disabled
          @ini_set('mbstring.internal_encoding', 'UTF-8');
          @ini_set('mbstring.http_input', 'UTF-8');
          @ini_set('mbstring.http_output', 'UTF-8');
      }

      // Same for iconv
      if (function_exists('iconv'))
      {
          // These are settings that can be set inside code
          iconv_set_encoding("internal_encoding", "UTF-8");
          iconv_set_encoding("input_encoding", "UTF-8");
          iconv_set_encoding("output_encoding", "UTF-8");
      }
  }

  /**
   * Include the utf8 package (skipped when something else already defined UTF8)
   */
  if (!defined('UTF8'))
  {
      require_once __DIR__ . '/phputf8/utf8.php';
  }

  // strcasecmp support is a separate phputf8 file; load it unless already available.
  if (!function_exists('utf8_strcasecmp'))
  {
      require_once __DIR__ . '/phputf8/strcasecmp.php';
  }
  37. /**
  38. * String handling class for utf-8 data
  39. * Wraps the phputf8 library
  40. * All functions assume the validity of utf-8 strings.
  41. *
  42. * @since 1.0
  43. */
  44. abstract class String
  45. {
  /**
   * Increment styles, keyed by style name.
   *
   * Each style is a two-element array as consumed by increment():
   *  - element 0: the regular expression(s). Either one pattern used for both
   *    detecting and replacing an existing counter, or a pair
   *    array(search pattern, replace pattern). The search pattern must capture
   *    the current number in group 1.
   *  - element 1: the sprintf format(s). Either one format used in both cases,
   *    or a pair array(format for appending a new counter, format for
   *    rewriting an existing counter).
   *
   * @var    array
   * @since  1.0
   */
  protected static $incrementStyles = array(
      // "Label" -> "Label-2": the same pattern and format serve both cases.
      'dash' => array(
          '#-(\d+)$#',
          '-%d'
      ),
      // "Label" -> "Label (2)": a new counter is appended with a leading space,
      // an existing "(n)" is rewritten in place without one.
      'default' => array(
          array('#\((\d+)\)$#', '#\(\d+\)$#'),
          array(' (%d)', '(%d)'),
      ),
  );
  /**
   * Increments a trailing number in a string.
   *
   * Handy for producing distinct labels when copying objects. Supported styles:
   *
   * default: "Label" becomes "Label (2)"
   * dash:    "Label" becomes "Label-2"
   *
   * An unknown style name falls back to the default style.
   *
   * @param   string   $string  The source string.
   * @param   string   $style   The style (default|dash).
   * @param   integer  $n       If supplied, this number is used for the copy, otherwise it is the 'next' number.
   *
   * @return  string  The incremented string.
   *
   * @since   1.0
   */
  public static function increment($string, $style = 'default', $n = 0)
  {
      $spec = isset(self::$incrementStyles[$style]) ? self::$incrementStyles[$style] : self::$incrementStyles['default'];

      $patterns = $spec[0];
      $formats  = $spec[1];

      // A style may give one pattern/format for both cases, or a dedicated pair.
      $searchPattern  = is_array($patterns) ? $patterns[0] : $patterns;
      $replacePattern = is_array($patterns) ? $patterns[1] : $patterns;
      $appendFormat   = is_array($formats) ? $formats[0] : $formats;
      $rewriteFormat  = is_array($formats) ? $formats[1] : $formats;

      // No counter yet: append a fresh one (starting at 2 unless told otherwise).
      if (!preg_match($searchPattern, $string, $found))
      {
          return $string . sprintf($appendFormat, empty($n) ? 2 : $n);
      }

      // Existing counter: bump it, or overwrite it with the requested number.
      $next = empty($n) ? ($found[1] + 1) : $n;

      return preg_replace($replacePattern, sprintf($rewriteFormat, $next), $string);
  }
  /**
   * Tests whether a string contains only 7bit ASCII bytes.
   *
   * Useful for deciding whether a string needs UTF-8 aware handling at all,
   * e.g. to use the native PHP function when the data is plain ASCII:
   *
   * <code>
   * if (String::is_ascii($someString))
   * {
   *     // It's just ASCII - use the native PHP version
   *     $someString = strtolower($someString);
   * }
   * else
   * {
   *     $someString = String::strtolower($someString);
   * }
   * </code>
   *
   * @param   string  $str  The string to test.
   *
   * @return  boolean True if the string is all ASCII
   *
   * @since   1.0
   */
  public static function is_ascii($str)
  {
      // Look for any single byte outside the 0x00-0x7F range.
      $foundNonAscii = preg_match('/(?:[^\x00-\x7F])/', $str);

      return $foundNonAscii !== 1;
  }
  /**
   * UTF-8 aware alternative to strpos.
   *
   * Finds the position of the first occurrence of a string by delegating to utf8_strpos().
   *
   * @param   string   $str     String being examined
   * @param   string   $search  String being searched for
   * @param   integer  $offset  Optional, position from which the search should start; false means "not given"
   *
   * @return  mixed  Number of characters before the first match or FALSE on failure
   *
   * @see     http://www.php.net/strpos
   * @since   1.0
   */
  public static function strpos($str, $search, $offset = false)
  {
      // Forward the offset only when the caller actually supplied one.
      if ($offset !== false)
      {
          return utf8_strpos($str, $search, $offset);
      }

      return utf8_strpos($str, $search);
  }
  /**
   * UTF-8 aware alternative to strrpos.
   *
   * Finds the position of the last occurrence of a string by delegating to utf8_strrpos().
   *
   * @param   string   $str     String being examined.
   * @param   string   $search  String being searched for.
   * @param   integer  $offset  Offset from the left of the string.
   *
   * @return  mixed  Number of characters before the last match or false on failure
   *
   * @see     http://www.php.net/strrpos
   * @since   1.0
   */
  public static function strrpos($str, $search, $offset = 0)
  {
      $position = utf8_strrpos($str, $search, $offset);

      return $position;
  }
  /**
   * UTF-8 aware alternative to substr.
   *
   * Returns part of a string given a character offset (and optionally a length)
   * by delegating to utf8_substr().
   *
   * @param   string   $str     String being processed
   * @param   integer  $offset  Number of UTF-8 characters offset (from left)
   * @param   integer  $length  Optional length in UTF-8 characters from offset; false means "not given"
   *
   * @return  mixed  string or FALSE if failure
   *
   * @see     http://www.php.net/substr
   * @since   1.0
   */
  public static function substr($str, $offset, $length = false)
  {
      // Forward the length only when the caller actually supplied one.
      if ($length !== false)
      {
          return utf8_substr($str, $offset, $length);
      }

      return utf8_substr($str, $offset);
  }
  /**
   * UTF-8 aware alternative to strtolower.
   *
   * Makes a string lowercase by delegating to utf8_strtolower().
   * Note: The concept of a character's "case" only exists in some alphabets
   * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
   * not exist in the Chinese alphabet, for example. See Unicode Standard
   * Annex #21: Case Mappings
   *
   * @param   string  $str  String being processed
   *
   * @return  mixed  Either string in lowercase or FALSE if UTF-8 invalid
   *
   * @see     http://www.php.net/strtolower
   * @since   1.0
   */
  public static function strtolower($str)
  {
      $lowered = utf8_strtolower($str);

      return $lowered;
  }
  /**
   * UTF-8 aware alternative to strtoupper.
   *
   * Makes a string uppercase by delegating to utf8_strtoupper().
   * Note: The concept of a character's "case" only exists in some alphabets
   * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
   * not exist in the Chinese alphabet, for example. See Unicode Standard
   * Annex #21: Case Mappings
   *
   * @param   string  $str  String being processed
   *
   * @return  mixed  Either string in uppercase or FALSE if UTF-8 invalid
   *
   * @see     http://www.php.net/strtoupper
   * @since   1.0
   */
  public static function strtoupper($str)
  {
      $raised = utf8_strtoupper($str);

      return $raised;
  }
  /**
   * UTF-8 aware alternative to strlen.
   *
   * Returns the number of characters in the string (NOT THE NUMBER OF BYTES)
   * by delegating to utf8_strlen().
   *
   * @param   string  $str  UTF-8 string.
   *
   * @return  integer  Number of UTF-8 characters in string.
   *
   * @see     http://www.php.net/strlen
   * @since   1.0
   */
  public static function strlen($str)
  {
      $characters = utf8_strlen($str);

      return $characters;
  }
  /**
   * UTF-8 aware alternative to str_ireplace.
   *
   * Case-insensitive version of str_replace; delegates to utf8_ireplace()
   * from phputf8/str_ireplace.php.
   *
   * @param   string   $search   String to search for
   * @param   string   $replace  String to replace each match with
   * @param   string   $str      String in which the replacement is performed
   * @param   integer  $count    Optional value forwarded as the fourth argument of utf8_ireplace().
   *                             It is received by value here, so nothing is written back to the caller.
   *                             NOTE(review): presumably a limit on the number of replacements - confirm
   *                             against utf8_ireplace().
   *
   * @return  string  UTF-8 String
   *
   * @see     http://www.php.net/str_ireplace
   * @since   1.0
   */
  public static function str_ireplace($search, $replace, $str, $count = null)
  {
      require_once __DIR__ . '/phputf8/str_ireplace.php';

      // "Not supplied" is the declared default (null); false is still accepted
      // because the original check compared against it.
      if ($count === null || $count === false)
      {
          return utf8_ireplace($search, $replace, $str);
      }

      return utf8_ireplace($search, $replace, $str, $count);
  }
  /**
   * UTF-8 aware alternative to str_split.
   *
   * Converts a string to an array by delegating to utf8_str_split()
   * from phputf8/str_split.php.
   *
   * @param   string   $str        UTF-8 encoded string to process
   * @param   integer  $split_len  Number of characters to split the string by
   *
   * @return  array
   *
   * @see     http://www.php.net/str_split
   * @since   1.0
   */
  public static function str_split($str, $split_len = 1)
  {
      // The helper is loaded on demand, the first time this method runs.
      require_once __DIR__ . '/phputf8/str_split.php';

      $pieces = utf8_str_split($str, $split_len);

      return $pieces;
  }
  /**
   * UTF-8/LOCALE aware alternative to strcasecmp.
   *
   * A case insensitive string comparison. Without a locale the comparison is
   * delegated to utf8_strcasecmp(); with a locale both strings are lowercased
   * and compared with strcoll() under that collation locale.
   *
   * Note: when a locale is given, LC_COLLATE is changed through setlocale()
   * and is not restored before returning.
   *
   * @param   string  $str1    string 1 to compare
   * @param   string  $str2    string 2 to compare
   * @param   mixed   $locale  The locale used by strcoll or false to use classical comparison
   *
   * @return  integer   < 0 if str1 is less than str2; > 0 if str1 is greater than str2, and 0 if they are equal.
   *
   * @see     http://www.php.net/strcasecmp
   * @see     http://www.php.net/strcoll
   * @see     http://www.php.net/setlocale
   * @since   1.0
   */
  public static function strcasecmp($str1, $str2, $locale = false)
  {
      // Classical comparison when no locale was requested.
      if (!$locale)
      {
          return utf8_strcasecmp($str1, $str2);
      }

      // Remember the current locale, then try to switch; fall back if that fails.
      $previous = setlocale(LC_COLLATE, 0);
      $active   = setlocale(LC_COLLATE, $locale);

      if (!$active)
      {
          $active = $previous;
      }

      $lower1 = utf8_strtolower($str1);
      $lower2 = utf8_strtolower($str2);

      // A non-UTF-8 locale such as "xx_YY.1252" names a numeric code page the
      // strings must be recoded to. Every other locale (UTF-8, or one whose
      // encoding cannot be derived) is compared without recoding.
      $hasCodePage = !stristr($active, 'UTF-8') && stristr($active, '_') && preg_match('~\.(\d+)$~', $active, $m);

      if (!$hasCodePage)
      {
          return strcoll($lower1, $lower2);
      }

      $encoding = 'CP' . $m[1];

      return strcoll(
          self::transcode($lower1, 'UTF-8', $encoding),
          self::transcode($lower2, 'UTF-8', $encoding)
      );
  }
  /**
   * UTF-8/LOCALE aware alternative to strcmp.
   *
   * A case sensitive string comparison. Without a locale this is PHP's own
   * strcmp(); with a locale the strings are compared with strcoll() under
   * that collation locale.
   *
   * Note: when a locale is given, LC_COLLATE is changed through setlocale()
   * and is not restored before returning.
   *
   * @param   string  $str1    string 1 to compare
   * @param   string  $str2    string 2 to compare
   * @param   mixed   $locale  The locale used by strcoll or false to use classical comparison
   *
   * @return  integer  < 0 if str1 is less than str2; > 0 if str1 is greater than str2, and 0 if they are equal.
   *
   * @see     http://www.php.net/strcmp
   * @see     http://www.php.net/strcoll
   * @see     http://www.php.net/setlocale
   * @since   1.0
   */
  public static function strcmp($str1, $str2, $locale = false)
  {
      // Classical comparison when no locale was requested.
      if (!$locale)
      {
          return strcmp($str1, $str2);
      }

      // Remember the current locale, then try to switch; fall back if that fails.
      $previous = setlocale(LC_COLLATE, 0);
      $active   = setlocale(LC_COLLATE, $locale);

      if (!$active)
      {
          $active = $previous;
      }

      // A non-UTF-8 locale such as "xx_YY.1252" names a numeric code page the
      // strings must be recoded to. Every other locale (UTF-8, or one whose
      // encoding cannot be derived) is compared without recoding.
      $hasCodePage = !stristr($active, 'UTF-8') && stristr($active, '_') && preg_match('~\.(\d+)$~', $active, $m);

      if (!$hasCodePage)
      {
          return strcoll($str1, $str2);
      }

      $encoding = 'CP' . $m[1];

      return strcoll(self::transcode($str1, 'UTF-8', $encoding), self::transcode($str2, 'UTF-8', $encoding));
  }
  /**
   * UTF-8 aware alternative to strcspn.
   *
   * Finds the length of the initial segment not matching the mask by
   * delegating to utf8_strcspn() from phputf8/strcspn.php.
   *
   * An argument left at null (the default) or passed as false counts as
   * "not supplied" and is not forwarded, matching how strspn() treats its
   * optional arguments.
   *
   * @param   string   $str     The string to process
   * @param   string   $mask    The mask
   * @param   integer  $start   Optional starting character position (in characters)
   * @param   integer  $length  Optional length
   *
   * @return  integer  The length of the initial segment of str1 which does not contain any of the characters in str2
   *
   * @see     http://www.php.net/strcspn
   * @since   1.0
   */
  public static function strcspn($str, $mask, $start = null, $length = null)
  {
      require_once __DIR__ . '/phputf8/strcspn.php';

      // The defaults are null, while older callers may still pass false.
      $startOmitted  = ($start === null || $start === false);
      $lengthOmitted = ($length === null || $length === false);

      if ($startOmitted && $lengthOmitted)
      {
          return utf8_strcspn($str, $mask);
      }

      if ($lengthOmitted)
      {
          return utf8_strcspn($str, $mask, $start);
      }

      return utf8_strcspn($str, $mask, $start, $length);
  }
  /**
   * UTF-8 aware alternative to stristr.
   *
   * Returns all of the haystack from the first occurrence of the needle to
   * the end; both are examined in a case-insensitive manner. Delegates to
   * utf8_stristr() from phputf8/stristr.php.
   *
   * @param   string  $str     The haystack
   * @param   string  $search  The needle
   *
   * @return  string the sub string
   *
   * @see     http://www.php.net/stristr
   * @since   1.0
   */
  public static function stristr($str, $search)
  {
      // The helper is loaded on demand, the first time this method runs.
      require_once __DIR__ . '/phputf8/stristr.php';

      $tail = utf8_stristr($str, $search);

      return $tail;
  }
  /**
   * UTF-8 aware alternative to strrev.
   *
   * Reverses a string by delegating to utf8_strrev() from phputf8/strrev.php.
   *
   * @param   string  $str  String to be reversed
   *
   * @return  string   The string in reverse character order
   *
   * @see     http://www.php.net/strrev
   * @since   1.0
   */
  public static function strrev($str)
  {
      // The helper is loaded on demand, the first time this method runs.
      require_once __DIR__ . '/phputf8/strrev.php';

      $reversed = utf8_strrev($str);

      return $reversed;
  }
  /**
   * UTF-8 aware alternative to strspn.
   *
   * Finds the length of the initial segment matching the mask by delegating
   * to utf8_strspn() from phputf8/strspn.php. Optional arguments left at
   * null are not forwarded.
   *
   * @param   string   $str     The haystack
   * @param   string   $mask    The mask
   * @param   integer  $start   Start optional
   * @param   integer  $length  Length optional
   *
   * @return  integer
   *
   * @see     http://www.php.net/strspn
   * @since   1.0
   */
  public static function strspn($str, $mask, $start = null, $length = null)
  {
      require_once __DIR__ . '/phputf8/strspn.php';

      // A supplied length always means the full four-argument call.
      if ($length !== null)
      {
          return utf8_strspn($str, $mask, $start, $length);
      }

      if ($start !== null)
      {
          return utf8_strspn($str, $mask, $start);
      }

      return utf8_strspn($str, $mask);
  }
  /**
   * UTF-8 aware substr_replace.
   *
   * Replaces text within a portion of a string by delegating to
   * utf8_substr_replace(). A length left at null (the default) or passed as
   * false counts as "not supplied" and is not forwarded.
   *
   * @param   string   $str     The haystack
   * @param   string   $repl    The replacement string
   * @param   integer  $start   Start
   * @param   integer  $length  Length (optional)
   *
   * @return  string
   *
   * @see     http://www.php.net/substr_replace
   * @since   1.0
   */
  public static function substr_replace($str, $repl, $start, $length = null)
  {
      // No require here: utf8_substr_replace() is expected to be available
      // already ("loaded by library loader" per the original note).
      // NOTE(review): presumably pulled in by phputf8/utf8.php - confirm.
      if ($length === null || $length === false)
      {
          return utf8_substr_replace($str, $repl, $start);
      }

      return utf8_substr_replace($str, $repl, $start, $length);
  }
  /**
   * UTF-8 aware replacement for ltrim().
   *
   * Strips whitespace (or other characters) from the beginning of a string.
   * You only need to use this if you are supplying the charlist
   * optional arg and it contains UTF-8 characters. Otherwise ltrim will
   * work normally on a UTF-8 string.
   *
   * An empty charlist (other than the default false) leaves the string
   * untouched; the one-character list '0' is honoured.
   *
   * @param   string  $str       The string to be trimmed
   * @param   string  $charlist  The optional charlist of additional characters to trim
   *
   * @return  string  The trimmed string
   *
   * @see     http://www.php.net/ltrim
   * @since   1.0
   */
  public static function ltrim($str, $charlist = false)
  {
      // Nothing to strip for an empty charlist. The string '0' is "empty" to
      // PHP's empty() but is a legitimate list, so it must not be caught here.
      if ($charlist !== false && $charlist !== '0' && empty($charlist))
      {
          return $str;
      }

      require_once __DIR__ . '/phputf8/trim.php';

      if ($charlist === false)
      {
          return utf8_ltrim($str);
      }

      return utf8_ltrim($str, $charlist);
  }
  /**
   * UTF-8 aware replacement for rtrim().
   *
   * Strips whitespace (or other characters) from the end of a string.
   * You only need to use this if you are supplying the charlist
   * optional arg and it contains UTF-8 characters. Otherwise rtrim will
   * work normally on a UTF-8 string.
   *
   * An empty charlist (other than the default false) leaves the string
   * untouched; the one-character list '0' is honoured.
   *
   * @param   string  $str       The string to be trimmed
   * @param   string  $charlist  The optional charlist of additional characters to trim
   *
   * @return  string  The trimmed string
   *
   * @see     http://www.php.net/rtrim
   * @since   1.0
   */
  public static function rtrim($str, $charlist = false)
  {
      // Nothing to strip for an empty charlist. The string '0' is "empty" to
      // PHP's empty() but is a legitimate list, so it must not be caught here.
      if ($charlist !== false && $charlist !== '0' && empty($charlist))
      {
          return $str;
      }

      require_once __DIR__ . '/phputf8/trim.php';

      if ($charlist === false)
      {
          return utf8_rtrim($str);
      }

      return utf8_rtrim($str, $charlist);
  }
  /**
   * UTF-8 aware replacement for trim().
   *
   * Strips whitespace (or other characters) from the beginning and end of a string.
   * Note: you only need to use this if you are supplying the charlist
   * optional arg and it contains UTF-8 characters. Otherwise trim will
   * work normally on a UTF-8 string.
   *
   * An empty charlist (other than the default false) leaves the string
   * untouched; the one-character list '0' is honoured.
   *
   * @param   string  $str       The string to be trimmed
   * @param   string  $charlist  The optional charlist of additional characters to trim
   *
   * @return  string  The trimmed string
   *
   * @see     http://www.php.net/trim
   * @since   1.0
   */
  public static function trim($str, $charlist = false)
  {
      // Nothing to strip for an empty charlist. The string '0' is "empty" to
      // PHP's empty() but is a legitimate list, so it must not be caught here.
      if ($charlist !== false && $charlist !== '0' && empty($charlist))
      {
          return $str;
      }

      require_once __DIR__ . '/phputf8/trim.php';

      if ($charlist === false)
      {
          return utf8_trim($str);
      }

      return utf8_trim($str, $charlist);
  }
  /**
   * UTF-8 aware alternative to ucfirst.
   *
   * Makes a string's first character uppercase, or the first character of
   * every delimiter-separated word.
   *
   * @param   string  $str           String to be processed
   * @param   string  $delimiter     The words delimiter (null means do not split the string)
   * @param   string  $newDelimiter  The new words delimiter (null means equal to $delimiter)
   *
   * @return  string  If $delimiter is null, return the string with first character as upper case (if applicable)
   *                  else consider the string of words separated by the delimiter, apply the ucfirst to each words
   *                  and return the string with the new delimiter
   *
   * @see     http://www.php.net/ucfirst
   * @since   1.0
   */
  public static function ucfirst($str, $delimiter = null, $newDelimiter = null)
  {
      require_once __DIR__ . '/phputf8/ucfirst.php';

      // Whole-string mode: no splitting requested.
      if ($delimiter === null)
      {
          return utf8_ucfirst($str);
      }

      // Word mode: capitalise each piece, then glue with the (possibly new) delimiter.
      $glue  = ($newDelimiter === null) ? $delimiter : $newDelimiter;
      $words = explode($delimiter, $str);

      return implode($glue, array_map('utf8_ucfirst', $words));
  }
  /**
   * UTF-8 aware alternative to ucwords.
   *
   * Uppercases the first character of each word in a string by delegating
   * to utf8_ucwords() from phputf8/ucwords.php.
   *
   * @param   string  $str  String to be processed
   *
   * @return  string  String with first char of each word uppercase
   *
   * @see     http://www.php.net/ucwords
   * @since   1.0
   */
  public static function ucwords($str)
  {
      // The helper is loaded on demand, the first time this method runs.
      require_once __DIR__ . '/phputf8/ucwords.php';

      $capitalised = utf8_ucwords($str);

      return $capitalised;
  }
  /**
   * Transcode a string.
   *
   * Converts $source with iconv(), asking for transliteration and for
   * unconvertible characters to be ignored. The suffix syntax differs per
   * iconv implementation (see the linked PHP bug), hence the two branches.
   *
   * @param   string  $source         The string to transcode.
   * @param   string  $from_encoding  The source encoding.
   * @param   string  $to_encoding    The target encoding.
   *
   * @return  mixed  The transcoded string, or null if the source was not a string.
   *
   * @link    https://bugs.php.net/bug.php?id=48147
   *
   * @since   1.0
   */
  public static function transcode($source, $from_encoding, $to_encoding)
  {
      // Anything that is not a string is not transcoded at all.
      if (!is_string($source))
      {
          return null;
      }

      if (ICONV_IMPL == 'glibc')
      {
          // glibc: comma-separated flags, warnings suppressed.
          return @iconv($from_encoding, $to_encoding . '//TRANSLIT,IGNORE', $source);
      }

      // libiconv and any other implementation.
      return iconv($from_encoding, $to_encoding . '//IGNORE//TRANSLIT', $source);
  }
  /**
   * Tests a string as to whether it's valid UTF-8 and supported by the Unicode standard.
   *
   * The string is walked octet by octet. It is rejected on: a stray
   * continuation octet, a lead octet that cannot start a 1-4 octet sequence
   * (this covers the obsolete 5 and 6 octet forms), a missing continuation
   * octet, a non-shortest-form encoding, a surrogate code point, a code point
   * above U+10FFFF, or a sequence cut off by the end of the string.
   *
   * Note: this function has been modified to simply return true or false.
   *
   * @param   string  $str  UTF-8 encoded string.
   *
   * @return  boolean  true if valid
   *
   * @author  <hsivonen@iki.fi>
   * @see     http://hsivonen.iki.fi/php-utf8/
   * @see     compliant
   * @since   1.0
   */
  public static function valid($str)
  {
      // Continuation octets still expected before the current character is complete
      $pending = 0;

      // Code point assembled so far
      $codepoint = 0;

      // Total number of octets in the current sequence
      $seqLength = 1;

      $len = strlen($str);

      for ($i = 0; $i < $len; $i++)
      {
          // Square brackets: the {} string offset syntax no longer exists in PHP 8.
          $octet = ord($str[$i]);

          if ($pending == 0)
          {
              // Start of a character: US-ASCII or the lead octet of a sequence.
              if (0 == (0x80 & $octet))
              {
                  // US-ASCII, pass straight through.
                  $seqLength = 1;
              }
              elseif (0xC0 == (0xE0 & $octet))
              {
                  // First octet of 2 octet sequence
                  $codepoint = ($octet & 0x1F) << 6;
                  $pending   = 1;
                  $seqLength = 2;
              }
              elseif (0xE0 == (0xF0 & $octet))
              {
                  // First octet of 3 octet sequence
                  $codepoint = ($octet & 0x0F) << 12;
                  $pending   = 2;
                  $seqLength = 3;
              }
              elseif (0xF0 == (0xF8 & $octet))
              {
                  // First octet of 4 octet sequence
                  $codepoint = ($octet & 0x07) << 18;
                  $pending   = 3;
                  $seqLength = 4;
              }
              else
              {
                  /*
                   * Either a continuation octet with nothing to continue, or the
                   * lead octet of a 5/6 octet sequence. The latter can only encode
                   * a non-shortest form or a value beyond U+10FFFF, so it is
                   * rejected here instead of being decoded first.
                   */
                  return false;
              }

              continue;
          }

          // Inside a sequence: only a continuation octet (10xxxxxx) is legal.
          if (0x80 != (0xC0 & $octet))
          {
              return false;
          }

          $codepoint |= ($octet & 0x3F) << (($pending - 1) * 6);

          if (0 == --$pending)
          {
              // Sequence complete: check for illegal sequences and code points.
              if (((2 == $seqLength) && ($codepoint < 0x0080))   // From Unicode 3.1, non-shortest form is illegal
                  || ((3 == $seqLength) && ($codepoint < 0x0800))
                  || ((4 == $seqLength) && ($codepoint < 0x10000))
                  || (($codepoint & 0xFFFFF800) == 0xD800)       // From Unicode 3.2, surrogate characters are illegal
                  || ($codepoint > 0x10FFFF))                    // Codepoints outside the Unicode range are illegal
              {
                  return false;
              }

              // Reset for the next character.
              $codepoint = 0;
              $seqLength = 1;
          }
      }

      // Octets still pending here means the string ended mid-sequence.
      return $pending == 0;
  }
  /**
   * Tests whether a string complies as UTF-8.
   *
   * An empty string is compliant. Otherwise the string is compliant when a
   * PCRE pattern in UTF-8 mode (/u) can match its first character: with /u
   * an invalid subject matches nothing at all, even if parts of it are valid.
   *
   * This is a cheaper but less strict check than valid(). The original notes
   * warn that it lets five and six octet sequences through, which Unicode
   * does not support; do not rely on it alone to validate untrusted input.
   * NOTE(review): whether such sequences still pass depends on the PCRE
   * version in use - confirm before relying on either behaviour.
   *
   * @param   string  $str  UTF-8 string to check
   *
   * @return  boolean  TRUE if string is valid UTF-8
   *
   * @see     valid
   * @see     http://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805
   * @since   1.0
   */
  public static function compliant($str)
  {
      $isEmpty = (strlen($str) == 0);

      // Short-circuit keeps the regular expression from running on empty input.
      return $isEmpty || (preg_match('/^.{1}/us', $str, $firstChar) == 1);
  }
  877. }