PageRenderTime 56ms CodeModel.GetById 26ms RepoModel.GetById 0ms app.codeStats 0ms

/src/Joomla/String/String.php

https://github.com/dianaprajescu/joomla-framework
PHP | 939 lines | 424 code | 68 blank | 447 comment | 76 complexity | 7de709054b3a645a2b24126c64822a32 MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.1
  1. <?php
  2. /**
  3. * Part of the Joomla Framework String Package
  4. *
  5. * @copyright Copyright (C) 2005 - 2013 Open Source Matters, Inc. All rights reserved.
  6. * @license GNU General Public License version 2 or later; see LICENSE
  7. */
  8. namespace Joomla\String;
  // Configure the mbstring and iconv extensions to treat all data as UTF-8.
  // Only extensions that are already available are configured; nothing is loaded here.
  if (extension_loaded('mbstring'))
  {
      // Suppress warnings in case ini_set() is disabled on this host
      @ini_set('mbstring.internal_encoding', 'UTF-8');
      @ini_set('mbstring.http_input', 'UTF-8');
      @ini_set('mbstring.http_output', 'UTF-8');
  }

  // Same for iconv
  if (function_exists('iconv'))
  {
      if (version_compare(PHP_VERSION, '5.6', '>='))
      {
          // iconv_set_encoding() is deprecated as of PHP 5.6 and raises E_DEPRECATED;
          // from that version on the iconv encodings follow default_charset.
          @ini_set('default_charset', 'UTF-8');
      }
      else
      {
          // Older PHP: these settings can still be set directly from code
          iconv_set_encoding('internal_encoding', 'UTF-8');
          iconv_set_encoding('input_encoding', 'UTF-8');
          iconv_set_encoding('output_encoding', 'UTF-8');
      }
  }
  26. /**
  27. * Include the utf8 package
  28. */
  29. jimport('phputf8.utf8');
  30. jimport('phputf8.strcasecmp');
  31. /**
  32. * String handling class for utf-8 data
  33. * Wraps the phputf8 library
  34. * All functions assume the validity of utf-8 strings.
  35. *
  36. * @since 1.0
  37. */
  38. abstract class String
  39. {
  /**
   * Patterns and formats used by increment(), keyed by style name.
   *
   * Each style is a two element array: element 0 is the regular expression and
   * element 1 the sprintf format. Either element may be a single string (used for
   * both purposes) or a pair:
   *  - patterns: array(search pattern, replace pattern)
   *  - formats:  array(format when appending a new counter, format when replacing an existing one)
   *
   * @var    array
   * @since  1.0
   */
  protected static $incrementStyles = array(
      'dash'    => array('#-(\d+)$#', '-%d'),
      'default' => array(
          array('#\((\d+)\)$#', '#\(\d+\)$#'),
          array(' (%d)', '(%d)')
      )
  );
  /**
   * Increments the trailing number of a string, or appends one if none is present.
   *
   * Handy for producing distinct labels when copying objects. Available styles:
   *
   * default: "Label" becomes "Label (2)"
   * dash:    "Label" becomes "Label-2"
   *
   * An unknown style name falls back to the default style.
   *
   * @param   string   $string  The source string.
   * @param   string   $style   The style (default|dash).
   * @param   integer  $n       If supplied (non-empty), this number is used for the copy, otherwise the 'next' number is used.
   *
   * @return  string  The incremented string.
   *
   * @since   1.0
   */
  public static function increment($string, $style = 'default', $n = 0)
  {
      $styleKey = isset(self::$incrementStyles[$style]) ? $style : 'default';

      list($patterns, $formats) = self::$incrementStyles[$styleKey];

      // A style may define one pattern/format, or a separate pair for search/replace and new/existing.
      $searchRx  = is_array($patterns) ? $patterns[0] : $patterns;
      $replaceRx = is_array($patterns) ? $patterns[1] : $patterns;
      $appendFmt = is_array($formats) ? $formats[0] : $formats;
      $updateFmt = is_array($formats) ? $formats[1] : $formats;

      if (!preg_match($searchRx, $string, $found))
      {
          // No trailing counter yet: append one, starting at 2 unless a number was given
          return $string . sprintf($appendFmt, empty($n) ? 2 : $n);
      }

      // A counter already exists: replace it with the given number or its successor
      $next = empty($n) ? ($found[1] + 1) : $n;

      return preg_replace($replaceRx, sprintf($updateFmt, $next), $string);
  }
  /**
   * Checks that a string consists solely of 7bit ASCII bytes.
   *
   * Useful for deciding whether a string needs UTF-8 aware handling at all;
   * plain ASCII can go through the native PHP functions, e.g.:
   *
   * <code>
   * if (String::is_ascii($someString))
   * {
   *     // It's just ASCII - use the native PHP version
   *     $someString = strtolower($someString);
   * }
   * else
   * {
   *     $someString = String::strtolower($someString);
   * }
   * </code>
   *
   * @param   string  $str  The string to test.
   *
   * @return  boolean  True if the string is all ASCII
   *
   * @since   1.0
   */
  public static function is_ascii($str)
  {
      // Look for a single byte outside the 0x00-0x7F range
      $foundNonAscii = preg_match('/(?:[^\x00-\x7F])/', $str);

      return $foundNonAscii !== 1;
  }
  /**
   * UTF-8 aware alternative to strpos.
   *
   * Finds the position of the first occurrence of a string by delegating to utf8_strpos().
   *
   * @param   string   $str     String being examined
   * @param   string   $search  String being searched for
   * @param   integer  $offset  Optional position from which the search should start; false (the default) omits it
   *
   * @return  mixed  Number of characters before the first match or FALSE on failure
   *
   * @see     http://www.php.net/strpos
   * @since   1.0
   */
  public static function strpos($str, $search, $offset = false)
  {
      if ($offset !== false)
      {
          return utf8_strpos($str, $search, $offset);
      }

      // No offset supplied: let the library use its own default
      return utf8_strpos($str, $search);
  }
  /**
   * UTF-8 aware alternative to strrpos.
   *
   * Finds the position of the last occurrence of a string by delegating to utf8_strrpos().
   *
   * @param   string   $str     String being examined.
   * @param   string   $search  String being searched for.
   * @param   integer  $offset  Offset from the left of the string.
   *
   * @return  mixed  Number of characters before the last match or false on failure
   *
   * @see     http://www.php.net/strrpos
   * @since   1.0
   */
  public static function strrpos($str, $search, $offset = 0)
  {
      $position = utf8_strrpos($str, $search, $offset);

      return $position;
  }
  /**
   * UTF-8 aware alternative to substr.
   *
   * Returns part of a string given a character offset (and optionally a length),
   * delegating to utf8_substr().
   *
   * @param   string   $str     String being processed
   * @param   integer  $offset  Number of UTF-8 characters offset (from left)
   * @param   integer  $length  Optional length in UTF-8 characters from offset; false (the default) omits it
   *
   * @return  mixed  string or FALSE if failure
   *
   * @see     http://www.php.net/substr
   * @since   1.0
   */
  public static function substr($str, $offset, $length = false)
  {
      if ($length !== false)
      {
          return utf8_substr($str, $offset, $length);
      }

      // No length supplied: take everything from the offset onwards
      return utf8_substr($str, $offset);
  }
  /**
   * UTF-8 aware alternative to strtolower.
   *
   * Makes a string lowercase by delegating to utf8_strtolower().
   * Note: The concept of a character's "case" only exists in some alphabets
   * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
   * not exist in the Chinese alphabet, for example. See Unicode Standard
   * Annex #21: Case Mappings
   *
   * @param   string  $str  String being processed
   *
   * @return  mixed  Either string in lowercase or FALSE if UTF-8 invalid
   *
   * @see     http://www.php.net/strtolower
   * @since   1.0
   */
  public static function strtolower($str)
  {
      $lowered = utf8_strtolower($str);

      return $lowered;
  }
  /**
   * UTF-8 aware alternative to strtoupper.
   *
   * Makes a string uppercase by delegating to utf8_strtoupper().
   * Note: The concept of a character's "case" only exists in some alphabets
   * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
   * not exist in the Chinese alphabet, for example. See Unicode Standard
   * Annex #21: Case Mappings
   *
   * @param   string  $str  String being processed
   *
   * @return  mixed  Either string in uppercase or FALSE if UTF-8 invalid
   *
   * @see     http://www.php.net/strtoupper
   * @since   1.0
   */
  public static function strtoupper($str)
  {
      $uppered = utf8_strtoupper($str);

      return $uppered;
  }
  /**
   * UTF-8 aware alternative to strlen.
   *
   * Returns the number of characters in the string (NOT THE NUMBER OF BYTES)
   * by delegating to utf8_strlen().
   *
   * @param   string  $str  UTF-8 string.
   *
   * @return  integer  Number of UTF-8 characters in string.
   *
   * @see     http://www.php.net/strlen
   * @since   1.0
   */
  public static function strlen($str)
  {
      $characters = utf8_strlen($str);

      return $characters;
  }
  /**
   * UTF-8 aware alternative to str_ireplace.
   *
   * Case-insensitive version of str_replace; arguments are forwarded to utf8_ireplace()
   * in the same order as str_ireplace() takes them.
   *
   * @param   string   $search   String to search for
   * @param   string   $replace  Replacement string
   * @param   string   $str      String in which the replacement is performed
   * @param   integer  $count    Optional count value forwarded to utf8_ireplace(). It is received by value,
   *                             so nothing is written back to the caller. Null or false means "not supplied".
   *
   * @return  string  UTF-8 String
   *
   * @see     http://www.php.net/str_ireplace
   * @since   1.0
   */
  public static function str_ireplace($search, $replace, $str, $count = null)
  {
      jimport('phputf8.str_ireplace');

      // The default is null, so checking for false alone never detected an omitted $count
      if ($count === null || $count === false)
      {
          return utf8_ireplace($search, $replace, $str);
      }

      return utf8_ireplace($search, $replace, $str, $count);
  }
  /**
   * UTF-8 aware alternative to str_split.
   *
   * Converts a string to an array by delegating to utf8_str_split().
   *
   * @param   string   $str        UTF-8 encoded string to process
   * @param   integer  $split_len  Number of characters to split string by
   *
   * @return  array
   *
   * @see     http://www.php.net/str_split
   * @since   1.0
   */
  public static function str_split($str, $split_len = 1)
  {
      // Make sure the library function is available before calling it
      jimport('phputf8.str_split');

      $chunks = utf8_str_split($str, $split_len);

      return $chunks;
  }
  /**
   * Switches the collation locale and works out which encoding it uses.
   *
   * Calls setlocale(LC_COLLATE, $locale); if that fails, the name of the locale that was
   * already active is inspected instead.
   * NOTE: the locale set here is not restored afterwards, so the change persists for
   * later code that depends on LC_COLLATE.
   *
   * @param   mixed  $locale  The locale to pass to setlocale()
   *
   * @return  string  'UTF-8', 'CP<digits>' taken from a numeric code page suffix, or 'nonrecodable'
   *
   * @since   1.0
   */
  private static function collationEncoding($locale)
  {
      // Get current locale, used as the fallback name when the switch fails
      $previous = setlocale(LC_COLLATE, 0);
      $active = setlocale(LC_COLLATE, $locale);

      if (!$active)
      {
          $active = $previous;
      }

      // A name such as xx_YY.1252 that is not UTF-8: derive the code page from the numeric suffix
      if (!stristr($active, 'UTF-8') && stristr($active, '_') && preg_match('~\.(\d+)$~', $active, $m))
      {
          return 'CP' . $m[1];
      }

      if (stristr($active, 'UTF-8') || stristr($active, 'utf8'))
      {
          return 'UTF-8';
      }

      return 'nonrecodable';
  }

  /**
   * UTF-8/LOCALE aware alternative to strcasecmp.
   *
   * A case insensitive string comparison. Without a locale the comparison is delegated to
   * utf8_strcasecmp(); with a locale both strings are lowercased and compared with strcoll(),
   * transcoding them first when the locale uses a code page other than UTF-8.
   *
   * @param   string  $str1    string 1 to compare
   * @param   string  $str2    string 2 to compare
   * @param   mixed   $locale  The locale used by strcoll or false to use classical comparison
   *
   * @return  integer   < 0 if str1 is less than str2; > 0 if str1 is greater than str2, and 0 if they are equal.
   *
   * @see     http://www.php.net/strcasecmp
   * @see     http://www.php.net/strcoll
   * @see     http://www.php.net/setlocale
   * @since   1.0
   */
  public static function strcasecmp($str1, $str2, $locale = false)
  {
      if (!$locale)
      {
          return utf8_strcasecmp($str1, $str2);
      }

      $encoding = self::collationEncoding($locale);

      // If the locale is UTF-8, or its encoding is something unrecognised, don't recode
      if ($encoding == 'UTF-8' || $encoding == 'nonrecodable')
      {
          return strcoll(utf8_strtolower($str1), utf8_strtolower($str2));
      }

      return strcoll(
          self::transcode(utf8_strtolower($str1), 'UTF-8', $encoding),
          self::transcode(utf8_strtolower($str2), 'UTF-8', $encoding)
      );
  }

  /**
   * UTF-8/LOCALE aware alternative to strcmp.
   *
   * A case sensitive string comparison. Without a locale the native strcmp() is used; with a
   * locale the strings are compared with strcoll(), transcoding them first when the locale
   * uses a code page other than UTF-8.
   *
   * @param   string  $str1    string 1 to compare
   * @param   string  $str2    string 2 to compare
   * @param   mixed   $locale  The locale used by strcoll or false to use classical comparison
   *
   * @return  integer  < 0 if str1 is less than str2; > 0 if str1 is greater than str2, and 0 if they are equal.
   *
   * @see     http://www.php.net/strcmp
   * @see     http://www.php.net/strcoll
   * @see     http://www.php.net/setlocale
   * @since   1.0
   */
  public static function strcmp($str1, $str2, $locale = false)
  {
      if (!$locale)
      {
          return strcmp($str1, $str2);
      }

      $encoding = self::collationEncoding($locale);

      // If the locale is UTF-8, or its encoding is something unrecognised, don't recode
      if ($encoding == 'UTF-8' || $encoding == 'nonrecodable')
      {
          return strcoll($str1, $str2);
      }

      return strcoll(self::transcode($str1, 'UTF-8', $encoding), self::transcode($str2, 'UTF-8', $encoding));
  }
  /**
   * UTF-8 aware alternative to strcspn.
   *
   * Finds the length of the initial segment not matching the mask by delegating to utf8_strcspn().
   * Optional arguments that were not supplied are left out of the call rather than being
   * forwarded as null.
   *
   * @param   string   $str     The string to process
   * @param   string   $mask    The mask
   * @param   integer  $start   Optional starting character position (in characters); null or false means "not supplied"
   * @param   integer  $length  Optional length; null or false means "not supplied"
   *
   * @return  integer  The length of the initial segment of $str which does not contain any of the characters in $mask
   *
   * @see     http://www.php.net/strcspn
   * @since   1.0
   */
  public static function strcspn($str, $mask, $start = null, $length = null)
  {
      jimport('phputf8.strcspn');

      // The defaults are null, so checking for false alone never detected an omitted argument
      $noStart  = ($start === null || $start === false);
      $noLength = ($length === null || $length === false);

      if ($noStart && $noLength)
      {
          return utf8_strcspn($str, $mask);
      }

      if ($noLength)
      {
          return utf8_strcspn($str, $mask, $start);
      }

      return utf8_strcspn($str, $mask, $start, $length);
  }
  /**
   * UTF-8 aware alternative to stristr.
   *
   * Returns all of the haystack from the first occurrence of the needle to the end.
   * Needle and haystack are examined in a case-insensitive manner. Delegates to utf8_stristr().
   *
   * @param   string  $str     The haystack
   * @param   string  $search  The needle
   *
   * @return  string  the sub string
   *
   * @see     http://www.php.net/stristr
   * @since   1.0
   */
  public static function stristr($str, $search)
  {
      // Make sure the library function is available before calling it
      jimport('phputf8.stristr');

      $tail = utf8_stristr($str, $search);

      return $tail;
  }
  /**
   * UTF-8 aware alternative to strrev.
   *
   * Reverses a string by delegating to utf8_strrev().
   *
   * @param   string  $str  String to be reversed
   *
   * @return  string   The string in reverse character order
   *
   * @see     http://www.php.net/strrev
   * @since   1.0
   */
  public static function strrev($str)
  {
      // Make sure the library function is available before calling it
      jimport('phputf8.strrev');

      $reversed = utf8_strrev($str);

      return $reversed;
  }
  /**
   * UTF-8 aware alternative to strspn.
   *
   * Finds the length of the initial segment matching the mask by delegating to utf8_strspn().
   * Optional arguments left at null are not forwarded.
   *
   * @param   string   $str     The haystack
   * @param   string   $mask    The mask
   * @param   integer  $start   Start optional
   * @param   integer  $length  Length optional
   *
   * @return  integer
   *
   * @see     http://www.php.net/strspn
   * @since   1.0
   */
  public static function strspn($str, $mask, $start = null, $length = null)
  {
      jimport('phputf8.strspn');

      // A length was given: forward everything, including a possibly null start
      if ($length !== null)
      {
          return utf8_strspn($str, $mask, $start, $length);
      }

      // Only a start position was given
      if ($start !== null)
      {
          return utf8_strspn($str, $mask, $start);
      }

      return utf8_strspn($str, $mask);
  }
  /**
   * UTF-8 aware substr_replace.
   *
   * Replaces text within a portion of a string by delegating to utf8_substr_replace().
   * When no length is supplied the argument is left out of the call.
   *
   * @param   string   $str     The haystack
   * @param   string   $repl    The replacement string
   * @param   integer  $start   Start
   * @param   integer  $length  Length (optional); null or false means "not supplied"
   *
   * @return  string
   *
   * @see     http://www.php.net/substr_replace
   * @since   1.0
   */
  public static function substr_replace($str, $repl, $start, $length = null)
  {
      // utf8_substr_replace() is loaded by the library loader

      // The default is null, so checking for false alone never detected an omitted $length
      if ($length === null || $length === false)
      {
          return utf8_substr_replace($str, $repl, $start);
      }

      return utf8_substr_replace($str, $repl, $start, $length);
  }
  /**
   * UTF-8 aware replacement for ltrim().
   *
   * Strips whitespace (or other characters) from the beginning of a string.
   * You only need to use this if you are supplying the charlist
   * optional arg and it contains UTF-8 characters. Otherwise ltrim will
   * work normally on a UTF-8 string.
   *
   * An empty charlist other than false returns the string untouched.
   *
   * @param   string  $str       The string to be trimmed
   * @param   string  $charlist  The optional charlist of additional characters to trim
   *
   * @return  string  The trimmed string
   *
   * @see     http://www.php.net/ltrim
   * @since   1.0
   */
  public static function ltrim($str, $charlist = false)
  {
      $useDefault = ($charlist === false);

      // An explicitly empty charlist means there is nothing to strip
      if (!$useDefault && empty($charlist))
      {
          return $str;
      }

      jimport('phputf8.trim');

      return $useDefault ? utf8_ltrim($str) : utf8_ltrim($str, $charlist);
  }
  /**
   * UTF-8 aware replacement for rtrim().
   *
   * Strips whitespace (or other characters) from the end of a string.
   * You only need to use this if you are supplying the charlist
   * optional arg and it contains UTF-8 characters. Otherwise rtrim will
   * work normally on a UTF-8 string.
   *
   * An empty charlist other than false returns the string untouched.
   *
   * @param   string  $str       The string to be trimmed
   * @param   string  $charlist  The optional charlist of additional characters to trim
   *
   * @return  string  The trimmed string
   *
   * @see     http://www.php.net/rtrim
   * @since   1.0
   */
  public static function rtrim($str, $charlist = false)
  {
      $useDefault = ($charlist === false);

      // An explicitly empty charlist means there is nothing to strip
      if (!$useDefault && empty($charlist))
      {
          return $str;
      }

      jimport('phputf8.trim');

      return $useDefault ? utf8_rtrim($str) : utf8_rtrim($str, $charlist);
  }
  /**
   * UTF-8 aware replacement for trim().
   *
   * Strips whitespace (or other characters) from the beginning and end of a string.
   * Note: you only need to use this if you are supplying the charlist
   * optional arg and it contains UTF-8 characters. Otherwise trim will
   * work normally on a UTF-8 string.
   *
   * An empty charlist other than false returns the string untouched.
   *
   * @param   string  $str       The string to be trimmed
   * @param   string  $charlist  The optional charlist of additional characters to trim
   *
   * @return  string  The trimmed string
   *
   * @see     http://www.php.net/trim
   * @since   1.0
   */
  public static function trim($str, $charlist = false)
  {
      $useDefault = ($charlist === false);

      // An explicitly empty charlist means there is nothing to strip
      if (!$useDefault && empty($charlist))
      {
          return $str;
      }

      jimport('phputf8.trim');

      return $useDefault ? utf8_trim($str) : utf8_trim($str, $charlist);
  }
  /**
   * UTF-8 aware alternative to ucfirst.
   *
   * Makes a string's first character uppercase, or the first character of every word
   * when a delimiter is given.
   *
   * @param   string  $str           String to be processed
   * @param   string  $delimiter     The words delimiter (null means do not split the string)
   * @param   string  $newDelimiter  The new words delimiter (null means equal to $delimiter)
   *
   * @return  string  If $delimiter is null, the string with its first character as upper case (if applicable);
   *                  otherwise the string is split on the delimiter, ucfirst is applied to each word
   *                  and the words are joined again with the new delimiter
   *
   * @see     http://www.php.net/ucfirst
   * @since   1.0
   */
  public static function ucfirst($str, $delimiter = null, $newDelimiter = null)
  {
      jimport('phputf8.ucfirst');

      // No delimiter: only the very first character is affected
      if ($delimiter === null)
      {
          return utf8_ucfirst($str);
      }

      $glue  = ($newDelimiter === null) ? $delimiter : $newDelimiter;
      $words = explode($delimiter, $str);

      return implode($glue, array_map('utf8_ucfirst', $words));
  }
  /**
   * UTF-8 aware alternative to ucwords.
   *
   * Uppercases the first character of each word in a string by delegating to utf8_ucwords().
   *
   * @param   string  $str  String to be processed
   *
   * @return  string  String with first char of each word uppercase
   *
   * @see     http://www.php.net/ucwords
   * @since   1.0
   */
  public static function ucwords($str)
  {
      // Make sure the library function is available before calling it
      jimport('phputf8.ucwords');

      $capitalised = utf8_ucwords($str);

      return $capitalised;
  }
  /**
   * Transcodes a string from one encoding to another using iconv.
   *
   * The ignore/transliterate suffix appended to the target encoding depends on the
   * iconv implementation in use (ICONV_IMPL); for glibc, errors from iconv are suppressed.
   *
   * @param   string  $source         The string to transcode.
   * @param   string  $from_encoding  The source encoding.
   * @param   string  $to_encoding    The target encoding.
   *
   * @return  mixed  The transcoded string, or null if the source was not a string.
   *
   * @link    https://bugs.php.net/bug.php?id=48147
   *
   * @since   1.0
   */
  public static function transcode($source, $from_encoding, $to_encoding)
  {
      // Only strings can be transcoded
      if (!is_string($source))
      {
          return null;
      }

      if (ICONV_IMPL == 'glibc')
      {
          return @iconv($from_encoding, $to_encoding . '//TRANSLIT,IGNORE', $source);
      }

      // libiconv and any other implementation
      return iconv($from_encoding, $to_encoding . '//IGNORE//TRANSLIT', $source);
  }
  /**
   * Tests a string as to whether it's valid UTF-8 and supported by the Unicode standard.
   *
   * The string is walked octet by octet. Rejected are: illegal lead octets, missing or
   * unexpected continuation octets, non-shortest forms, five and six octet sequences,
   * surrogates, code points above 0x10FFFF, and a string that ends in the middle of a
   * multi-octet sequence.
   *
   * Note: this function has been modified to simply return true or false.
   *
   * @param   string  $str  UTF-8 encoded string.
   *
   * @return  boolean  true if valid
   *
   * @author  <hsivonen@iki.fi>
   * @see     http://hsivonen.iki.fi/php-utf8/
   * @see     compliant
   * @since   1.0
   */
  public static function valid($str)
  {
      // Number of continuation octets still expected before the current character is complete
      $mState = 0;

      // Code point accumulated so far
      $mUcs4 = 0;

      // Total number of octets in the current sequence
      $mBytes = 1;

      $len = strlen($str);

      for ($i = 0; $i < $len; $i++)
      {
          // Square brackets: the curly brace offset syntax is a parse error as of PHP 8.0
          $in = ord($str[$i]);

          if ($mState == 0)
          {
              // Start of a character: either US-ASCII or the lead octet of a multi-octet sequence
              if (0 == (0x80 & $in))
              {
                  // US-ASCII, pass straight through.
                  $mBytes = 1;
              }
              elseif (0xC0 == (0xE0 & $in))
              {
                  // First octet of 2 octet sequence
                  $mUcs4 = ($in & 0x1F) << 6;
                  $mState = 1;
                  $mBytes = 2;
              }
              elseif (0xE0 == (0xF0 & $in))
              {
                  // First octet of 3 octet sequence
                  $mUcs4 = ($in & 0x0F) << 12;
                  $mState = 2;
                  $mBytes = 3;
              }
              elseif (0xF0 == (0xF8 & $in))
              {
                  // First octet of 4 octet sequence
                  $mUcs4 = ($in & 0x07) << 18;
                  $mState = 3;
                  $mBytes = 4;
              }
              elseif (0xF8 == (0xFC & $in))
              {
                  /*
                   * First octet of 5 octet sequence.
                   *
                   * This is illegal because the encoded codepoint must be either
                   * (a) not the shortest form or
                   * (b) outside the Unicode range of 0-0x10FFFF.
                   * Rather than trying to resynchronize, we carry on until the end
                   * of the sequence and let the later error handling code catch it.
                   */
                  $mUcs4 = ($in & 0x03) << 24;
                  $mState = 4;
                  $mBytes = 5;
              }
              elseif (0xFC == (0xFE & $in))
              {
                  // First octet of 6 octet sequence, see comments for 5 octet sequence.
                  $mUcs4 = ($in & 1) << 30;
                  $mState = 5;
                  $mBytes = 6;
              }
              else
              {
                  // Neither US-ASCII nor a legal first octet of a multi-octet sequence
                  return false;
              }
          }
          elseif (0x80 == (0xC0 & $in))
          {
              // Legal continuation: merge its six payload bits into the code point
              $mUcs4 |= ($in & 0x0000003F) << (($mState - 1) * 6);

              if (0 == --$mState)
              {
                  // End of the multi-octet sequence: $mUcs4 now holds the complete code point
                  if (((2 == $mBytes) && ($mUcs4 < 0x0080))  // From Unicode 3.1, non-shortest form is illegal
                      || ((3 == $mBytes) && ($mUcs4 < 0x0800))
                      || ((4 == $mBytes) && ($mUcs4 < 0x10000))
                      || (4 < $mBytes)                        // Five and six octet sequences are illegal
                      || (($mUcs4 & 0xFFFFF800) == 0xD800)    // From Unicode 3.2, surrogate characters are illegal
                      || ($mUcs4 > 0x10FFFF))                 // Codepoints outside the Unicode range are illegal
                  {
                      return false;
                  }

                  // Reset for the next character
                  $mState = 0;
                  $mUcs4 = 0;
                  $mBytes = 1;
              }
          }
          else
          {
              // A continuation octet was expected but something else arrived
              return false;
          }
      }

      // Still waiting for continuation octets: the string was cut off mid-sequence
      return ($mState == 0);
  }
  /**
   * Tests whether a string complies as UTF-8.
   *
   * This relies on the PCRE /u modifier instead of inspecting every octet, so it is
   * intended as the quicker, less strict counterpart of valid(): it may let through
   * five and six octet UTF-8 sequences, which are not supported by Unicode and so
   * cannot be displayed correctly in a browser. If you use it to validate user input,
   * you accept the risk that attackers can inject such sequences (which may or may
   * not be a significant risk, depending on what you are doing).
   *
   * @param   string  $str  UTF-8 string to check
   *
   * @return  boolean  TRUE if string is valid UTF-8
   *
   * @see     valid
   * @see     http://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805
   * @since   1.0
   */
  public static function compliant($str)
  {
      /*
       * An empty string is compliant by definition. Otherwise: with the /u modifier,
       * matching even just the first character succeeds only when the subject is valid
       * UTF-8; if it is somehow invalid nothing at all will match, even if the string
       * contains some valid sequences.
       */
      return strlen($str) == 0 || preg_match('/^.{1}/us', $str, $ar) == 1;
  }
  871. }