PageRenderTime 45ms CodeModel.GetById 10ms RepoModel.GetById 1ms app.codeStats 0ms

/libraries/joomla/string/string.php

http://github.com/joomla/joomla-platform
PHP | 944 lines | 443 code | 66 blank | 435 comment | 76 complexity | f7fe206569e95e436e29fb7aa9f6f473 MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.1
  1. <?php
  2. /**
  3. * @package Joomla.Platform
  4. * @subpackage String
  5. *
  6. * @copyright Copyright (C) 2005 - 2013 Open Source Matters, Inc. All rights reserved.
  7. * @license GNU General Public License version 2 or later; see LICENSE
  8. */
  9. defined('JPATH_PLATFORM') or die;
  // PHP mbstring and iconv local configuration.
  //
  // From PHP 5.6 onwards the per-extension encoding settings
  // (mbstring.internal_encoding, mbstring.http_*, and iconv_set_encoding()
  // with the internal/input/output types) are deprecated in favour of the
  // single "default_charset" ini setting, so that one is used when available.
  $jstringUseDefaultCharset = version_compare(PHP_VERSION, '5.6', '>=');

  // Configure mbstring only when the extension is actually loaded (no attempt is made to load it).
  if (extension_loaded('mbstring'))
  {
      // Make sure to suppress the output in case ini_set is disabled
      if ($jstringUseDefaultCharset)
      {
          @ini_set('default_charset', 'UTF-8');
      }
      else
      {
          @ini_set('mbstring.internal_encoding', 'UTF-8');
          @ini_set('mbstring.http_input', 'UTF-8');
          @ini_set('mbstring.http_output', 'UTF-8');
      }
  }

  // Same for iconv
  if (function_exists('iconv'))
  {
      if ($jstringUseDefaultCharset)
      {
          @ini_set('default_charset', 'UTF-8');
      }
      else
      {
          // These are settings that can be set inside code
          iconv_set_encoding("internal_encoding", "UTF-8");
          iconv_set_encoding("input_encoding", "UTF-8");
          iconv_set_encoding("output_encoding", "UTF-8");
      }
  }

  unset($jstringUseDefaultCharset);
  27. /**
  28. * Include the utf8 package
  29. */
  30. jimport('phputf8.utf8');
  31. jimport('phputf8.strcasecmp');
  32. /**
  33. * String handling class for utf-8 data
  34. * Wraps the phputf8 library
  35. * All functions assume the validity of utf-8 strings.
  36. *
  37. * @package Joomla.Platform
  38. * @subpackage String
  39. * @since 11.1
  40. */
  41. abstract class JString
  42. {
  /**
   * Increment styles, keyed by style name.
   *
   * Each style is a two element list read by increment():
   *  - index 0: the regular expression used to detect an existing trailing number, or a
   *             pair array(search pattern, replace pattern);
   *  - index 1: the sprintf format used to write the number, or a pair
   *             array(format used when appending, format used when replacing).
   *
   * @var    array
   * @since  11.3
   */
  protected static $incrementStyles = array(
      'dash' => array(
          '#-(\d+)$#',
          '-%d'
      ),
      'default' => array(
          array('#\((\d+)\)$#', '#\(\d+\)$#'),
          array(' (%d)', '(%d)'),
      ),
  );
  /**
   * Bumps (or appends) the trailing number of a label.
   *
   * Handy for generating distinct names when duplicating objects. Supported styles:
   *
   * default: "Label" becomes "Label (2)"
   * dash:    "Label" becomes "Label-2"
   *
   * An unknown style name falls back to the 'default' style.
   *
   * @param   string   $string  The source string.
   * @param   string   $style   The style (default|dash).
   * @param   integer  $n       If supplied (non-empty), this number is used for the copy, otherwise it is the 'next' number.
   *
   * @return  string  The incremented string.
   *
   * @since   11.3
   */
  public static function increment($string, $style = 'default', $n = 0)
  {
      $spec = isset(self::$incrementStyles[$style]) ? self::$incrementStyles[$style] : self::$incrementStyles['default'];

      $patterns = $spec[0];
      $formats  = $spec[1];

      // A style may give one pattern for both detection and replacement, or a (search, replace) pair.
      $rxSearch  = is_array($patterns) ? $patterns[0] : $patterns;
      $rxReplace = is_array($patterns) ? $patterns[1] : $patterns;

      // Likewise one sprintf format for both cases, or a (new, existing) pair.
      $newFormat = is_array($formats) ? $formats[0] : $formats;
      $oldFormat = is_array($formats) ? $formats[1] : $formats;

      // No trailing number yet: append one, starting at 2 unless the caller chose a number.
      if (!preg_match($rxSearch, $string, $matches))
      {
          $n = empty($n) ? 2 : $n;

          return $string . sprintf($newFormat, $n);
      }

      // Trailing number found: replace it with the requested number or its successor.
      $n = empty($n) ? ($matches[1] + 1) : $n;

      return preg_replace($rxReplace, sprintf($oldFormat, $n), $string);
  }
  /**
   * UTF-8 aware alternative to strpos; delegates to utf8_strpos().
   *
   * Finds the position of the first occurrence of a string.
   *
   * @param   string   $str     String being examined
   * @param   string   $search  String being searched for
   * @param   integer  $offset  Optional position from which the search should start; false means "not given"
   *
   * @return  mixed  Number of characters before the first match or FALSE on failure
   *
   * @see     http://www.php.net/strpos
   * @since   11.1
   */
  public static function strpos($str, $search, $offset = false)
  {
      // Only forward the offset when the caller actually supplied one.
      return ($offset === false)
          ? utf8_strpos($str, $search)
          : utf8_strpos($str, $search, $offset);
  }
  /**
   * UTF-8 aware alternative to strrpos; delegates to utf8_strrpos().
   *
   * Finds the position of the last occurrence of a string.
   *
   * @param   string   $str     String being examined.
   * @param   string   $search  String being searched for.
   * @param   integer  $offset  Offset from the left of the string.
   *
   * @return  mixed  Number of characters before the last match or false on failure
   *
   * @see     http://www.php.net/strrpos
   * @since   11.1
   */
  public static function strrpos($str, $search, $offset = 0)
  {
      $position = utf8_strrpos($str, $search, $offset);

      return $position;
  }
  /**
   * UTF-8 aware alternative to substr; delegates to utf8_substr().
   *
   * Returns part of a string given a character offset and, optionally, a length.
   *
   * @param   string   $str     String being processed
   * @param   integer  $offset  Number of UTF-8 characters offset (from left)
   * @param   integer  $length  Optional length in UTF-8 characters from offset; false means "not given"
   *
   * @return  mixed  string or FALSE if failure
   *
   * @see     http://www.php.net/substr
   * @since   11.1
   */
  public static function substr($str, $offset, $length = false)
  {
      // Only forward the length when the caller actually supplied one.
      return ($length === false)
          ? utf8_substr($str, $offset)
          : utf8_substr($str, $offset, $length);
  }
  /**
   * UTF-8 aware alternative to strtolower; delegates to utf8_strtolower().
   *
   * Makes a string lowercase.
   * Note: The concept of a character's "case" only exists in some alphabets
   * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
   * not exist in the Chinese alphabet, for example. See Unicode Standard
   * Annex #21: Case Mappings
   *
   * @param   string  $str  String being processed
   *
   * @return  mixed  Either string in lowercase or FALSE if UTF-8 invalid
   *
   * @see     http://www.php.net/strtolower
   * @since   11.1
   */
  public static function strtolower($str)
  {
      $lowered = utf8_strtolower($str);

      return $lowered;
  }
  /**
   * UTF-8 aware alternative to strtoupper; delegates to utf8_strtoupper().
   *
   * Makes a string uppercase.
   * Note: The concept of a character's "case" only exists in some alphabets
   * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
   * not exist in the Chinese alphabet, for example. See Unicode Standard
   * Annex #21: Case Mappings
   *
   * @param   string  $str  String being processed
   *
   * @return  mixed  Either string in uppercase or FALSE if UTF-8 invalid
   *
   * @see     http://www.php.net/strtoupper
   * @since   11.1
   */
  public static function strtoupper($str)
  {
      $raised = utf8_strtoupper($str);

      return $raised;
  }
  /**
   * UTF-8 aware alternative to strlen; delegates to utf8_strlen().
   *
   * Returns the number of characters in the string (NOT THE NUMBER OF BYTES).
   *
   * @param   string  $str  UTF-8 string.
   *
   * @return  integer  Number of UTF-8 characters in string.
   *
   * @see     http://www.php.net/strlen
   * @since   11.1
   */
  public static function strlen($str)
  {
      $characters = utf8_strlen($str);

      return $characters;
  }
  /**
   * UTF-8 aware alternative to str_ireplace; delegates to utf8_ireplace().
   *
   * Case-insensitive version of str_replace.
   *
   * @param   string   $search   String to search for
   * @param   string   $replace  New string to replace the matches with
   * @param   string   $str      Existing string in which the replacement takes place
   * @param   integer  $count    Optional value forwarded to utf8_ireplace(). It is passed by value, so nothing
   *                             is reported back to the caller. NOTE(review): presumably a limit on the number
   *                             of replacements - confirm against phputf8.
   *
   * @return  string  UTF-8 String
   *
   * @see     http://www.php.net/str_ireplace
   * @since   11.1
   */
  public static function str_ireplace($search, $replace, $str, $count = null)
  {
      jimport('phputf8.str_ireplace');

      // Both null (the declared default) and false (the historic sentinel) mean "no count given".
      if ($count === null || $count === false)
      {
          return utf8_ireplace($search, $replace, $str);
      }

      return utf8_ireplace($search, $replace, $str, $count);
  }
  /**
   * UTF-8 aware alternative to str_split; delegates to utf8_str_split().
   *
   * Converts a string to an array of chunks.
   *
   * @param   string   $str        UTF-8 encoded string to process
   * @param   integer  $split_len  Number of characters to split string by
   *
   * @return  array
   *
   * @see     http://www.php.net/str_split
   * @since   11.1
   */
  public static function str_split($str, $split_len = 1)
  {
      // The phputf8 implementation is loaded on demand.
      jimport('phputf8.str_split');

      $chunks = utf8_str_split($str, $split_len);

      return $chunks;
  }
  /**
   * UTF-8/LOCALE aware alternative to strcasecmp.
   *
   * A case insensitive string comparison. Without a locale the comparison is delegated to
   * utf8_strcasecmp(); with a locale both strings are lowercased and compared with strcoll().
   * Note that the LC_COLLATE locale is changed by this call and is not restored afterwards.
   *
   * @param   string  $str1    string 1 to compare
   * @param   string  $str2    string 2 to compare
   * @param   mixed   $locale  The locale used by strcoll or false to use classical comparison
   *
   * @return  integer   < 0 if str1 is less than str2; > 0 if str1 is greater than str2, and 0 if they are equal.
   *
   * @see     http://www.php.net/strcasecmp
   * @see     http://www.php.net/strcoll
   * @see     http://www.php.net/setlocale
   * @since   11.1
   */
  public static function strcasecmp($str1, $str2, $locale = false)
  {
      // Classical comparison when no locale was requested.
      if (!$locale)
      {
          return utf8_strcasecmp($str1, $str2);
      }

      // Remember the current locale so it can stand in if the requested one cannot be set.
      $previousLocale = setlocale(LC_COLLATE, 0);
      $locale = setlocale(LC_COLLATE, $locale);

      if (!$locale)
      {
          $locale = $previousLocale;
      }

      // Work out which encoding the active locale implies.
      $namesUtf8 = stristr($locale, 'UTF-8');

      if (!$namesUtf8 && stristr($locale, '_') && preg_match('~\.(\d+)$~', $locale, $m))
      {
          // Locale ends in a numeric code page, e.g. ".1252".
          $encoding = 'CP' . $m[1];
      }
      elseif ($namesUtf8 || stristr($locale, 'utf8'))
      {
          $encoding = 'UTF-8';
      }
      else
      {
          $encoding = 'nonrecodable';
      }

      $left  = utf8_strtolower($str1);
      $right = utf8_strtolower($str2);

      // Recode only when the locale uses a known, non UTF-8 code page.
      $needsRecoding = !($encoding == 'UTF-8' || $encoding == 'nonrecodable');

      if ($needsRecoding)
      {
          $left  = self::transcode($left, 'UTF-8', $encoding);
          $right = self::transcode($right, 'UTF-8', $encoding);
      }

      return strcoll($left, $right);
  }
  /**
   * UTF-8/LOCALE aware alternative to strcmp.
   *
   * A case sensitive string comparison. Without a locale this is plain strcmp(); with a locale
   * the strings are compared with strcoll(), recoded first when the locale uses a code page.
   * Note that the LC_COLLATE locale is changed by this call and is not restored afterwards.
   *
   * @param   string  $str1    string 1 to compare
   * @param   string  $str2    string 2 to compare
   * @param   mixed   $locale  The locale used by strcoll or false to use classical comparison
   *
   * @return  integer  < 0 if str1 is less than str2; > 0 if str1 is greater than str2, and 0 if they are equal.
   *
   * @see     http://www.php.net/strcmp
   * @see     http://www.php.net/strcoll
   * @see     http://www.php.net/setlocale
   * @since   11.1
   */
  public static function strcmp($str1, $str2, $locale = false)
  {
      // Classical comparison when no locale was requested.
      if (!$locale)
      {
          return strcmp($str1, $str2);
      }

      // Remember the current locale so it can stand in if the requested one cannot be set.
      $previousLocale = setlocale(LC_COLLATE, 0);
      $locale = setlocale(LC_COLLATE, $locale);

      if (!$locale)
      {
          $locale = $previousLocale;
      }

      // Work out which encoding the active locale implies.
      $namesUtf8 = stristr($locale, 'UTF-8');

      if (!$namesUtf8 && stristr($locale, '_') && preg_match('~\.(\d+)$~', $locale, $m))
      {
          // Locale ends in a numeric code page, e.g. ".1252".
          $encoding = 'CP' . $m[1];
      }
      elseif ($namesUtf8 || stristr($locale, 'utf8'))
      {
          $encoding = 'UTF-8';
      }
      else
      {
          $encoding = 'nonrecodable';
      }

      // Recode only when the locale uses a known, non UTF-8 code page.
      $needsRecoding = !($encoding == 'UTF-8' || $encoding == 'nonrecodable');

      if ($needsRecoding)
      {
          $str1 = self::transcode($str1, 'UTF-8', $encoding);
          $str2 = self::transcode($str2, 'UTF-8', $encoding);
      }

      return strcoll($str1, $str2);
  }
  /**
   * UTF-8 aware alternative to strcspn; delegates to utf8_strcspn().
   *
   * Finds the length of the initial segment not matching the mask.
   *
   * @param   string   $str     The string to process
   * @param   string   $mask    The mask
   * @param   integer  $start   Optional starting character position (in characters); null or false means "not given"
   * @param   integer  $length  Optional length; null or false means "not given"
   *
   * @return  integer  The length of the initial segment of $str which does not contain any of the characters in $mask
   *
   * @see     http://www.php.net/strcspn
   * @since   11.1
   */
  public static function strcspn($str, $mask, $start = null, $length = null)
  {
      jimport('phputf8.strcspn');

      // Accept both null (the declared default) and false (the historic sentinel) as "not given".
      $startGiven  = ($start !== null && $start !== false);
      $lengthGiven = ($length !== null && $length !== false);

      if (!$startGiven && !$lengthGiven)
      {
          return utf8_strcspn($str, $mask);
      }

      if (!$lengthGiven)
      {
          return utf8_strcspn($str, $mask, $start);
      }

      return utf8_strcspn($str, $mask, $start, $length);
  }
  /**
   * UTF-8 aware alternative to stristr; delegates to utf8_stristr().
   *
   * Returns all of the haystack from the first occurrence of the needle to the end.
   * Needle and haystack are examined in a case-insensitive manner.
   *
   * @param   string  $str     The haystack
   * @param   string  $search  The needle
   *
   * @return  string the sub string
   *
   * @see     http://www.php.net/stristr
   * @since   11.1
   */
  public static function stristr($str, $search)
  {
      // The phputf8 implementation is loaded on demand.
      jimport('phputf8.stristr');

      $tail = utf8_stristr($str, $search);

      return $tail;
  }
  /**
   * UTF-8 aware alternative to strrev; delegates to utf8_strrev().
   *
   * Reverses a string.
   *
   * @param   string  $str  String to be reversed
   *
   * @return  string   The string in reverse character order
   *
   * @see     http://www.php.net/strrev
   * @since   11.1
   */
  public static function strrev($str)
  {
      // The phputf8 implementation is loaded on demand.
      jimport('phputf8.strrev');

      $reversed = utf8_strrev($str);

      return $reversed;
  }
  /**
   * UTF-8 aware alternative to strspn; delegates to utf8_strspn().
   *
   * Finds the length of the initial segment matching the mask.
   *
   * @param   string   $str     The haystack
   * @param   string   $mask    The mask
   * @param   integer  $start   Optional start; null means "not given"
   * @param   integer  $length  Optional length; null means "not given"
   *
   * @return  integer
   *
   * @see     http://www.php.net/strspn
   * @since   11.1
   */
  public static function strspn($str, $mask, $start = null, $length = null)
  {
      jimport('phputf8.strspn');

      // A supplied length always means the full four-argument call.
      if ($length !== null)
      {
          return utf8_strspn($str, $mask, $start, $length);
      }

      // No length: forward the start only when one was given.
      return ($start === null)
          ? utf8_strspn($str, $mask)
          : utf8_strspn($str, $mask, $start);
  }
  /**
   * UTF-8 aware substr_replace; delegates to utf8_substr_replace().
   *
   * Replaces text within a portion of a string.
   *
   * @param   string   $str     The haystack
   * @param   string   $repl    The replacement string
   * @param   integer  $start   Start
   * @param   integer  $length  Length (optional); null or false means "not given"
   *
   * @return  string
   *
   * @see     http://www.php.net/substr_replace
   * @since   11.1
   */
  public static function substr_replace($str, $repl, $start, $length = null)
  {
      // Loaded by library loader
      // NOTE(review): unlike the sibling wrappers no jimport() is issued here - confirm the loader really provides it.

      // Both null (the declared default) and false (the historic sentinel) mean "no length given".
      if ($length === null || $length === false)
      {
          return utf8_substr_replace($str, $repl, $start);
      }

      return utf8_substr_replace($str, $repl, $start, $length);
  }
  /**
   * UTF-8 aware replacement for ltrim(); delegates to utf8_ltrim().
   *
   * Strips whitespace (or other characters) from the beginning of a string.
   * You only need to use this if you are supplying the charlist
   * optional arg and it contains UTF-8 characters. Otherwise ltrim will
   * work normally on a UTF-8 string.
   *
   * An empty charlist (for example '' or null) returns the string unchanged; a charlist
   * of '0' is a legitimate request to strip zeros and is honoured.
   *
   * @param   string  $str       The string to be trimmed
   * @param   string  $charlist  The optional charlist of additional characters to trim; false means "not given"
   *
   * @return  string  The trimmed string
   *
   * @see     http://www.php.net/ltrim
   * @since   11.1
   */
  public static function ltrim($str, $charlist = false)
  {
      // empty() also treats '0' / 0 as empty, which would make it impossible to trim zeros.
      $isZero = ($charlist === '0' || $charlist === 0);

      if ($charlist !== false && !$isZero && empty($charlist))
      {
          return $str;
      }

      jimport('phputf8.trim');

      if ($charlist === false)
      {
          return utf8_ltrim($str);
      }

      return utf8_ltrim($str, $charlist);
  }
  /**
   * UTF-8 aware replacement for rtrim(); delegates to utf8_rtrim().
   *
   * Strips whitespace (or other characters) from the end of a string.
   * You only need to use this if you are supplying the charlist
   * optional arg and it contains UTF-8 characters. Otherwise rtrim will
   * work normally on a UTF-8 string.
   *
   * An empty charlist (for example '' or null) returns the string unchanged; a charlist
   * of '0' is a legitimate request to strip zeros and is honoured.
   *
   * @param   string  $str       The string to be trimmed
   * @param   string  $charlist  The optional charlist of additional characters to trim; false means "not given"
   *
   * @return  string  The trimmed string
   *
   * @see     http://www.php.net/rtrim
   * @since   11.1
   */
  public static function rtrim($str, $charlist = false)
  {
      // empty() also treats '0' / 0 as empty, which would make it impossible to trim zeros.
      $isZero = ($charlist === '0' || $charlist === 0);

      if ($charlist !== false && !$isZero && empty($charlist))
      {
          return $str;
      }

      jimport('phputf8.trim');

      if ($charlist === false)
      {
          return utf8_rtrim($str);
      }

      return utf8_rtrim($str, $charlist);
  }
  /**
   * UTF-8 aware replacement for trim(); delegates to utf8_trim().
   *
   * Strips whitespace (or other characters) from the beginning and end of a string.
   * Note: you only need to use this if you are supplying the charlist
   * optional arg and it contains UTF-8 characters. Otherwise trim will
   * work normally on a UTF-8 string.
   *
   * An empty charlist (for example '' or null) returns the string unchanged; a charlist
   * of '0' is a legitimate request to strip zeros and is honoured.
   *
   * @param   string  $str       The string to be trimmed
   * @param   string  $charlist  The optional charlist of additional characters to trim; false means "not given"
   *
   * @return  string  The trimmed string
   *
   * @see     http://www.php.net/trim
   * @since   11.1
   */
  public static function trim($str, $charlist = false)
  {
      // empty() also treats '0' / 0 as empty, which would make it impossible to trim zeros.
      $isZero = ($charlist === '0' || $charlist === 0);

      if ($charlist !== false && !$isZero && empty($charlist))
      {
          return $str;
      }

      jimport('phputf8.trim');

      if ($charlist === false)
      {
          return utf8_trim($str);
      }

      return utf8_trim($str, $charlist);
  }
  /**
   * UTF-8 aware alternative to ucfirst; delegates to utf8_ucfirst().
   *
   * Uppercases the first character of a string, or of every word when a delimiter is given.
   *
   * @param   string  $str           String to be processed
   * @param   string  $delimiter     The words delimiter (null means do not split the string)
   * @param   string  $newDelimiter  The new words delimiter (null means equal to $delimiter)
   *
   * @return  string  If $delimiter is null, return the string with first character as upper case (if applicable)
   *                  else consider the string of words separated by the delimiter, apply the ucfirst to each words
   *                  and return the string with the new delimiter
   *
   * @see     http://www.php.net/ucfirst
   * @since   11.1
   */
  public static function ucfirst($str, $delimiter = null, $newDelimiter = null)
  {
      jimport('phputf8.ucfirst');

      // Word mode: split on the delimiter, capitalise each piece, join with the (possibly new) delimiter.
      if ($delimiter !== null)
      {
          $glue  = ($newDelimiter === null) ? $delimiter : $newDelimiter;
          $words = explode($delimiter, $str);

          return implode($glue, array_map('utf8_ucfirst', $words));
      }

      // Plain mode: only the very first character is affected.
      return utf8_ucfirst($str);
  }
  /**
   * UTF-8 aware alternative to ucwords; delegates to utf8_ucwords().
   *
   * Uppercases the first character of each word in a string.
   *
   * @param   string  $str  String to be processed
   *
   * @return  string  String with first char of each word uppercase
   *
   * @see     http://www.php.net/ucwords
   * @since   11.1
   */
  public static function ucwords($str)
  {
      // The phputf8 implementation is loaded on demand.
      jimport('phputf8.ucwords');

      $capitalised = utf8_ucwords($str);

      return $capitalised;
  }
  /**
   * Transcode a string with iconv().
   *
   * The conversion suffix depends on the iconv implementation reported by ICONV_IMPL:
   * for 'glibc' the call is error-suppressed and uses "//TRANSLIT,IGNORE", every other
   * implementation uses "//IGNORE//TRANSLIT" (see the linked PHP bug).
   *
   * @param   string  $source         The string to transcode.
   * @param   string  $from_encoding  The source encoding.
   * @param   string  $to_encoding    The target encoding.
   *
   * @return  mixed  The result of iconv(), or null if the source was not a string.
   *
   * @link    https://bugs.php.net/bug.php?id=48147
   *
   * @since   11.1
   */
  public static function transcode($source, $from_encoding, $to_encoding)
  {
      // Only strings can be transcoded.
      if (!is_string($source))
      {
          return null;
      }

      if (ICONV_IMPL == 'glibc')
      {
          return @iconv($from_encoding, $to_encoding . '//TRANSLIT,IGNORE', $source);
      }

      // libiconv and any other implementation.
      return iconv($from_encoding, $to_encoding . '//IGNORE//TRANSLIT', $source);
  }
  /**
   * Tests a string as to whether it's valid UTF-8 and supported by the Unicode standard.
   *
   * The string is scanned octet by octet. It is rejected when it contains an illegal lead
   * octet, a missing continuation octet, a non-shortest-form encoding, a sequence longer
   * than four octets, a surrogate code point, a code point above 0x10FFFF, or when it ends
   * in the middle of a multi-octet sequence.
   *
   * Note: this function has been modified to simply return true or false.
   *
   * @param   string  $str  UTF-8 encoded string.
   *
   * @return  boolean  true if valid
   *
   * @author  <hsivonen@iki.fi>
   * @see     http://hsivonen.iki.fi/php-utf8/
   * @see     compliant
   * @since   11.1
   */
  public static function valid($str)
  {
      // Number of continuation octets still expected for the current character (0 = between characters)
      $mState = 0;

      // Code point being assembled
      $mUcs4 = 0;

      // Total number of octets in the current sequence
      $mBytes = 1;

      $len = strlen($str);

      for ($i = 0; $i < $len; $i++)
      {
          // Square-bracket offset: the curly-brace form is a parse error on PHP 8.
          $in = ord($str[$i]);

          if ($mState == 0)
          {
              // Between characters we expect either a US-ASCII octet or the lead octet of a multi-octet sequence.
              if (0 == (0x80 & $in))
              {
                  // US-ASCII, pass straight through.
                  $mBytes = 1;
              }
              elseif (0xC0 == (0xE0 & $in))
              {
                  // First octet of 2 octet sequence
                  $mUcs4 = ($in & 0x1F) << 6;
                  $mState = 1;
                  $mBytes = 2;
              }
              elseif (0xE0 == (0xF0 & $in))
              {
                  // First octet of 3 octet sequence
                  $mUcs4 = ($in & 0x0F) << 12;
                  $mState = 2;
                  $mBytes = 3;
              }
              elseif (0xF0 == (0xF8 & $in))
              {
                  // First octet of 4 octet sequence
                  $mUcs4 = ($in & 0x07) << 18;
                  $mState = 3;
                  $mBytes = 4;
              }
              elseif (0xF8 == (0xFC & $in))
              {
                  /* First octet of 5 octet sequence.
                   *
                   * This is illegal because the encoded codepoint must be either
                   * (a) not the shortest form or
                   * (b) outside the Unicode range of 0-0x10FFFF.
                   * Rather than trying to resynchronize, we will carry on until the end
                   * of the sequence and let the later error handling code catch it.
                   */
                  $mUcs4 = ($in & 0x03) << 24;
                  $mState = 4;
                  $mBytes = 5;
              }
              elseif (0xFC == (0xFE & $in))
              {
                  // First octet of 6 octet sequence, see comments for 5 octet sequence.
                  $mUcs4 = ($in & 1) << 30;
                  $mState = 5;
                  $mBytes = 6;
              }
              else
              {
                  // Neither US-ASCII nor a legal first octet of a multi-octet sequence.
                  return false;
              }

              continue;
          }

          // Inside a multi-octet sequence only a continuation octet (10xxxxxx) is legal.
          if (0x80 != (0xC0 & $in))
          {
              return false;
          }

          // Merge the six payload bits into their position within the code point.
          $shift = ($mState - 1) * 6;
          $mUcs4 |= ($in & 0x0000003F) << $shift;

          if (0 == --$mState)
          {
              // End of the sequence: $mUcs4 now holds the complete code point, check that it is legal.
              if (((2 == $mBytes) && ($mUcs4 < 0x0080)) || ((3 == $mBytes) && ($mUcs4 < 0x0800)) || ((4 == $mBytes) && ($mUcs4 < 0x10000))
                  // From Unicode 3.1, non-shortest form (above) and sequences longer than four octets are illegal
                  || (4 < $mBytes)
                  // From Unicode 3.2, surrogate characters are illegal
                  || (($mUcs4 & 0xFFFFF800) == 0xD800)
                  // Codepoints outside the Unicode range are illegal
                  || ($mUcs4 > 0x10FFFF))
              {
                  return false;
              }

              // Reset for the next character.
              $mState = 0;
              $mUcs4 = 0;
              $mBytes = 1;
          }
      }

      // A string that ends while continuation octets are still expected is truncated, hence invalid.
      return $mState == 0;
  }
  /**
   * Tests whether a string complies as UTF-8 using a PCRE match with the /u modifier.
   *
   * This is intended as a faster but less strict check than valid(): it is documented as
   * letting five and six octet UTF-8 sequences through, which are not supported by Unicode
   * and so cannot be displayed correctly in a browser. If you use it to validate user input,
   * you accept the risk that such sequences may be injected (which may or may not matter,
   * depending on what you are doing).
   *
   * @param   string  $str  UTF-8 string to check
   *
   * @return  boolean  TRUE if string is valid UTF-8
   *
   * @see     valid
   * @see     http://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805
   * @since   11.1
   */
  public static function compliant($str)
  {
      // An empty string is trivially compliant.
      $isEmpty = (strlen($str) == 0);

      if ($isEmpty)
      {
          return true;
      }

      /*
       * With the /u modifier the match only succeeds on well-formed UTF-8: if even
       * the first character can be matched the string is accepted, otherwise nothing
       * matches at all, even if the string contains some valid sequences.
       */
      $matched = preg_match('/^.{1}/us', $str);

      return ($matched == 1);
  }
  /**
   * Does a UTF-8 safe version of PHP parse_url function.
   *
   * The URL is url-encoded in full, the reserved URL characters are then restored so the
   * native parser still recognises the structure, and every parsed component is decoded
   * back afterwards. All returned components are strings (including the port).
   *
   * @param   string  $url  URL to parse
   *
   * @return  mixed  Associative array or false if badly formed URL.
   *
   * @see     http://us3.php.net/manual/en/function.parse-url.php
   * @since   11.1
   */
  public static function parse_url($url)
  {
      // Build arrays of values we need to decode before parsing
      $entities = array('%21', '%2A', '%27', '%28', '%29', '%3B', '%3A', '%40', '%26', '%3D', '%24', '%2C', '%2F', '%3F', '%23', '%5B', '%5D');
      $replacements = array('!', '*', "'", "(", ")", ";", ":", "@", "&", "=", "$", ",", "/", "?", "#", "[", "]");

      // Create encoded URL with special URL characters decoded so it can be parsed
      // All other characters will be encoded
      $encodedURL = str_replace($entities, $replacements, urlencode($url));

      // Parse the encoded URL
      $encodedParts = parse_url($encodedURL);

      // Badly formed URL (or nothing parsed): report failure.
      if (!$encodedParts)
      {
          return false;
      }

      // Decode each component into a real array; writing into a variable holding false
      // relies on array auto-vivification, which is deprecated as of PHP 8.1.
      $result = array();

      foreach ($encodedParts as $key => $value)
      {
          $result[$key] = urldecode(str_replace($replacements, $entities, $value));
      }

      return $result;
  }
  875. }