PageRenderTime 57ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/libraries/joomla/string/string.php

https://github.com/eddieajau/joomla-platform
PHP | 907 lines | 421 code | 52 blank | 434 comment | 93 complexity | 8be28133668e4b137220fff4f08d760f MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.1, BSD-3-Clause
  1. <?php
  2. /**
  3. * @package Joomla.Platform
  4. * @subpackage String
  5. *
  6. * @copyright Copyright (C) 2005 - 2011 Open Source Matters, Inc. All rights reserved.
  7. * @license GNU General Public License version 2 or later; see LICENSE
  8. */
  9. defined('JPATH_PLATFORM') or die;
  //
  // PHP mbstring and iconv local configuration
  //
  // Use mbstring when it is already loaded; otherwise try to load it at runtime,
  // except on Windows. The platform test has to be written as "!== 'WIN'":
  // negating the strtoupper() result first always produces a boolean, which can
  // never be identical to the string 'WIN', so the dl() fallback would never run.
  // dl() is not defined in every SAPI, so it is only attempted when it exists and
  // its warning is suppressed to keep this a best-effort step.
  if (extension_loaded('mbstring')
      || (strtoupper(substr(PHP_OS, 0, 3)) !== 'WIN' && function_exists('dl') && @dl('mbstring.so')))
  {
      // Make sure to suppress the output in case ini_set is disabled
      @ini_set('mbstring.internal_encoding', 'UTF-8');
      @ini_set('mbstring.http_input', 'UTF-8');
      @ini_set('mbstring.http_output', 'UTF-8');
  }

  // Same for iconv
  if (function_exists('iconv')
      || (strtoupper(substr(PHP_OS, 0, 3)) !== 'WIN' && function_exists('dl') && @dl('iconv.so')))
  {
      // These are settings that can be set inside code
      iconv_set_encoding("internal_encoding", "UTF-8");
      iconv_set_encoding("input_encoding", "UTF-8");
      iconv_set_encoding("output_encoding", "UTF-8");
  }
  29. /**
  30. * Include the utf8 package
  31. */
  32. jimport('phputf8.utf8');
  33. jimport('phputf8.strcasecmp');
  34. /**
  35. * String handling class for utf-8 data
  36. * Wraps the phputf8 library
  37. * All functions assume the validity of utf-8 strings.
  38. *
  39. * @package Joomla.Platform
  40. * @subpackage String
  41. * @since 11.1
  42. */
  43. abstract class JString
  44. {
  /**
   * Increment styles.
   *
   * Each style is a two element list: element 0 holds the regular expression(s),
   * element 1 the sprintf format(s). Either element may be a single string (used
   * for both purposes) or a pair:
   *  - patterns: array(search pattern, replace pattern)
   *  - formats:  array(format used when appending, format used when replacing)
   *
   * @var    array
   * @since  11.3
   */
  protected static $incrementStyles = array(
      'dash' => array(
          '#-(\d+)$#',
          '-%d'
      ),
      'default' => array(
          array('#\((\d+)\)$#', '#\(\d+\)$#'),
          array(' (%d)', '(%d)'),
      ),
  );

  /**
   * Increments a trailing number in a string.
   *
   * Used to easily create distinct labels when copying objects. The method has the following styles:
   *
   * default: "Label" becomes "Label (2)"
   * dash:    "Label" becomes "Label-2"
   *
   * An unknown style name falls back to the 'default' style.
   *
   * @param   string   $string  The source string.
   * @param   string   $style   The style (default|dash).
   * @param   integer  $n       If supplied, this number is used for the copy, otherwise it is the 'next' number.
   *
   * @return  string  The incremented string.
   *
   * @since   11.3
   */
  public static function increment($string, $style = 'default', $n = 0)
  {
      $spec = isset(self::$incrementStyles[$style]) ? self::$incrementStyles[$style] : self::$incrementStyles['default'];

      // Normalise both halves of the spec to lists; a lone string serves both roles.
      $patterns = (array) $spec[0];
      $formats = (array) $spec[1];

      $searchPattern = $patterns[0];
      $replacePattern = isset($patterns[1]) ? $patterns[1] : $patterns[0];
      $appendFormat = $formats[0];
      $updateFormat = isset($formats[1]) ? $formats[1] : $formats[0];

      // No trailing counter yet: append a fresh one (2 unless a number was requested).
      if (!preg_match($searchPattern, $string, $found))
      {
          $number = empty($n) ? 2 : $n;

          return $string . sprintf($appendFormat, $number);
      }

      // A counter already exists: bump it, or overwrite it with the requested number.
      $number = empty($n) ? ($found[1] + 1) : $n;

      return preg_replace($replacePattern, sprintf($updateFormat, $number), $string);
  }
  /**
   * UTF-8 aware alternative to strpos.
   *
   * Finds the position of the first occurrence of a string by delegating to
   * phputf8's utf8_strpos().
   *
   * @param   string   $str     String being examined
   * @param   string   $search  String being searched for
   * @param   integer  $offset  Optional, specifies the position from which the search should be performed.
   *                            Leave as false to call utf8_strpos() without an offset.
   *
   * @return  mixed  Number of characters before the first match or FALSE on failure
   *
   * @see     http://www.php.net/strpos
   * @since   11.1
   */
  public static function strpos($str, $search, $offset = false)
  {
      // Only forward the offset when the caller actually supplied one.
      if ($offset !== false)
      {
          return utf8_strpos($str, $search, $offset);
      }

      return utf8_strpos($str, $search);
  }
  /**
   * UTF-8 aware alternative to strrpos.
   *
   * Finds the position of the last occurrence of a string by delegating to
   * phputf8's utf8_strrpos(). The offset is always forwarded.
   *
   * @param   string   $str     String being examined.
   * @param   string   $search  String being searched for.
   * @param   integer  $offset  Offset from the left of the string.
   *
   * @return  mixed  Number of characters before the last match or false on failure
   *
   * @see     http://www.php.net/strrpos
   * @since   11.1
   */
  public static function strrpos($str, $search, $offset = 0)
  {
      $position = utf8_strrpos($str, $search, $offset);

      return $position;
  }
  /**
   * UTF-8 aware alternative to substr.
   *
   * Returns part of a string given a character offset (and optionally a length)
   * by delegating to phputf8's utf8_substr().
   *
   * @param   string   $str     String being processed
   * @param   integer  $offset  Number of UTF-8 characters offset (from left)
   * @param   integer  $length  Optional length in UTF-8 characters from offset.
   *                            Leave as false to call utf8_substr() without a length.
   *
   * @return  mixed  string or FALSE if failure
   *
   * @see     http://www.php.net/substr
   * @since   11.1
   */
  public static function substr($str, $offset, $length = false)
  {
      // Only forward the length when the caller actually supplied one.
      if ($length !== false)
      {
          return utf8_substr($str, $offset, $length);
      }

      return utf8_substr($str, $offset);
  }
  /**
   * UTF-8 aware alternative to strtolower.
   *
   * Makes a string lowercase by delegating to phputf8's utf8_strtolower().
   * Note: The concept of a character's "case" only exists in some alphabets
   * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
   * not exist in the Chinese alphabet, for example. See Unicode Standard
   * Annex #21: Case Mappings
   *
   * @param   string  $str  String being processed
   *
   * @return  mixed  Either string in lowercase or FALSE if UTF-8 invalid
   *
   * @see     http://www.php.net/strtolower
   * @since   11.1
   */
  public static function strtolower($str)
  {
      $lowered = utf8_strtolower($str);

      return $lowered;
  }
  /**
   * UTF-8 aware alternative to strtoupper.
   *
   * Makes a string uppercase by delegating to phputf8's utf8_strtoupper().
   * Note: The concept of a character's "case" only exists in some alphabets
   * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
   * not exist in the Chinese alphabet, for example. See Unicode Standard
   * Annex #21: Case Mappings
   *
   * @param   string  $str  String being processed
   *
   * @return  mixed  Either string in uppercase or FALSE if UTF-8 invalid
   *
   * @see     http://www.php.net/strtoupper
   * @since   11.1
   */
  public static function strtoupper($str)
  {
      $raised = utf8_strtoupper($str);

      return $raised;
  }
  /**
   * UTF-8 aware alternative to strlen.
   *
   * Returns the number of characters in the string (NOT THE NUMBER OF BYTES)
   * by delegating to phputf8's utf8_strlen().
   *
   * @param   string  $str  UTF-8 string.
   *
   * @return  integer  Number of UTF-8 characters in string.
   *
   * @see     http://www.php.net/strlen
   * @since   11.1
   */
  public static function strlen($str)
  {
      $characters = utf8_strlen($str);

      return $characters;
  }
  /**
   * UTF-8 aware alternative to str_ireplace.
   *
   * Case-insensitive version of str_replace; delegates to phputf8's utf8_ireplace().
   *
   * @param   string   $search   String to search for
   * @param   string   $replace  Replacement string
   * @param   string   $str      String to search in
   * @param   integer  $count    Optional count value forwarded to utf8_ireplace(). It is received
   *                             by value here, so nothing is written back to the caller.
   *
   * @return  string  UTF-8 String
   *
   * @see     http://www.php.net/str_ireplace
   * @since   11.1
   */
  public static function str_ireplace($search, $replace, $str, $count = null)
  {
      jimport('phputf8.str_ireplace');

      // Both the declared default (null) and the legacy sentinel (false) mean
      // "no count supplied"; testing for false alone never matched the default.
      if ($count === null || $count === false)
      {
          return utf8_ireplace($search, $replace, $str);
      }

      return utf8_ireplace($search, $replace, $str, $count);
  }
  /**
   * UTF-8 aware alternative to str_split.
   *
   * Converts a string to an array by delegating to phputf8's utf8_str_split(),
   * which is imported on demand.
   *
   * @param   string   $str        UTF-8 encoded string to process
   * @param   integer  $split_len  Number of characters to split string by
   *
   * @return  array
   *
   * @see     http://www.php.net/str_split
   * @since   11.1
   */
  public static function str_split($str, $split_len = 1)
  {
      jimport('phputf8.str_split');

      $pieces = utf8_str_split($str, $split_len);

      return $pieces;
  }
  /**
   * UTF-8/LOCALE aware alternative to strcasecmp.
   *
   * A case insensitive string comparison. Without a locale the comparison is
   * delegated to utf8_strcasecmp(). With a locale, both strings are lowercased
   * and compared with strcoll(); the LC_COLLATE locale is switched for this and
   * is not switched back afterwards.
   *
   * @param   string  $str1    string 1 to compare
   * @param   string  $str2    string 2 to compare
   * @param   mixed   $locale  The locale used by strcoll or false to use classical comparison
   *
   * @return  integer  Negative if str1 is less than str2; positive if str1 is greater than str2, and 0 if they are equal.
   *
   * @see     http://www.php.net/strcasecmp
   * @see     http://www.php.net/strcoll
   * @see     http://www.php.net/setlocale
   * @since   11.1
   */
  public static function strcasecmp($str1, $str2, $locale = false)
  {
      // Classical comparison when no locale was requested.
      if (!$locale)
      {
          return utf8_strcasecmp($str1, $str2);
      }

      // Remember the current locale, then try to switch; fall back to the old name on failure.
      $previous = setlocale(LC_COLLATE, 0);
      $active = setlocale(LC_COLLATE, $locale);

      if (!$active)
      {
          $active = $previous;
      }

      // A non UTF-8 locale name such as "xx_YY.1252" names a code page we can recode to.
      // Anything else (UTF-8 itself, or an unrecognised name) is compared without recoding.
      $needsRecoding = !stristr($active, 'UTF-8') && stristr($active, '_') && preg_match('~\.(\d+)$~', $active, $m);

      if ($needsRecoding)
      {
          $encoding = 'CP' . $m[1];

          return strcoll(
              self::transcode(utf8_strtolower($str1), 'UTF-8', $encoding),
              self::transcode(utf8_strtolower($str2), 'UTF-8', $encoding)
          );
      }

      return strcoll(utf8_strtolower($str1), utf8_strtolower($str2));
  }
  /**
   * UTF-8/LOCALE aware alternative to strcmp.
   *
   * A case sensitive string comparison. Without a locale the native strcmp() is
   * used. With a locale, the strings are compared with strcoll(); the LC_COLLATE
   * locale is switched for this and is not switched back afterwards.
   *
   * @param   string  $str1    string 1 to compare
   * @param   string  $str2    string 2 to compare
   * @param   mixed   $locale  The locale used by strcoll or false to use classical comparison
   *
   * @return  integer  Negative if str1 is less than str2; positive if str1 is greater than str2, and 0 if they are equal.
   *
   * @see     http://www.php.net/strcmp
   * @see     http://www.php.net/strcoll
   * @see     http://www.php.net/setlocale
   * @since   11.1
   */
  public static function strcmp($str1, $str2, $locale = false)
  {
      // Classical comparison when no locale was requested.
      if (!$locale)
      {
          return strcmp($str1, $str2);
      }

      // Remember the current locale, then try to switch; fall back to the old name on failure.
      $previous = setlocale(LC_COLLATE, 0);
      $active = setlocale(LC_COLLATE, $locale);

      if (!$active)
      {
          $active = $previous;
      }

      // A non UTF-8 locale name such as "xx_YY.1252" names a code page we can recode to.
      // Anything else (UTF-8 itself, or an unrecognised name) is compared without recoding.
      $needsRecoding = !stristr($active, 'UTF-8') && stristr($active, '_') && preg_match('~\.(\d+)$~', $active, $m);

      if ($needsRecoding)
      {
          $encoding = 'CP' . $m[1];

          return strcoll(self::transcode($str1, 'UTF-8', $encoding), self::transcode($str2, 'UTF-8', $encoding));
      }

      return strcoll($str1, $str2);
  }
  /**
   * UTF-8 aware alternative to strcspn.
   *
   * Finds the length of the initial segment not matching mask by delegating to
   * phputf8's utf8_strcspn(), which is imported on demand.
   *
   * @param   string   $str     The string to process
   * @param   string   $mask    The mask
   * @param   integer  $start   Optional starting character position (in characters).
   *                            null or false means "not supplied".
   * @param   integer  $length  Optional length. null or false means "not supplied".
   *
   * @return  integer  The length of the initial segment of str which does not contain any of the characters in mask
   *
   * @see     http://www.php.net/strcspn
   * @since   11.1
   */
  public static function strcspn($str, $mask, $start = null, $length = null)
  {
      jimport('phputf8.strcspn');

      // The declared defaults are null, while older callers passed false; testing
      // for false alone never matched the defaults, so both are treated as unset.
      $startOmitted = ($start === null || $start === false);
      $lengthOmitted = ($length === null || $length === false);

      if ($startOmitted && $lengthOmitted)
      {
          return utf8_strcspn($str, $mask);
      }

      if ($lengthOmitted)
      {
          return utf8_strcspn($str, $mask, $start);
      }

      return utf8_strcspn($str, $mask, $start, $length);
  }
  /**
   * UTF-8 aware alternative to stristr.
   *
   * Returns all of the haystack from the first occurrence of the needle to the
   * end; needle and haystack are examined in a case-insensitive manner.
   * Delegates to phputf8's utf8_stristr(), which is imported on demand.
   *
   * @param   string  $str     The haystack
   * @param   string  $search  The needle
   *
   * @return  string  the sub string
   *
   * @see     http://www.php.net/stristr
   * @since   11.1
   */
  public static function stristr($str, $search)
  {
      jimport('phputf8.stristr');

      $remainder = utf8_stristr($str, $search);

      return $remainder;
  }
  /**
   * UTF-8 aware alternative to strrev.
   *
   * Reverses a string by delegating to phputf8's utf8_strrev(), which is
   * imported on demand.
   *
   * @param   string  $str  String to be reversed
   *
   * @return  string  The string in reverse character order
   *
   * @see     http://www.php.net/strrev
   * @since   11.1
   */
  public static function strrev($str)
  {
      jimport('phputf8.strrev');

      $reversed = utf8_strrev($str);

      return $reversed;
  }
  /**
   * UTF-8 aware alternative to strspn.
   *
   * Finds the length of the initial segment matching mask by delegating to
   * phputf8's utf8_strspn(), which is imported on demand.
   *
   * @param   string   $str     The haystack
   * @param   string   $mask    The mask
   * @param   integer  $start   Start optional; null means "not supplied"
   * @param   integer  $length  Length optional; null means "not supplied"
   *
   * @return  integer
   *
   * @see     http://www.php.net/strspn
   * @since   11.1
   */
  public static function strspn($str, $mask, $start = null, $length = null)
  {
      jimport('phputf8.strspn');

      // Forward only as many optional arguments as were supplied.
      if ($length !== null)
      {
          return utf8_strspn($str, $mask, $start, $length);
      }

      if ($start !== null)
      {
          return utf8_strspn($str, $mask, $start);
      }

      return utf8_strspn($str, $mask);
  }
  /**
   * UTF-8 aware substr_replace.
   *
   * Replaces text within a portion of a string by delegating to phputf8's
   * utf8_substr_replace().
   *
   * @param   string   $str     The haystack
   * @param   string   $repl    The replacement string
   * @param   integer  $start   Start
   * @param   integer  $length  Length (optional). null or false means "not supplied".
   *
   * @return  string
   *
   * @see     http://www.php.net/substr_replace
   * @since   11.1
   */
  public static function substr_replace($str, $repl, $start, $length = null)
  {
      // utf8_substr_replace() is loaded by the library loader, so no jimport() here.

      // The declared default is null, while older callers passed false; testing
      // for false alone never matched the default, so both are treated as unset.
      if ($length === null || $length === false)
      {
          return utf8_substr_replace($str, $repl, $start);
      }

      return utf8_substr_replace($str, $repl, $start, $length);
  }
  /**
   * UTF-8 aware replacement for ltrim().
   *
   * Strips whitespace (or other characters) from the beginning of a string.
   * You only need to use this if you are supplying the charlist
   * optional arg and it contains UTF-8 characters. Otherwise ltrim will
   * work normally on a UTF-8 string.
   *
   * @param   string  $str       The string to be trimmed
   * @param   string  $charlist  The optional charlist of additional characters to trim.
   *                             false (the default) trims whitespace; an empty charlist
   *                             returns the string unchanged.
   *
   * @return  string  The trimmed string
   *
   * @see     http://www.php.net/ltrim
   * @since   11.1
   */
  public static function ltrim($str, $charlist = false)
  {
      // An explicitly empty charlist means there is nothing to trim. The string '0'
      // is excluded from this shortcut: empty('0') is true, but '0' is a real
      // one-character charlist and must still be trimmed.
      if (empty($charlist) && $charlist !== false && $charlist !== '0')
      {
          return $str;
      }

      jimport('phputf8.trim');

      if ($charlist === false)
      {
          return utf8_ltrim($str);
      }

      return utf8_ltrim($str, $charlist);
  }
  /**
   * UTF-8 aware replacement for rtrim().
   *
   * Strips whitespace (or other characters) from the end of a string.
   * You only need to use this if you are supplying the charlist
   * optional arg and it contains UTF-8 characters. Otherwise rtrim will
   * work normally on a UTF-8 string.
   *
   * @param   string  $str       The string to be trimmed
   * @param   string  $charlist  The optional charlist of additional characters to trim.
   *                             false (the default) trims whitespace; an empty charlist
   *                             returns the string unchanged.
   *
   * @return  string  The trimmed string
   *
   * @see     http://www.php.net/rtrim
   * @since   11.1
   */
  public static function rtrim($str, $charlist = false)
  {
      // An explicitly empty charlist means there is nothing to trim. The string '0'
      // is excluded from this shortcut: empty('0') is true, but '0' is a real
      // one-character charlist and must still be trimmed.
      if (empty($charlist) && $charlist !== false && $charlist !== '0')
      {
          return $str;
      }

      jimport('phputf8.trim');

      if ($charlist === false)
      {
          return utf8_rtrim($str);
      }

      return utf8_rtrim($str, $charlist);
  }
  /**
   * UTF-8 aware replacement for trim().
   *
   * Strips whitespace (or other characters) from the beginning and end of a string.
   * Note: you only need to use this if you are supplying the charlist
   * optional arg and it contains UTF-8 characters. Otherwise trim will
   * work normally on a UTF-8 string.
   *
   * @param   string  $str       The string to be trimmed
   * @param   string  $charlist  The optional charlist of additional characters to trim.
   *                             false (the default) trims whitespace; an empty charlist
   *                             returns the string unchanged.
   *
   * @return  string  The trimmed string
   *
   * @see     http://www.php.net/trim
   * @since   11.1
   */
  public static function trim($str, $charlist = false)
  {
      // An explicitly empty charlist means there is nothing to trim. The string '0'
      // is excluded from this shortcut: empty('0') is true, but '0' is a real
      // one-character charlist and must still be trimmed.
      if (empty($charlist) && $charlist !== false && $charlist !== '0')
      {
          return $str;
      }

      jimport('phputf8.trim');

      if ($charlist === false)
      {
          return utf8_trim($str);
      }

      return utf8_trim($str, $charlist);
  }
  /**
   * UTF-8 aware alternative to ucfirst.
   *
   * Makes a string's first character uppercase by delegating to phputf8's
   * utf8_ucfirst(), which is imported on demand.
   *
   * @param   string  $str  String to be processed
   *
   * @return  string  String with first character as upper case (if applicable)
   *
   * @see     http://www.php.net/ucfirst
   * @since   11.1
   */
  public static function ucfirst($str)
  {
      jimport('phputf8.ucfirst');

      $capitalised = utf8_ucfirst($str);

      return $capitalised;
  }
  /**
   * UTF-8 aware alternative to ucwords.
   *
   * Uppercases the first character of each word in a string by delegating to
   * phputf8's utf8_ucwords(), which is imported on demand.
   *
   * @param   string  $str  String to be processed
   *
   * @return  string  String with first char of each word uppercase
   *
   * @see     http://www.php.net/ucwords
   * @since   11.1
   */
  public static function ucwords($str)
  {
      jimport('phputf8.ucwords');

      $capitalised = utf8_ucwords($str);

      return $capitalised;
  }
  /**
   * Transcode a string.
   *
   * Converts $source between encodings with iconv(). Non-string input is not
   * converted and yields null.
   *
   * @param   string  $source         The string to transcode.
   * @param   string  $from_encoding  The source encoding.
   * @param   string  $to_encoding    The target encoding.
   *
   * @return  mixed  The transcoded string (whatever iconv() returns), or null if the source was not a string.
   *
   * @since   11.1
   */
  public static function transcode($source, $from_encoding, $to_encoding)
  {
      if (!is_string($source))
      {
          return null;
      }

      /*
       * "//TRANSLIT" is appended to the target encoding so that a character which
       * cannot be represented in the target charset is approximated by one or
       * several similar looking characters instead.
       */
      $target = $to_encoding . '//TRANSLIT';

      return iconv($from_encoding, $target, $source);
  }
  /**
   * Tests a string as to whether it's valid UTF-8 and supported by the Unicode standard.
   *
   * The string is scanned octet by octet. It is rejected on an illegal lead or
   * continuation octet, a non-shortest-form encoding, a 5 or 6 octet sequence,
   * a surrogate code point, a code point above 0x10FFFF, or a multi-octet
   * sequence that is cut off by the end of the string.
   *
   * Note: this function has been modified to simply return true or false.
   *
   * @param   string  $str  UTF-8 encoded string.
   *
   * @return  boolean  true if valid
   *
   * @author  <hsivonen@iki.fi>
   * @see     http://hsivonen.iki.fi/php-utf8/
   * @see     compliant
   * @since   11.1
   */
  public static function valid($str)
  {
      // Cached expected number of octets after the current octet
      // until the beginning of the next UTF8 character sequence
      $mState = 0;

      // Cached Unicode character
      $mUcs4 = 0;

      // Cached expected number of octets in the current sequence
      $mBytes = 1;

      $len = strlen($str);

      for ($i = 0; $i < $len; $i++)
      {
          // Square-bracket offsets: the curly-brace form is a parse error on PHP 8.
          $in = ord($str[$i]);

          if ($mState == 0)
          {
              // When mState is zero we expect either a US-ASCII character or a
              // multi-octet sequence.
              if (0 == (0x80 & ($in)))
              {
                  // US-ASCII, pass straight through.
                  $mBytes = 1;
              }
              elseif (0xC0 == (0xE0 & ($in)))
              {
                  // First octet of 2 octet sequence
                  $mUcs4 = ($in);
                  $mUcs4 = ($mUcs4 & 0x1F) << 6;
                  $mState = 1;
                  $mBytes = 2;
              }
              elseif (0xE0 == (0xF0 & ($in)))
              {
                  // First octet of 3 octet sequence
                  $mUcs4 = ($in);
                  $mUcs4 = ($mUcs4 & 0x0F) << 12;
                  $mState = 2;
                  $mBytes = 3;
              }
              elseif (0xF0 == (0xF8 & ($in)))
              {
                  // First octet of 4 octet sequence
                  $mUcs4 = ($in);
                  $mUcs4 = ($mUcs4 & 0x07) << 18;
                  $mState = 3;
                  $mBytes = 4;
              }
              elseif (0xF8 == (0xFC & ($in)))
              {
                  /* First octet of 5 octet sequence.
                   *
                   * This is illegal because the encoded codepoint must be either
                   * (a) not the shortest form or
                   * (b) outside the Unicode range of 0-0x10FFFF.
                   * Rather than trying to resynchronize, we will carry on until the end
                   * of the sequence and let the later error handling code catch it.
                   */
                  $mUcs4 = ($in);
                  $mUcs4 = ($mUcs4 & 0x03) << 24;
                  $mState = 4;
                  $mBytes = 5;
              }
              elseif (0xFC == (0xFE & ($in)))
              {
                  // First octet of 6 octet sequence, see comments for 5 octet sequence.
                  $mUcs4 = ($in);
                  $mUcs4 = ($mUcs4 & 1) << 30;
                  $mState = 5;
                  $mBytes = 6;
              }
              else
              {
                  /* Current octet is neither in the US-ASCII range nor a legal first
                   * octet of a multi-octet sequence.
                   */
                  return false;
              }
          }
          else
          {
              // When mState is non-zero, we expect a continuation of the multi-octet
              // sequence
              if (0x80 == (0xC0 & ($in)))
              {
                  // Legal continuation.
                  $shift = ($mState - 1) * 6;
                  $tmp = $in;
                  $tmp = ($tmp & 0x0000003F) << $shift;
                  $mUcs4 |= $tmp;

                  /*
                   * End of the multi-octet sequence. mUcs4 now contains the final
                   * Unicode codepoint to be output
                   */
                  if (0 == --$mState)
                  {
                      /*
                       * Check for illegal sequences and codepoints.
                       */
                      // From Unicode 3.1, non-shortest form is illegal
                      if (((2 == $mBytes) && ($mUcs4 < 0x0080)) || ((3 == $mBytes) && ($mUcs4 < 0x0800)) || ((4 == $mBytes) && ($mUcs4 < 0x10000))
                          || (4 < $mBytes)
                          || (($mUcs4 & 0xFFFFF800) == 0xD800) // From Unicode 3.2, surrogate characters are illegal
                          || ($mUcs4 > 0x10FFFF) // Codepoints outside the Unicode range are illegal
                      )
                      {
                          return false;
                      }

                      // Initialize UTF8 cache.
                      $mState = 0;
                      $mUcs4 = 0;
                      $mBytes = 1;
                  }
              }
              else
              {
                  /*
                   * ((0xC0 & (*in) != 0x80) && (mState != 0))
                   * Incomplete multi-octet sequence.
                   */
                  return false;
              }
          }
      }

      // A sequence still waiting for continuation octets at the end of the input
      // is truncated, so the string is only valid when nothing is pending.
      return $mState == 0;
  }
  /**
   * Tests whether a string complies as UTF-8.
   *
   * This is a cheaper check than valid(): it relies on the PCRE /u modifier
   * instead of inspecting every octet in PHP code. An empty string is
   * considered compliant.
   *
   * NOTE(review): the original documentation states that this check lets five
   * and six octet sequences through (which Unicode does not support) and is
   * therefore less strict than valid(); whether that still holds depends on the
   * PCRE library in use - confirm before relying on it for untrusted input.
   *
   * @param   string  $str  UTF-8 string to check
   *
   * @return  boolean  TRUE if string is valid UTF-8
   *
   * @see     valid
   * @see     http://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805
   * @since   11.1
   */
  public static function compliant($str)
  {
      if (0 === strlen($str))
      {
          return true;
      }

      // With the /u modifier nothing at all matches when the subject is not
      // well-formed UTF-8, so matching a single leading character is enough.
      $matched = preg_match('/^.{1}/us', $str);

      return ($matched == 1);
  }
  /**
   * Does a UTF-8 safe version of PHP parse_url function.
   *
   * The URL is url-encoded first so that multi-byte characters survive
   * parse_url(), the structural URL characters are restored so the URL can
   * still be split into components, and every component is url-decoded again.
   *
   * @param   string  $url  URL to parse
   *
   * @return  mixed  Associative array or false if badly formed URL.
   *
   * @see     http://us3.php.net/manual/en/function.parse-url.php
   * @since   11.1
   */
  public static function parse_url($url)
  {
      // Build arrays of values we need to decode before parsing
      $entities = array('%21', '%2A', '%27', '%28', '%29', '%3B', '%3A', '%40', '%26', '%3D', '%24', '%2C', '%2F', '%3F', '%25', '%23', '%5B',
          '%5D');
      $replacements = array('!', '*', "'", "(", ")", ";", ":", "@", "&", "=", "$", ",", "/", "?", "%", "#", "[", "]");

      // Create encoded URL with special URL characters decoded so it can be parsed
      // All other characters will be encoded
      $encodedURL = str_replace($entities, $replacements, urlencode($url));

      // Parse the encoded URL
      $encodedParts = parse_url($encodedURL);

      // parse_url() reports a badly formed URL with false; pass that on as documented
      // instead of iterating over a non-array.
      if ($encodedParts === false)
      {
          return false;
      }

      // Now, decode each value of the resulting array
      $result = array();

      foreach ($encodedParts as $key => $value)
      {
          $result[$key] = urldecode($value);
      }

      return $result;
  }
  855. }