PageRenderTime 67ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/libraries/joomla/string/string.php

https://bitbucket.org/izubizarreta/https-bitbucket.org-bityvip-alpes
PHP | 976 lines | 449 code | 54 blank | 473 comment | 77 complexity | 536a209382aae3d30fd93ca89296d211 MD5 | raw file
Possible License(s): GPL-2.0, BSD-3-Clause, LGPL-2.1, MIT, LGPL-3.0, LGPL-2.0, JSON
  1. <?php
  2. /**
  3. * @package Joomla.Platform
  4. * @subpackage String
  5. *
  6. * @copyright Copyright (C) 2005 - 2012 Open Source Matters, Inc. All rights reserved.
  7. * @license GNU General Public License version 2 or later; see LICENSE
  8. */
  defined('JPATH_PLATFORM') or die;

  //
  // PHP mbstring and iconv local configuration
  //

  // Use mbstring when it is already loaded. Otherwise, on non-Windows platforms where dl() is
  // available, try to load it at run time.
  //
  // The platform test has to be written as "strtoupper(...) !== 'WIN'". The form
  // "!strtoupper(...) === 'WIN'" negates the string first and then compares a boolean with a
  // string, which is always false, so the dl() fallback could never run.
  if (extension_loaded('mbstring')
    || (strtoupper(substr(PHP_OS, 0, 3)) !== 'WIN' && function_exists('dl') && @dl('mbstring.so')))
  {
    // Make sure to suppress the output in case ini_set is disabled
    @ini_set('mbstring.internal_encoding', 'UTF-8');
    @ini_set('mbstring.http_input', 'UTF-8');
    @ini_set('mbstring.http_output', 'UTF-8');
  }

  // Same for iconv: use it when present, otherwise attempt a run-time load on non-Windows platforms.
  if (function_exists('iconv')
    || (strtoupper(substr(PHP_OS, 0, 3)) !== 'WIN' && function_exists('dl') && @dl('iconv.so')))
  {
    // These are settings that can be set inside code
    iconv_set_encoding("internal_encoding", "UTF-8");
    iconv_set_encoding("input_encoding", "UTF-8");
    iconv_set_encoding("output_encoding", "UTF-8");
  }

  /**
   * Include the utf8 package
   */
  jimport('phputf8.utf8');
  jimport('phputf8.strcasecmp');
  34. /**
  35. * String handling class for utf-8 data
  36. * Wraps the phputf8 library
  37. * All functions assume the validity of utf-8 strings.
  38. *
  39. * @package Joomla.Platform
  40. * @subpackage String
  41. * @since 11.1
  42. */
  43. abstract class JString
  44. {
  /**
   * Increment styles, keyed by style name.
   *
   * Each style is a two element list: element 0 holds the regular expression(s) and element 1
   * holds the sprintf format(s). Either element may be a single string, or a pair of strings
   * when the search/append variant differs from the replace/update variant.
   *
   * @var    array
   * @since  11.3
   */
  protected static $incrementStyles = array(
    'dash' => array('#-(\d+)$#', '-%d'),
    'default' => array(
      array('#\((\d+)\)$#', '#\(\d+\)$#'),
      array(' (%d)', '(%d)'),
    ),
  );
  /**
   * Break a camel-cased string into its parts.
   *
   * Examples:
   * "FooBarABCDef" gives array("Foo", "Bar", "ABC", "Def")
   * "JFooBar" gives array("J", "Foo", "Bar")
   * "J001FooBar002" gives array("J001", "Foo", "Bar002")
   * "abcDef" gives array("abc", "Def")
   * "abc_defGhi_Jkl" gives array("abc_def", "Ghi_Jkl")
   * "ThisIsA_NASAAstronaut" gives array("This", "Is", "A_NASA", "Astronaut")
   * "JohnFitzgerald_Kennedy" gives array("John", "Fitzgerald_Kennedy")
   *
   * @param   string  $string  The source string.
   *
   * @return  array  The parts of the string.
   *
   * @since   11.3
   */
  public static function splitCamelCase($string)
  {
    // Zero-width split points: before an upper-case letter that follows a character which is
    // neither upper-case nor an underscore, or between two upper-case letters when the second
    // one starts a new word.
    $boundary = '/(?<=[^A-Z_])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][^A-Z_])/x';
    $parts = preg_split($boundary, $string);

    return $parts;
  }
  /**
   * Increments a trailing number in a string, or appends one when none is present.
   *
   * Handy for producing distinct labels when copying objects. Available styles:
   *
   * default: "Label" becomes "Label (2)"
   * dash:    "Label" becomes "Label-2"
   *
   * An unknown style name falls back to the default style.
   *
   * @param   string   $string  The source string.
   * @param   string   $style   The style (default|dash).
   * @param   integer  $n       If supplied, this number is used for the copy, otherwise it is the 'next' number.
   *
   * @return  string  The incremented string.
   *
   * @since   11.3
   */
  public static function increment($string, $style = 'default', $n = 0)
  {
    $styleKey = isset(self::$incrementStyles[$style]) ? $style : 'default';
    list($patterns, $formats) = self::$incrementStyles[$styleKey];

    // A style may supply one pattern for both searching and replacing, or a separate one for each.
    if (is_array($patterns))
    {
      list($findPattern, $swapPattern) = $patterns;
    }
    else
    {
      $findPattern = $patterns;
      $swapPattern = $patterns;
    }

    // Likewise one sprintf format for both cases, or one for appending and one for updating.
    if (is_array($formats))
    {
      list($appendFormat, $updateFormat) = $formats;
    }
    else
    {
      $appendFormat = $formats;
      $updateFormat = $formats;
    }

    // No trailing counter yet: append a fresh one.
    if (!preg_match($findPattern, $string, $found))
    {
      $number = empty($n) ? 2 : $n;

      return $string . sprintf($appendFormat, $number);
    }

    // A trailing counter exists: replace it with the requested or the next number.
    $number = empty($n) ? ($found[1] + 1) : $n;

    return preg_replace($swapPattern, sprintf($updateFormat, $number), $string);
  }
  /**
   * UTF-8 aware alternative to strpos.
   *
   * Finds the position of the first occurrence of a string by delegating to utf8_strpos().
   *
   * @param   string   $str     String being examined
   * @param   string   $search  String being searched for
   * @param   integer  $offset  Optional position from which the search should be performed;
   *                            false (the default) means the argument is not forwarded
   *
   * @return  mixed  Number of characters before the first match or FALSE on failure
   *
   * @see     http://www.php.net/strpos
   * @since   11.1
   */
  public static function strpos($str, $search, $offset = false)
  {
    if ($offset !== false)
    {
      return utf8_strpos($str, $search, $offset);
    }

    return utf8_strpos($str, $search);
  }
  /**
   * UTF-8 aware alternative to strrpos.
   *
   * Finds the position of the last occurrence of a string by delegating to utf8_strrpos().
   *
   * @param   string   $str     String being examined.
   * @param   string   $search  String being searched for.
   * @param   integer  $offset  Offset from the left of the string.
   *
   * @return  mixed  Number of characters before the last match or false on failure
   *
   * @see     http://www.php.net/strrpos
   * @since   11.1
   */
  public static function strrpos($str, $search, $offset = 0)
  {
    $position = utf8_strrpos($str, $search, $offset);

    return $position;
  }
  /**
   * UTF-8 aware alternative to substr.
   *
   * Returns part of a string given a character offset and, optionally, a length.
   *
   * @param   string   $str     String being processed
   * @param   integer  $offset  Number of UTF-8 characters offset (from left)
   * @param   integer  $length  Optional length in UTF-8 characters from offset; false (the
   *                            default) means the argument is not forwarded to utf8_substr()
   *
   * @return  mixed  string or FALSE if failure
   *
   * @see     http://www.php.net/substr
   * @since   11.1
   */
  public static function substr($str, $offset, $length = false)
  {
    return ($length === false)
      ? utf8_substr($str, $offset)
      : utf8_substr($str, $offset, $length);
  }
  /**
   * UTF-8 aware alternative to strtolower.
   *
   * Makes a string lowercase by delegating to utf8_strtolower().
   * Note: The concept of a character's "case" only exists in some alphabets
   * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
   * not exist in the Chinese alphabet, for example. See Unicode Standard
   * Annex #21: Case Mappings
   *
   * @param   string  $str  String being processed
   *
   * @return  mixed  Either string in lowercase or FALSE if UTF-8 invalid
   *
   * @see     http://www.php.net/strtolower
   * @since   11.1
   */
  public static function strtolower($str)
  {
    $lowered = utf8_strtolower($str);

    return $lowered;
  }
  /**
   * UTF-8 aware alternative to strtoupper.
   *
   * Makes a string uppercase by delegating to utf8_strtoupper().
   * Note: The concept of a character's "case" only exists in some alphabets
   * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
   * not exist in the Chinese alphabet, for example. See Unicode Standard
   * Annex #21: Case Mappings
   *
   * @param   string  $str  String being processed
   *
   * @return  mixed  Either string in uppercase or FALSE if UTF-8 invalid
   *
   * @see     http://www.php.net/strtoupper
   * @since   11.1
   */
  public static function strtoupper($str)
  {
    $raised = utf8_strtoupper($str);

    return $raised;
  }
  /**
   * UTF-8 aware alternative to strlen.
   *
   * Returns the number of characters in the string (NOT THE NUMBER OF BYTES) by delegating
   * to utf8_strlen().
   *
   * @param   string  $str  UTF-8 string.
   *
   * @return  integer  Number of UTF-8 characters in string.
   *
   * @see     http://www.php.net/strlen
   * @since   11.1
   */
  public static function strlen($str)
  {
    $characters = utf8_strlen($str);

    return $characters;
  }
  /**
   * UTF-8 aware alternative to str_ireplace.
   *
   * Case-insensitive version of str_replace. The arguments are forwarded to utf8_ireplace()
   * in the same order as they are received.
   *
   * @param   string   $search   String to search for
   * @param   string   $replace  Replacement string
   * @param   string   $str      String in which the replacement takes place
   * @param   integer  $count    Optional value forwarded as the fourth argument of utf8_ireplace().
   *                             It is received by value, so nothing is written back to the caller.
   *
   * @return  string  UTF-8 String
   *
   * @see     http://www.php.net/str_ireplace
   * @since   11.1
   */
  public static function str_ireplace($search, $replace, $str, $count = null)
  {
    jimport('phputf8.str_ireplace');

    // The parameter defaults to null, so null has to mean "not supplied". Testing only for
    // false (as was done before) made this branch unreachable for default calls. false is
    // still honoured for callers that pass it explicitly.
    if ($count === null || $count === false)
    {
      return utf8_ireplace($search, $replace, $str);
    }

    return utf8_ireplace($search, $replace, $str, $count);
  }
  /**
   * UTF-8 aware alternative to str_split.
   *
   * Converts a string to an array by delegating to utf8_str_split().
   *
   * @param   string   $str        UTF-8 encoded string to process
   * @param   integer  $split_len  Number of characters to split string by
   *
   * @return  array
   *
   * @see     http://www.php.net/str_split
   * @since   11.1
   */
  public static function str_split($str, $split_len = 1)
  {
    // The helper lives in its own phputf8 file and is loaded on demand.
    jimport('phputf8.str_split');

    $chunks = utf8_str_split($str, $split_len);

    return $chunks;
  }
  /**
   * UTF-8/LOCALE aware alternative to strcasecmp.
   *
   * A case insensitive string comparison. When a locale is given, LC_COLLATE is switched to it
   * (it is not switched back afterwards) and the lower-cased strings are compared with strcoll().
   *
   * @param   string  $str1    string 1 to compare
   * @param   string  $str2    string 2 to compare
   * @param   mixed   $locale  The locale used by strcoll or false to use classical comparison
   *
   * @return  integer  < 0 if str1 is less than str2; > 0 if str1 is greater than str2, and 0 if they are equal.
   *
   * @see     http://www.php.net/strcasecmp
   * @see     http://www.php.net/strcoll
   * @see     http://www.php.net/setlocale
   * @since   11.1
   */
  public static function strcasecmp($str1, $str2, $locale = false)
  {
    // No locale requested: classical comparison.
    if (!$locale)
    {
      return utf8_strcasecmp($str1, $str2);
    }

    // Remember the locale in force, then try to switch to the requested one.
    $activeLocale = setlocale(LC_COLLATE, 0);
    $locale = setlocale(LC_COLLATE, $locale);

    if (!$locale)
    {
      // Switching failed, so work with the locale that was already active.
      $locale = $activeLocale;
    }

    // Only a non UTF-8 locale of the form "xx_YY.<digits>" yields a code page to recode to.
    // UTF-8 locales and anything unrecognised are compared without recoding.
    $codePage = null;

    if (!stristr($locale, 'UTF-8') && stristr($locale, '_') && preg_match('~\.(\d+)$~', $locale, $m))
    {
      $codePage = 'CP' . $m[1];
    }

    if ($codePage === null)
    {
      return strcoll(utf8_strtolower($str1), utf8_strtolower($str2));
    }

    return strcoll(
      self::transcode(utf8_strtolower($str1), 'UTF-8', $codePage),
      self::transcode(utf8_strtolower($str2), 'UTF-8', $codePage)
    );
  }
  /**
   * UTF-8/LOCALE aware alternative to strcmp.
   *
   * A case sensitive string comparison. When a locale is given, LC_COLLATE is switched to it
   * (it is not switched back afterwards) and the strings are compared with strcoll().
   *
   * @param   string  $str1    string 1 to compare
   * @param   string  $str2    string 2 to compare
   * @param   mixed   $locale  The locale used by strcoll or false to use classical comparison
   *
   * @return  integer  < 0 if str1 is less than str2; > 0 if str1 is greater than str2, and 0 if they are equal.
   *
   * @see     http://www.php.net/strcmp
   * @see     http://www.php.net/strcoll
   * @see     http://www.php.net/setlocale
   * @since   11.1
   */
  public static function strcmp($str1, $str2, $locale = false)
  {
    // No locale requested: classical comparison.
    if (!$locale)
    {
      return strcmp($str1, $str2);
    }

    // Remember the locale in force, then try to switch to the requested one.
    $activeLocale = setlocale(LC_COLLATE, 0);
    $locale = setlocale(LC_COLLATE, $locale);

    if (!$locale)
    {
      // Switching failed, so work with the locale that was already active.
      $locale = $activeLocale;
    }

    // Only a non UTF-8 locale of the form "xx_YY.<digits>" yields a code page to recode to.
    // UTF-8 locales and anything unrecognised are compared without recoding.
    $codePage = null;

    if (!stristr($locale, 'UTF-8') && stristr($locale, '_') && preg_match('~\.(\d+)$~', $locale, $m))
    {
      $codePage = 'CP' . $m[1];
    }

    if ($codePage === null)
    {
      return strcoll($str1, $str2);
    }

    return strcoll(
      self::transcode($str1, 'UTF-8', $codePage),
      self::transcode($str2, 'UTF-8', $codePage)
    );
  }
  /**
   * UTF-8 aware alternative to strcspn.
   *
   * Finds the length of the initial segment not matching mask by delegating to utf8_strcspn().
   *
   * @param   string   $str     The string to process
   * @param   string   $mask    The mask
   * @param   integer  $start   Optional starting character position (in characters)
   * @param   integer  $length  Optional length
   *
   * @return  integer  The length of the initial segment of str1 which does not contain any of the characters in str2
   *
   * @see     http://www.php.net/strcspn
   * @since   11.1
   */
  public static function strcspn($str, $mask, $start = null, $length = null)
  {
    jimport('phputf8.strcspn');

    // Both parameters default to null, so null has to mean "not supplied" (as strspn() does).
    // false is still honoured for callers that relied on the previous false-only checks.
    $startOmitted = ($start === null || $start === false);
    $lengthOmitted = ($length === null || $length === false);

    if ($startOmitted && $lengthOmitted)
    {
      return utf8_strcspn($str, $mask);
    }

    if ($lengthOmitted)
    {
      return utf8_strcspn($str, $mask, $start);
    }

    return utf8_strcspn($str, $mask, $start, $length);
  }
  /**
   * UTF-8 aware alternative to stristr.
   *
   * Returns all of the haystack from the first occurrence of the needle to the end.
   * Needle and haystack are examined in a case-insensitive manner.
   *
   * @param   string  $str     The haystack
   * @param   string  $search  The needle
   *
   * @return  string  the sub string
   *
   * @see     http://www.php.net/stristr
   * @since   11.1
   */
  public static function stristr($str, $search)
  {
    // The helper lives in its own phputf8 file and is loaded on demand.
    jimport('phputf8.stristr');

    $tail = utf8_stristr($str, $search);

    return $tail;
  }
  /**
   * UTF-8 aware alternative to strrev.
   *
   * Reverses a string by delegating to utf8_strrev().
   *
   * @param   string  $str  String to be reversed
   *
   * @return  string  The string in reverse character order
   *
   * @see     http://www.php.net/strrev
   * @since   11.1
   */
  public static function strrev($str)
  {
    // The helper lives in its own phputf8 file and is loaded on demand.
    jimport('phputf8.strrev');

    $reversed = utf8_strrev($str);

    return $reversed;
  }
  /**
   * UTF-8 aware alternative to strspn.
   *
   * Finds the length of the initial segment matching mask by delegating to utf8_strspn().
   * Only the optional arguments that were actually supplied (non-null) are forwarded.
   *
   * @param   string   $str     The haystack
   * @param   string   $mask    The mask
   * @param   integer  $start   Start optional
   * @param   integer  $length  Length optional
   *
   * @return  integer
   *
   * @see     http://www.php.net/strspn
   * @since   11.1
   */
  public static function strspn($str, $mask, $start = null, $length = null)
  {
    jimport('phputf8.strspn');

    // A supplied length always needs the four argument form, whatever $start is.
    if ($length !== null)
    {
      return utf8_strspn($str, $mask, $start, $length);
    }

    if ($start !== null)
    {
      return utf8_strspn($str, $mask, $start);
    }

    return utf8_strspn($str, $mask);
  }
  /**
   * UTF-8 aware substr_replace.
   *
   * Replaces text within a portion of a string by delegating to utf8_substr_replace().
   *
   * @param   string   $str     The haystack
   * @param   string   $repl    The replacement string
   * @param   integer  $start   Start
   * @param   integer  $length  Length (optional)
   *
   * @return  string
   *
   * @see     http://www.php.net/substr_replace
   * @since   11.1
   */
  public static function substr_replace($str, $repl, $start, $length = null)
  {
    // utf8_substr_replace() is not imported here; per the original note it is
    // loaded by the library loader.

    // The parameter defaults to null, so null has to mean "not supplied". Testing only for
    // false (as was done before) made this branch unreachable for default calls. false is
    // still honoured for callers that pass it explicitly.
    if ($length === null || $length === false)
    {
      return utf8_substr_replace($str, $repl, $start);
    }

    return utf8_substr_replace($str, $repl, $start, $length);
  }
  /**
   * UTF-8 aware replacement for ltrim().
   *
   * Strips whitespace (or other characters) from the beginning of a string.
   * You only need to use this if you are supplying the charlist
   * optional arg and it contains UTF-8 characters. Otherwise ltrim will
   * work normally on a UTF-8 string.
   *
   * An empty charlist other than false returns the string unchanged.
   *
   * @param   string  $str       The string to be trimmed
   * @param   string  $charlist  The optional charlist of additional characters to trim
   *
   * @return  string  The trimmed string
   *
   * @see     http://www.php.net/ltrim
   * @since   11.1
   */
  public static function ltrim($str, $charlist = false)
  {
    // false means "no charlist supplied": trim with the helper's own default.
    if ($charlist === false)
    {
      jimport('phputf8.trim');

      return utf8_ltrim($str);
    }

    // Any other empty charlist leaves nothing to trim.
    if (empty($charlist))
    {
      return $str;
    }

    jimport('phputf8.trim');

    return utf8_ltrim($str, $charlist);
  }
  /**
   * UTF-8 aware replacement for rtrim().
   *
   * Strips whitespace (or other characters) from the end of a string.
   * You only need to use this if you are supplying the charlist
   * optional arg and it contains UTF-8 characters. Otherwise rtrim will
   * work normally on a UTF-8 string.
   *
   * An empty charlist other than false returns the string unchanged.
   *
   * @param   string  $str       The string to be trimmed
   * @param   string  $charlist  The optional charlist of additional characters to trim
   *
   * @return  string  The trimmed string
   *
   * @see     http://www.php.net/rtrim
   * @since   11.1
   */
  public static function rtrim($str, $charlist = false)
  {
    // false means "no charlist supplied": trim with the helper's own default.
    if ($charlist === false)
    {
      jimport('phputf8.trim');

      return utf8_rtrim($str);
    }

    // Any other empty charlist leaves nothing to trim.
    if (empty($charlist))
    {
      return $str;
    }

    jimport('phputf8.trim');

    return utf8_rtrim($str, $charlist);
  }
  /**
   * UTF-8 aware replacement for trim().
   *
   * Strips whitespace (or other characters) from the beginning and end of a string.
   * Note: you only need to use this if you are supplying the charlist
   * optional arg and it contains UTF-8 characters. Otherwise trim will
   * work normally on a UTF-8 string.
   *
   * An empty charlist other than false returns the string unchanged.
   *
   * @param   string  $str       The string to be trimmed
   * @param   string  $charlist  The optional charlist of additional characters to trim
   *
   * @return  string  The trimmed string
   *
   * @see     http://www.php.net/trim
   * @since   11.1
   */
  public static function trim($str, $charlist = false)
  {
    // false means "no charlist supplied": trim with the helper's own default.
    if ($charlist === false)
    {
      jimport('phputf8.trim');

      return utf8_trim($str);
    }

    // Any other empty charlist leaves nothing to trim.
    if (empty($charlist))
    {
      return $str;
    }

    jimport('phputf8.trim');

    return utf8_trim($str, $charlist);
  }
  /**
   * UTF-8 aware alternative to ucfirst.
   *
   * Makes a string's first character uppercase, or the first character of every word when a
   * delimiter is given.
   *
   * @param   string  $str           String to be processed
   * @param   string  $delimiter     The words delimiter (null means do not split the string)
   * @param   string  $newDelimiter  The new words delimiter (null means equal to $delimiter)
   *
   * @return  string  If $delimiter is null, the string with its first character in upper case (if applicable);
   *                  otherwise the string is split on the delimiter, ucfirst is applied to each word
   *                  and the words are joined again with the new delimiter
   *
   * @see     http://www.php.net/ucfirst
   * @since   11.1
   */
  public static function ucfirst($str, $delimiter = null, $newDelimiter = null)
  {
    jimport('phputf8.ucfirst');

    // No delimiter: treat the whole string as a single word.
    if ($delimiter === null)
    {
      return utf8_ucfirst($str);
    }

    $glue = ($newDelimiter === null) ? $delimiter : $newDelimiter;
    $words = explode($delimiter, $str);
    $capitalised = array_map('utf8_ucfirst', $words);

    return implode($glue, $capitalised);
  }
  /**
   * UTF-8 aware alternative to ucwords.
   *
   * Uppercases the first character of each word in a string by delegating to utf8_ucwords().
   *
   * @param   string  $str  String to be processed
   *
   * @return  string  String with first char of each word uppercase
   *
   * @see     http://www.php.net/ucwords
   * @since   11.1
   */
  public static function ucwords($str)
  {
    // The helper lives in its own phputf8 file and is loaded on demand.
    jimport('phputf8.ucwords');

    $titled = utf8_ucwords($str);

    return $titled;
  }
  /**
   * Error handler that converts a reported error into an ErrorException.
   *
   * transcode() installs it for E_NOTICE around its iconv() calls.
   *
   * @param   integer  $number   Error level
   * @param   string   $message  Error message
   *
   * @return  void
   *
   * @link    https://bugs.php.net/bug.php?id=48147
   *
   * @throws  ErrorException
   */
  private static function _iconvErrorHandler($number, $message)
  {
    // The error level is passed as the exception's severity; the exception code stays 0.
    $exception = new ErrorException($message, 0, $number);

    throw $exception;
  }
  /**
   * Transcode a string.
   *
   * The conversion is first attempted with "//TRANSLIT//IGNORE" appended to the target
   * encoding. If that raises an E_NOTICE, it is retried with "//IGNORE" only.
   *
   * @param   string  $source         The string to transcode.
   * @param   string  $from_encoding  The source encoding.
   * @param   string  $to_encoding    The target encoding.
   *
   * @return  mixed  The transcoded string, or null if the source was not a string.
   *
   * @link    https://bugs.php.net/bug.php?id=48147
   *
   * @throws  ErrorException  If the "//IGNORE" fallback conversion also raises an E_NOTICE.
   *
   * @since   11.1
   */
  public static function transcode($source, $from_encoding, $to_encoding)
  {
    if (!is_string($source))
    {
      return null;
    }

    // Turn iconv notices into exceptions so that a failed attempt can be detected.
    set_error_handler(array(__CLASS__, '_iconvErrorHandler'), E_NOTICE);

    try
    {
      /*
       * "//TRANSLIT//IGNORE" is appended to the $to_encoding to ensure that when iconv comes
       * across a character that cannot be represented in the target charset, it can
       * be approximated through one or several similarly looking characters or ignored.
       */
      $iconv = iconv($from_encoding, $to_encoding . '//TRANSLIT//IGNORE', $source);
    }
    catch (ErrorException $e)
    {
      try
      {
        /*
         * "//IGNORE" is appended to the $to_encoding to ensure that when iconv comes
         * across a character that cannot be represented in the target charset, it is ignored.
         */
        $iconv = iconv($from_encoding, $to_encoding . '//IGNORE', $source);
      }
      catch (ErrorException $fallbackError)
      {
        // The handler is still installed while the fallback runs. Remove it before the
        // exception leaves this method, otherwise it would stay registered for the caller.
        restore_error_handler();

        throw $fallbackError;
      }
    }

    restore_error_handler();

    return $iconv;
  }
  /**
   * Tests a string as to whether it's valid UTF-8 and supported by the Unicode standard.
   *
   * The string is scanned octet by octet. It is rejected on an illegal lead octet, a missing
   * continuation octet, a non-shortest form, a sequence longer than four octets, a surrogate
   * code point, a code point above 0x10FFFF, or a multi-octet sequence cut off by the end of
   * the string.
   *
   * Note: this function has been modified to simply return true or false.
   *
   * @param   string  $str  UTF-8 encoded string.
   *
   * @return  boolean  true if valid
   *
   * @author  <hsivonen@iki.fi>
   * @see     http://hsivonen.iki.fi/php-utf8/
   * @see     compliant
   * @since   11.1
   */
  public static function valid($str)
  {
    // Cached expected number of octets after the current octet
    // until the beginning of the next UTF8 character sequence
    $mState = 0;

    // Cached Unicode character
    $mUcs4 = 0;

    // Cached expected number of octets in the current sequence
    $mBytes = 1;

    $len = strlen($str);

    for ($i = 0; $i < $len; $i++)
    {
      // Square brackets are used for the offset: the curly brace form is a parse error in PHP 8.
      $in = ord($str[$i]);

      if ($mState == 0)
      {
        // When mState is zero we expect either a US-ASCII character or a
        // multi-octet sequence.
        if (0 == (0x80 & ($in)))
        {
          // US-ASCII, pass straight through.
          $mBytes = 1;
        }
        elseif (0xC0 == (0xE0 & ($in)))
        {
          // First octet of 2 octet sequence
          $mUcs4 = ($in & 0x1F) << 6;
          $mState = 1;
          $mBytes = 2;
        }
        elseif (0xE0 == (0xF0 & ($in)))
        {
          // First octet of 3 octet sequence
          $mUcs4 = ($in & 0x0F) << 12;
          $mState = 2;
          $mBytes = 3;
        }
        elseif (0xF0 == (0xF8 & ($in)))
        {
          // First octet of 4 octet sequence
          $mUcs4 = ($in & 0x07) << 18;
          $mState = 3;
          $mBytes = 4;
        }
        elseif (0xF8 == (0xFC & ($in)))
        {
          /* First octet of 5 octet sequence.
           *
           * This is illegal because the encoded codepoint must be either
           * (a) not the shortest form or
           * (b) outside the Unicode range of 0-0x10FFFF.
           * Rather than trying to resynchronize, we will carry on until the end
           * of the sequence and let the later error handling code catch it.
           */
          $mUcs4 = ($in & 0x03) << 24;
          $mState = 4;
          $mBytes = 5;
        }
        elseif (0xFC == (0xFE & ($in)))
        {
          // First octet of 6 octet sequence, see comments for 5 octet sequence.
          $mUcs4 = ($in & 1) << 30;
          $mState = 5;
          $mBytes = 6;
        }
        else
        {
          /* Current octet is neither in the US-ASCII range nor a legal first
           * octet of a multi-octet sequence.
           */
          return false;
        }
      }
      else
      {
        // When mState is non-zero, we expect a continuation of the multi-octet
        // sequence
        if (0x80 != (0xC0 & ($in)))
        {
          // Not a continuation octet: the multi-octet sequence is incomplete.
          return false;
        }

        // Legal continuation: merge its six payload bits into the code point.
        $shift = ($mState - 1) * 6;
        $mUcs4 |= ($in & 0x0000003F) << $shift;

        /*
         * End of the multi-octet sequence. mUcs4 now contains the final
         * Unicode codepoint to be output
         */
        if (0 == --$mState)
        {
          /*
           * Check for illegal sequences and codepoints.
           */

          // From Unicode 3.1, non-shortest form is illegal
          if (((2 == $mBytes) && ($mUcs4 < 0x0080)) || ((3 == $mBytes) && ($mUcs4 < 0x0800)) || ((4 == $mBytes) && ($mUcs4 < 0x10000))
            || (4 < $mBytes)
            || (($mUcs4 & 0xFFFFF800) == 0xD800) // From Unicode 3.2, surrogate characters are illegal
            || ($mUcs4 > 0x10FFFF)) // Codepoints outside the Unicode range are illegal
          {
            return false;
          }

          // Initialize UTF8 cache.
          $mState = 0;
          $mUcs4 = 0;
          $mBytes = 1;
        }
      }
    }

    // A non-zero state here means the string ended in the middle of a multi-octet sequence.
    return $mState == 0;
  }
  /**
   * Tests whether a string complies as UTF-8.
   *
   * According to the original notes this is much faster than utf8_is_valid but may pass five
   * and six octet UTF-8 sequences, which are not supported by Unicode and so cannot be
   * displayed correctly in a browser. In other words it is not as strict as utf8_is_valid
   * but it's faster. If you use it to validate user input, you place yourself at the risk
   * that attackers will be able to inject 5 and 6 byte sequences (which may or may not be a
   * significant risk, depending on what you are doing).
   *
   * @param   string  $str  UTF-8 string to check
   *
   * @return  boolean  TRUE if string is valid UTF-8
   *
   * @see     valid
   * @see     http://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805
   * @since   11.1
   */
  public static function compliant($str)
  {
    // An empty string has nothing that could be invalid.
    if (strlen($str) == 0)
    {
      return true;
    }

    // If even just the first character can be matched, when the /u
    // modifier is used, then it's valid UTF-8. If the UTF-8 is somehow
    // invalid, nothing at all will match, even if the string contains
    // some valid sequences
    $matched = preg_match('/^.{1}/us', $str);

    return $matched == 1;
  }
  /**
   * Does a UTF-8 safe version of PHP parse_url function.
   *
   * The URL is url-encoded, the characters that structure a URL are decoded again so that
   * parse_url() can still recognise them, and every parsed component is url-decoded before
   * it is returned. All component values are therefore returned as strings.
   *
   * @param   string  $url  URL to parse
   *
   * @return  mixed  Associative array or false if badly formed URL.
   *
   * @see     http://us3.php.net/manual/en/function.parse-url.php
   * @since   11.1
   */
  public static function parse_url($url)
  {
    // Build arrays of values we need to decode before parsing
    $entities = array('%21', '%2A', '%27', '%28', '%29', '%3B', '%3A', '%40', '%26', '%3D', '%24', '%2C', '%2F', '%3F', '%25', '%23', '%5B',
      '%5D');
    $replacements = array('!', '*', "'", "(", ")", ";", ":", "@", "&", "=", "$", ",", "/", "?", "%", "#", "[", "]");

    // Create encoded URL with special URL characters decoded so it can be parsed
    // All other characters will be encoded
    $encodedURL = str_replace($entities, $replacements, urlencode($url));

    // Parse the encoded URL
    $encodedParts = parse_url($encodedURL);

    // parse_url() reports a badly formed URL with false; pass that on instead of iterating over it.
    if ($encodedParts === false)
    {
      return false;
    }

    // Now, decode each value of the resulting array
    $result = array();

    foreach ($encodedParts as $key => $value)
    {
      $result[$key] = urldecode($value);
    }

    return $result;
  }
  922. }