PageRenderTime 51ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/system/core/utf8.php

https://github.com/Toushi/flow
PHP | 743 lines | 303 code | 90 blank | 350 comment | 32 complexity | 7adf473dff0622287499b853aa12e31e MD5 | raw file
  1. <?php defined('SYSPATH') or die('No direct script access.');
  2. /**
  3. * A port of phputf8 to a unified file/class. Checks PHP status to ensure that
  4. * UTF-8 support is available and normalize global variables to UTF-8. It also
  5. * provides multi-byte aware replacement string functions.
  6. *
  7. * This file is licensed differently from the rest of Kohana. As a port of
  8. * phputf8, which is LGPL software, this file is released under the LGPL.
  9. *
  10. * PCRE needs to be compiled with UTF-8 support (--enable-utf8).
  11. * Support for Unicode properties is highly recommended (--enable-unicode-properties).
  12. * @see http://php.net/manual/reference.pcre.pattern.modifiers.php
  13. *
  14. * UTF-8 conversion will be much more reliable if the iconv extension is loaded.
  15. * @see http://php.net/iconv
  16. *
  17. * The mbstring extension is highly recommended, but must not be overloading
  18. * string functions.
  19. * @see http://php.net/mbstring
  20. *
  21. * $Id: utf8.php 2712 2008-05-29 17:05:06Z Geert $
  22. *
  23. * @package Core
  24. * @author Kohana Team
  25. * @copyright (c) 2007 Kohana Team
  26. * @copyright (c) 2005 Harry Fuecks
  27. * @license http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt
  28. */
  // PCRE must treat a multi-byte UTF-8 character as ONE character under the /u
  // modifier; if "." does not match "ñ" as a single character, UTF-8 mode is missing.
  if ( ! preg_match('/^.$/u', 'ñ'))
  {
      trigger_error
      (
          '<a href="http://php.net/pcre">PCRE</a> has not been compiled with UTF-8 support. '.
          'See <a href="http://php.net/manual/reference.pcre.pattern.modifiers.php">PCRE Pattern Modifiers</a> '.
          'for more information. This application cannot be run without UTF-8 support.',
          E_USER_ERROR
      );
  }

  // iconv is required: utf8::clean() uses it to discard invalid UTF-8 sequences.
  if ( ! extension_loaded('iconv'))
  {
      trigger_error
      (
          'The <a href="http://php.net/iconv">iconv</a> extension is not loaded. '.
          'Without iconv, strings cannot be properly translated to UTF-8 from user input. '.
          'This application cannot be run without UTF-8 support.',
          E_USER_ERROR
      );
  }

  // mbstring may be loaded, but it must not replace PHP's native string functions.
  if (extension_loaded('mbstring') AND (ini_get('mbstring.func_overload') & MB_OVERLOAD_STRING))
  {
      trigger_error
      (
          'The <a href="http://php.net/mbstring">mbstring</a> extension is overloading PHP\'s native string functions. '.
          // A trailing space is needed here so the two sentences do not run together
          'Disable this by setting mbstring.func_overload to 0, 1, 4 or 5 in php.ini or a .htaccess file. '.
          'This application cannot be run without UTF-8 support.',
          E_USER_ERROR
      );
  }

  // Check PCRE support for Unicode properties such as \p and \X.
  // Error reporting is silenced while probing, because the pattern may fail to
  // compile when the support is missing, and restored immediately afterwards.
  $ER = error_reporting(0);
  define('PCRE_UNICODE_PROPERTIES', (bool) preg_match('/^\pL$/u', 'ñ'));
  error_reporting($ER);

  // SERVER_UTF8 ? use mb_* functions : use non-native functions
  if (extension_loaded('mbstring'))
  {
      mb_internal_encoding('UTF-8');
      define('SERVER_UTF8', TRUE);
  }
  else
  {
      define('SERVER_UTF8', FALSE);
  }

  // Convert all global variables to UTF-8.
  $_GET    = utf8::clean($_GET);
  $_POST   = utf8::clean($_POST);
  $_COOKIE = utf8::clean($_COOKIE);
  $_SERVER = utf8::clean($_SERVER);

  if (PHP_SAPI === 'cli')
  {
      // Convert command line arguments
      $_SERVER['argv'] = utf8::clean($_SERVER['argv']);
  }
  /**
   * Static collection of UTF-8 aware string helpers.
   *
   * The cleaning and ASCII helpers are implemented inline. Every other method
   * is a thin wrapper: on first use it loads SYSPATH.'core/utf8/<method>'.EXT
   * and then delegates to the global function "_<method>".
   */
  final class utf8 {

      // Names of the wrapper methods whose implementation file has been loaded
      public static $called = array();

      /**
       * Loads the implementation file for a wrapper method, once per request.
       * The file is expected to define the global function "_$function" that
       * the wrapper calls right after this method returns.
       *
       * @param   string   name of the wrapper method (pass __FUNCTION__)
       * @return  void
       */
      private static function load_function($function)
      {
          if ( ! isset(self::$called[$function]))
          {
              require SYSPATH.'core/utf8/'.$function.EXT;

              // Function has been called
              self::$called[$function] = TRUE;
          }
      }

      /**
       * Recursively cleans arrays, objects, and strings. Removes ASCII control
       * codes and converts to UTF-8 while silently discarding incompatible
       * UTF-8 characters.
       *
       * Arrays are rebuilt so that only the cleaned keys remain; an entry with
       * a dirty key never survives next to its cleaned copy. Objects are
       * modified in place: ArrayAccess objects are written through array
       * syntax, plain objects have their public property values cleaned
       * (property names are left untouched). Any other value is returned as is.
       *
       * @param   mixed   string, array or object to clean
       * @return  mixed   cleaned value of the same type
       */
      public static function clean($str)
      {
          if (is_array($str))
          {
              $cleaned = array();

              foreach ($str as $key => $val)
              {
                  // Recursion!
                  $cleaned[self::clean($key)] = self::clean($val);
              }

              return $cleaned;
          }

          if ($str instanceof ArrayAccess)
          {
              foreach ($str as $key => $val)
              {
                  // Recursion!
                  $str[self::clean($key)] = self::clean($val);
              }

              return $str;
          }

          if (is_object($str))
          {
              // Array syntax is a fatal error on plain objects, so public
              // properties are assigned directly. Traversable objects without
              // ArrayAccess offer no way to write values back and are skipped.
              if ( ! ($str instanceof Traversable))
              {
                  foreach ($str as $key => $val)
                  {
                      // Recursion!
                      $str->$key = self::clean($val);
                  }
              }

              return $str;
          }

          if (is_string($str) AND $str !== '')
          {
              // Remove control characters
              $str = self::strip_ascii_ctrl($str);

              if ( ! self::is_ascii($str))
              {
                  // Mask notices out of the CURRENT level instead of replacing it
                  $ER = error_reporting();
                  error_reporting($ER & ~E_NOTICE);

                  // iconv is expensive, so it is only used when needed
                  $str = iconv('UTF-8', 'UTF-8//IGNORE', $str);

                  // Turn notices back on
                  error_reporting($ER);
              }
          }

          return $str;
      }

      /**
       * Tests whether a string contains only 7bit ASCII bytes. This is used to
       * determine when to use native functions or UTF-8 functions.
       *
       * @param   string   string to check
       * @return  bool
       */
      public static function is_ascii($str)
      {
          return ! preg_match('/[^\x00-\x7F]/S', $str);
      }

      /**
       * Strips out device control codes in the ASCII range. Tab, line feed and
       * carriage return (\x09, \x0A, \x0D) are kept.
       *
       * @param   string   string to clean
       * @return  string
       */
      public static function strip_ascii_ctrl($str)
      {
          return preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S', '', $str);
      }

      /**
       * Strips out all non-7bit ASCII bytes.
       *
       * @param   string   string to clean
       * @return  string
       */
      public static function strip_non_ascii($str)
      {
          return preg_replace('/[^\x00-\x7F]+/S', '', $str);
      }

      /**
       * Replaces special/accented UTF-8 characters by ASCII-7 'equivalents'.
       *
       * @author  Andreas Gohr <andi@splitbrain.org>
       *
       * @param   string   string to transliterate
       * @param   integer  -1 lowercase only, +1 uppercase only, 0 both cases
       * @return  string
       */
      public static function transliterate_to_ascii($str, $case = 0)
      {
          self::load_function(__FUNCTION__);

          return _transliterate_to_ascii($str, $case);
      }

      /**
       * Returns the length of the given string.
       * @see http://php.net/strlen
       *
       * @param   string   string being measured for length
       * @return  integer
       */
      public static function strlen($str)
      {
          self::load_function(__FUNCTION__);

          return _strlen($str);
      }

      /**
       * Finds position of first occurrence of a UTF-8 string.
       * @see http://php.net/strpos
       *
       * @author  Harry Fuecks <hfuecks@gmail.com>
       *
       * @param   string   haystack
       * @param   string   needle
       * @param   integer  offset from which character in haystack to start searching
       * @return  integer  position of needle
       * @return  boolean  FALSE if the needle is not found
       */
      public static function strpos($str, $search, $offset = 0)
      {
          self::load_function(__FUNCTION__);

          return _strpos($str, $search, $offset);
      }

      /**
       * Finds position of last occurrence of a char in a UTF-8 string.
       * @see http://php.net/strrpos
       *
       * @author  Harry Fuecks <hfuecks@gmail.com>
       *
       * @param   string   haystack
       * @param   string   needle
       * @param   integer  offset from which character in haystack to start searching
       * @return  integer  position of needle
       * @return  boolean  FALSE if the needle is not found
       */
      public static function strrpos($str, $search, $offset = 0)
      {
          self::load_function(__FUNCTION__);

          return _strrpos($str, $search, $offset);
      }

      /**
       * Returns part of a UTF-8 string.
       * @see http://php.net/substr
       *
       * @author  Chris Smith <chris@jalakai.co.uk>
       *
       * @param   string   input string
       * @param   integer  offset
       * @param   integer  length limit
       * @return  string
       */
      public static function substr($str, $offset, $length = NULL)
      {
          self::load_function(__FUNCTION__);

          return _substr($str, $offset, $length);
      }

      /**
       * Replaces text within a portion of a UTF-8 string.
       * @see http://php.net/substr_replace
       *
       * @author  Harry Fuecks <hfuecks@gmail.com>
       *
       * @param   string   input string
       * @param   string   replacement string
       * @param   integer  offset
       * @param   integer  length of the portion to replace
       * @return  string
       */
      public static function substr_replace($str, $replacement, $offset, $length = NULL)
      {
          self::load_function(__FUNCTION__);

          return _substr_replace($str, $replacement, $offset, $length);
      }

      /**
       * Makes a UTF-8 string lowercase.
       * @see http://php.net/strtolower
       *
       * @author  Andreas Gohr <andi@splitbrain.org>
       *
       * @param   string   mixed case string
       * @return  string
       */
      public static function strtolower($str)
      {
          self::load_function(__FUNCTION__);

          return _strtolower($str);
      }

      /**
       * Makes a UTF-8 string uppercase.
       * @see http://php.net/strtoupper
       *
       * @author  Andreas Gohr <andi@splitbrain.org>
       *
       * @param   string   mixed case string
       * @return  string
       */
      public static function strtoupper($str)
      {
          self::load_function(__FUNCTION__);

          return _strtoupper($str);
      }

      /**
       * Makes a UTF-8 string's first character uppercase.
       * @see http://php.net/ucfirst
       *
       * @author  Harry Fuecks <hfuecks@gmail.com>
       *
       * @param   string   mixed case string
       * @return  string
       */
      public static function ucfirst($str)
      {
          self::load_function(__FUNCTION__);

          return _ucfirst($str);
      }

      /**
       * Makes the first character of every word in a UTF-8 string uppercase.
       * @see http://php.net/ucwords
       *
       * @author  Harry Fuecks <hfuecks@gmail.com>
       *
       * @param   string   mixed case string
       * @return  string
       */
      public static function ucwords($str)
      {
          self::load_function(__FUNCTION__);

          return _ucwords($str);
      }

      /**
       * Case-insensitive UTF-8 string comparison.
       * @see http://php.net/strcasecmp
       *
       * @author  Harry Fuecks <hfuecks@gmail.com>
       *
       * @param   string   string to compare
       * @param   string   string to compare
       * @return  integer  less than 0 if str1 is less than str2
       * @return  integer  greater than 0 if str1 is greater than str2
       * @return  integer  0 if they are equal
       */
      public static function strcasecmp($str1, $str2)
      {
          self::load_function(__FUNCTION__);

          return _strcasecmp($str1, $str2);
      }

      /**
       * Returns a string or an array with all occurrences of search in subject
       * (ignoring case) replaced with the given replace value.
       * @see http://php.net/str_ireplace
       *
       * @note    It's not fast and gets slower if $search and/or $replace are arrays.
       * @author  Harry Fuecks <hfuecks@gmail.com>
       *
       * @param   string|array  text to replace
       * @param   string|array  replacement text
       * @param   string|array  subject text
       * @param   integer       number of matched and replaced needles will be returned via this parameter which is passed by reference
       * @return  string        if the input was a string
       * @return  array         if the input was an array
       */
      public static function str_ireplace($search, $replace, $str, & $count = NULL)
      {
          self::load_function(__FUNCTION__);

          return _str_ireplace($search, $replace, $str, $count);
      }

      /**
       * Case-insensitive UTF-8 version of strstr. Returns all of input string
       * from the first occurrence of needle to the end.
       * @see http://php.net/stristr
       *
       * @author  Harry Fuecks <hfuecks@gmail.com>
       *
       * @param   string   input string
       * @param   string   needle
       * @return  string   matched substring if found
       * @return  boolean  FALSE if the substring was not found
       */
      public static function stristr($str, $search)
      {
          self::load_function(__FUNCTION__);

          return _stristr($str, $search);
      }

      /**
       * Finds the length of the initial segment matching mask.
       * @see http://php.net/strspn
       *
       * @author  Harry Fuecks <hfuecks@gmail.com>
       *
       * @param   string   input string
       * @param   string   mask for search
       * @param   integer  start position of the string to examine
       * @param   integer  length of the string to examine
       * @return  integer  length of the initial segment that contains characters in the mask
       */
      public static function strspn($str, $mask, $offset = NULL, $length = NULL)
      {
          self::load_function(__FUNCTION__);

          return _strspn($str, $mask, $offset, $length);
      }

      /**
       * Finds the length of the initial segment not matching mask.
       * @see http://php.net/strcspn
       *
       * @author  Harry Fuecks <hfuecks@gmail.com>
       *
       * @param   string   input string
       * @param   string   mask for search
       * @param   integer  start position of the string to examine
       * @param   integer  length of the string to examine
       * @return  integer  length of the initial segment that contains characters not in the mask
       */
      public static function strcspn($str, $mask, $offset = NULL, $length = NULL)
      {
          self::load_function(__FUNCTION__);

          return _strcspn($str, $mask, $offset, $length);
      }

      /**
       * Pads a UTF-8 string to a certain length with another string.
       * @see http://php.net/str_pad
       *
       * @author  Harry Fuecks <hfuecks@gmail.com>
       *
       * @param   string   input string
       * @param   integer  desired string length after padding
       * @param   string   string to use as padding
       * @param   string   padding type: STR_PAD_RIGHT, STR_PAD_LEFT, or STR_PAD_BOTH
       * @return  string
       */
      public static function str_pad($str, $final_str_length, $pad_str = ' ', $pad_type = STR_PAD_RIGHT)
      {
          self::load_function(__FUNCTION__);

          return _str_pad($str, $final_str_length, $pad_str, $pad_type);
      }

      /**
       * Converts a UTF-8 string to an array.
       * @see http://php.net/str_split
       *
       * @author  Harry Fuecks <hfuecks@gmail.com>
       *
       * @param   string   input string
       * @param   integer  maximum length of each chunk
       * @return  array
       */
      public static function str_split($str, $split_length = 1)
      {
          self::load_function(__FUNCTION__);

          return _str_split($str, $split_length);
      }

      /**
       * Reverses a UTF-8 string.
       * @see http://php.net/strrev
       *
       * @author  Harry Fuecks <hfuecks@gmail.com>
       *
       * @param   string   string to be reversed
       * @return  string
       */
      public static function strrev($str)
      {
          self::load_function(__FUNCTION__);

          return _strrev($str);
      }

      /**
       * Strips whitespace (or other UTF-8 characters) from the beginning and
       * end of a string.
       * @see http://php.net/trim
       *
       * @author  Andreas Gohr <andi@splitbrain.org>
       *
       * @param   string   input string
       * @param   string   string of characters to remove
       * @return  string
       */
      public static function trim($str, $charlist = NULL)
      {
          self::load_function(__FUNCTION__);

          return _trim($str, $charlist);
      }

      /**
       * Strips whitespace (or other UTF-8 characters) from the beginning of a string.
       * @see http://php.net/ltrim
       *
       * @author  Andreas Gohr <andi@splitbrain.org>
       *
       * @param   string   input string
       * @param   string   string of characters to remove
       * @return  string
       */
      public static function ltrim($str, $charlist = NULL)
      {
          self::load_function(__FUNCTION__);

          return _ltrim($str, $charlist);
      }

      /**
       * Strips whitespace (or other UTF-8 characters) from the end of a string.
       * @see http://php.net/rtrim
       *
       * @author  Andreas Gohr <andi@splitbrain.org>
       *
       * @param   string   input string
       * @param   string   string of characters to remove
       * @return  string
       */
      public static function rtrim($str, $charlist = NULL)
      {
          self::load_function(__FUNCTION__);

          return _rtrim($str, $charlist);
      }

      /**
       * Returns the unicode ordinal for a character.
       * @see http://php.net/ord
       *
       * @author  Harry Fuecks <hfuecks@gmail.com>
       *
       * @param   string   UTF-8 encoded character
       * @return  integer
       */
      public static function ord($chr)
      {
          self::load_function(__FUNCTION__);

          return _ord($chr);
      }

      /**
       * Takes an UTF-8 string and returns an array of ints representing the Unicode characters.
       * Astral planes are supported i.e. the ints in the output can be > 0xFFFF.
       * Occurrences of the BOM are ignored. Surrogates are not allowed.
       *
       * The Original Code is Mozilla Communicator client code.
       * The Initial Developer of the Original Code is Netscape Communications Corporation.
       * Portions created by the Initial Developer are Copyright (C) 1998 the Initial Developer.
       * Ported to PHP by Henri Sivonen <hsivonen@iki.fi>, see http://hsivonen.iki.fi/php-utf8/.
       * Slight modifications to fit with phputf8 library by Harry Fuecks <hfuecks@gmail.com>.
       *
       * @param   string   UTF-8 encoded string
       * @return  array    unicode code points
       * @return  boolean  FALSE if the string is invalid
       */
      public static function to_unicode($str)
      {
          self::load_function(__FUNCTION__);

          return _to_unicode($str);
      }

      /**
       * Takes an array of ints representing the Unicode characters and returns a UTF-8 string.
       * Astral planes are supported i.e. the ints in the input can be > 0xFFFF.
       * Occurrences of the BOM are ignored. Surrogates are not allowed.
       *
       * The Original Code is Mozilla Communicator client code.
       * The Initial Developer of the Original Code is Netscape Communications Corporation.
       * Portions created by the Initial Developer are Copyright (C) 1998 the Initial Developer.
       * Ported to PHP by Henri Sivonen <hsivonen@iki.fi>, see http://hsivonen.iki.fi/php-utf8/.
       * Slight modifications to fit with phputf8 library by Harry Fuecks <hfuecks@gmail.com>.
       *
       * @param   array    unicode code points representing a string
       * @return  string   utf8 string of characters
       * @return  boolean  FALSE if a code point cannot be found
       */
      public static function from_unicode($arr)
      {
          self::load_function(__FUNCTION__);

          return _from_unicode($arr);
      }

  } // End utf8