PageRenderTime 51ms CodeModel.GetById 25ms RepoModel.GetById 0ms app.codeStats 0ms

/system/core/utf8.php

https://github.com/lmorchard/friendfeedarchiver
PHP | 564 lines | 204 code | 39 blank | 321 comment | 9 complexity | 164c080ee500742e9b17aaebef3e5d90 MD5 | raw file
  1. <?php defined('SYSPATH') or die('No direct script access.');
  2. /**
  3. * A port of phputf8 to a unified file/class. Checks PHP status to ensure that
  4. * UTF-8 support is available and normalizes global variables to UTF-8. It also
  5. * provides multi-byte aware replacement string functions.
  6. *
  7. * This file is licensed differently from the rest of Kohana. As a port of
  8. * phputf8, which is LGPL software, this file is released under the LGPL.
  9. *
  10. * PCRE needs to be compiled with UTF-8 support (--enable-utf8).
  11. * Support for Unicode properties is highly recommended (--enable-unicode-properties).
  12. * @see http://php.net/manual/reference.pcre.pattern.modifiers.php
  13. *
  14. * UTF-8 conversion will be much more reliable if the iconv extension is loaded.
  15. * @see http://php.net/iconv
  16. *
  17. * The mbstring extension is highly recommended, but must not be overloading
  18. * string functions.
  19. * @see http://php.net/mbstring
  20. *
  21. * $Id: utf8.php 1847 2008-01-28 20:30:44Z Geert $
  22. *
  23. * @package Core
  24. * @author Kohana Team
  25. * @copyright (c) 2007 Kohana Team
  26. * @copyright (c) 2005 Harry Fuecks
  27. * @license http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt
  28. */
  29. if ( ! preg_match('/^.$/u', 'ñ'))
  30. {
  31. trigger_error
  32. (
  33. '<a href="http://php.net/pcre">PCRE</a> has not been compiled with UTF-8 support. '.
  34. 'See <a href="http://php.net/manual/reference.pcre.pattern.modifiers.php">PCRE Pattern Modifiers</a> '.
  35. 'for more information. This application cannot be run without UTF-8 support.',
  36. E_USER_ERROR
  37. );
  38. }
  39. if ( ! extension_loaded('iconv'))
  40. {
  41. trigger_error
  42. (
  43. 'The <a href="http://php.net/iconv">iconv</a> extension is not loaded. '.
  44. 'Without iconv, strings cannot be properly translated to UTF-8 from user input. '.
  45. 'This application cannot be run without UTF-8 support.',
  46. E_USER_ERROR
  47. );
  48. }
  49. if (extension_loaded('mbstring') AND (ini_get('mbstring.func_overload') & MB_OVERLOAD_STRING))
  50. {
  51. trigger_error
  52. (
  53. 'The <a href="http://php.net/mbstring">mbstring</a> extension is overloading PHP\'s native string functions. '.
  54. 'Disable this by setting mbstring.func_overload to 0, 1, 4 or 5 in php.ini or a .htaccess file.'.
  55. 'This application cannot be run without UTF-8 support.',
  56. E_USER_ERROR
  57. );
  58. }
  59. // Check PCRE support for Unicode properties such as \p and \X.
  60. $ER = error_reporting(0);
  61. define('PCRE_UNICODE_PROPERTIES', (bool) preg_match('/^\pL$/u', 'ñ'));
  62. error_reporting($ER);
  63. // SERVER_UTF8 ? use mb_* functions : use non-native functions
  64. if (extension_loaded('mbstring'))
  65. {
  66. mb_internal_encoding('UTF-8');
  67. define('SERVER_UTF8', TRUE);
  68. }
  69. else
  70. {
  71. define('SERVER_UTF8', FALSE);
  72. }
  73. // Convert all global variables to UTF-8.
  74. $_GET = utf8::clean($_GET);
  75. $_POST = utf8::clean($_POST);
  76. $_COOKIE = utf8::clean($_COOKIE);
  77. $_SERVER = utf8::clean($_SERVER);
  78. if (PHP_SAPI == 'cli')
  79. {
  80. // Convert command line arguments
  81. $_SERVER['argv'] = utf8::clean($_SERVER['argv']);
  82. }
  83. final class utf8 {
  84. /**
  85. * Recursively cleans arrays, objects, and strings. Removes ASCII control
  86. * codes and converts to UTF-8 while silently discarding incompatible
  87. * UTF-8 characters.
  88. *
  89. * @param string string to clean
  90. * @return string
  91. */
  92. public static function clean($str)
  93. {
  94. if (is_array($str) OR is_object($str))
  95. {
  96. foreach($str as $key => $val)
  97. {
  98. $str[self::clean($key)] = self::clean($val);
  99. }
  100. }
  101. elseif (is_string($str) AND $str != '')
  102. {
  103. // iconv is fairly expensive, so it is only used when needed
  104. if ( ! self::is_ascii($str))
  105. {
  106. $str = @iconv('UTF-8', 'UTF-8//IGNORE', $str);
  107. }
  108. $str = self::strip_ascii_ctrl($str);
  109. }
  110. return $str;
  111. }
  112. /**
  113. * Tests whether a string contains only 7bit ASCII bytes. This is used to
  114. * determine when to use native functions or UTF-8 functions.
  115. *
  116. * @param string string to check
  117. * @return bool
  118. */
  119. public static function is_ascii($str)
  120. {
  121. return ! preg_match('/[^\x00-\x7F]/S', $str);
  122. }
  123. /**
  124. * Strips out device control codes in the ASCII range.
  125. *
  126. * @param string string to clean
  127. * @return string
  128. */
  129. public static function strip_ascii_ctrl($str)
  130. {
  131. return preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S', '', $str);
  132. }
  133. /**
  134. * Strips out all non-7bit ASCII bytes.
  135. *
  136. * @param string string to clean
  137. * @return string
  138. */
  139. public static function strip_non_ascii($str)
  140. {
  141. return preg_replace('/[^\x00-\x7F]+/S', '', $str);
  142. }
  143. /**
  144. * Replaces special/accented UTF-8 characters by ASCII-7 'equivalents'.
  145. *
  146. * @author Andreas Gohr <andi@splitbrain.org>
  147. *
  148. * @param string string to transliterate
  149. * @param integer -1 lowercase only, +1 uppercase only, 0 both cases
  150. * @return string
  151. */
  152. public static function transliterate_to_ascii($str, $case = 0)
  153. {
  154. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  155. return _transliterate_to_ascii($str, $case);
  156. }
  157. /**
  158. * Returns the length of the given string.
  159. * @see http://php.net/strlen
  160. *
  161. * @param string string being measured for length
  162. * @return integer
  163. */
  164. public static function strlen($str)
  165. {
  166. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  167. return _strlen($str);
  168. }
  169. /**
  170. * Finds position of first occurrence of a UTF-8 string.
  171. * @see http://php.net/strlen
  172. *
  173. * @author Harry Fuecks <hfuecks@gmail.com>
  174. *
  175. * @param string haystack
  176. * @param string needle
  177. * @param integer offset from which character in haystack to start searching
  178. * @return integer position of needle
  179. * @return boolean FALSE if the needle is not found
  180. */
  181. public static function strpos($str, $search, $offset = 0)
  182. {
  183. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  184. return _strpos($str, $search, $offset);
  185. }
  186. /**
  187. * Finds position of last occurrence of a char in a UTF-8 string.
  188. * @see http://php.net/strrpos
  189. *
  190. * @author Harry Fuecks <hfuecks@gmail.com>
  191. *
  192. * @param string haystack
  193. * @param string needle
  194. * @param integer offset from which character in haystack to start searching
  195. * @return integer position of needle
  196. * @return boolean FALSE if the needle is not found
  197. */
  198. public static function strrpos($str, $search, $offset = 0)
  199. {
  200. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  201. return _strrpos($str, $search, $offset);
  202. }
  203. /**
  204. * Returns part of a UTF-8 string.
  205. * @see http://php.net/substr
  206. *
  207. * @author Chris Smith <chris@jalakai.co.uk>
  208. *
  209. * @param string input string
  210. * @param integer offset
  211. * @param integer length limit
  212. * @return string
  213. */
  214. public static function substr($str, $offset, $length = NULL)
  215. {
  216. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  217. return _substr($str, $offset, $length);
  218. }
  219. /**
  220. * Replaces text within a portion of a UTF-8 string.
  221. * @see http://php.net/substr_replace
  222. *
  223. * @author Harry Fuecks <hfuecks@gmail.com>
  224. *
  225. * @param string input string
  226. * @param string replacement string
  227. * @param integer offset
  228. * @return string
  229. */
  230. public static function substr_replace($str, $replacement, $offset, $length = NULL)
  231. {
  232. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  233. return _substr_replace($str, $replacement, $offset, $length);
  234. }
  235. /**
  236. * Makes a UTF-8 string lowercase.
  237. * @see http://php.net/strtolower
  238. *
  239. * @author Andreas Gohr <andi@splitbrain.org>
  240. *
  241. * @param string mixed case string
  242. * @return string
  243. */
  244. public static function strtolower($str)
  245. {
  246. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  247. return _strtolower($str);
  248. }
  249. /**
  250. * Makes a UTF-8 string uppercase.
  251. * @see http://php.net/strtoupper
  252. *
  253. * @author Andreas Gohr <andi@splitbrain.org>
  254. *
  255. * @param string mixed case string
  256. * @return string
  257. */
  258. public static function strtoupper($str)
  259. {
  260. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  261. return _strtoupper($str);
  262. }
  263. /**
  264. * Makes a UTF-8 string's first character uppercase.
  265. * @see http://php.net/ucfirst
  266. *
  267. * @author Harry Fuecks <hfuecks@gmail.com>
  268. *
  269. * @param string mixed case string
  270. * @return string
  271. */
  272. public static function ucfirst($str)
  273. {
  274. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  275. return _ucfirst($str);
  276. }
  277. /**
  278. * Makes the first character of every word in a UTF-8 string uppercase.
  279. * @see http://php.net/ucwords
  280. *
  281. * @author Harry Fuecks <hfuecks@gmail.com>
  282. *
  283. * @param string mixed case string
  284. * @return string
  285. */
  286. public static function ucwords($str)
  287. {
  288. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  289. return _ucwords($str);
  290. }
  291. /**
  292. * Case-insensitive UTF-8 string comparison.
  293. * @see http://php.net/strcasecmp
  294. *
  295. * @author Harry Fuecks <hfuecks@gmail.com>
  296. *
  297. * @param string string to compare
  298. * @param string string to compare
  299. * @return integer less than 0 if str1 is less than str2
  300. * @return integer greater than 0 if str1 is greater than str2
  301. * @return integer 0 if they are equal
  302. */
  303. public static function strcasecmp($str1, $str2)
  304. {
  305. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  306. return _strcasecmp($str1, $str2);
  307. }
  308. /**
  309. * Returns a string or an array with all occurrences of search in subject (ignoring case).
  310. * replaced with the given replace value.
  311. * @see http://php.net/str_ireplace
  312. *
  313. * @note It's not fast and gets slower if $search and/or $replace are arrays.
  314. * @author Harry Fuecks <hfuecks@gmail.com
  315. *
  316. * @param string|array text to replace
  317. * @param string|array replacement text
  318. * @param string|array subject text
  319. * @param integer number of matched and replaced needles will be returned via this parameter which is passed by reference
  320. * @return string if the input was a string
  321. * @return array if the input was an array
  322. */
  323. public static function str_ireplace($search, $replace, $str, & $count = NULL)
  324. {
  325. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  326. return _str_ireplace($search, $replace, $str, $count);
  327. }
  328. /**
  329. * Case-insenstive UTF-8 version of strstr. Returns all of input string
  330. * from the first occurrence of needle to the end.
  331. * @see http://php.net/stristr
  332. *
  333. * @author Harry Fuecks <hfuecks@gmail.com>
  334. *
  335. * @param string input string
  336. * @param string needle
  337. * @return string matched substring if found
  338. * @return boolean FALSE if the substring was not found
  339. */
  340. public static function stristr($str, $search)
  341. {
  342. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  343. return _stristr($str, $search);
  344. }
  345. /**
  346. * Finds the length of the initial segment matching mask.
  347. * @see http://php.net/strspn
  348. *
  349. * @author Harry Fuecks <hfuecks@gmail.com>
  350. *
  351. * @param string input string
  352. * @param string mask for search
  353. * @param integer start position of the string to examine
  354. * @param integer length of the string to examine
  355. * @return integer length of the initial segment that contains characters in the mask
  356. */
  357. public static function strspn($str, $mask, $offset = NULL, $length = NULL)
  358. {
  359. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  360. return _strspn($str, $mask, $offset, $length);
  361. }
  362. /**
  363. * Finds the length of the initial segment not matching mask.
  364. * @see http://php.net/strcspn
  365. *
  366. * @author Harry Fuecks <hfuecks@gmail.com>
  367. *
  368. * @param string input string
  369. * @param string mask for search
  370. * @param integer start position of the string to examine
  371. * @param integer length of the string to examine
  372. * @return integer length of the initial segment that contains characters not in the mask
  373. */
  374. public static function strcspn($str, $mask, $offset = NULL, $length = NULL)
  375. {
  376. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  377. return _strcspn($str, $mask, $offset, $length);
  378. }
  379. /**
  380. * Pads a UTF-8 string to a certain length with another string.
  381. * @see http://php.net/str_pad
  382. *
  383. * @author Harry Fuecks <hfuecks@gmail.com>
  384. *
  385. * @param string input string
  386. * @param integer desired string length after padding
  387. * @param string string to use as padding
  388. * @param string padding type: STR_PAD_RIGHT, STR_PAD_LEFT, or STR_PAD_BOTH
  389. * @return string
  390. */
  391. public static function str_pad($str, $final_str_length, $pad_str = ' ', $pad_type = STR_PAD_RIGHT)
  392. {
  393. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  394. return _str_pad($str, $final_str_length, $pad_str, $pad_type);
  395. }
  396. /**
  397. * Converts a UTF-8 string to an array.
  398. * @see http://php.net/str_split
  399. *
  400. * @author Harry Fuecks <hfuecks@gmail.com>
  401. *
  402. * @param string input string
  403. * @param integer maximum length of each chunk
  404. * @return array
  405. */
  406. public static function str_split($str, $split_length = 1)
  407. {
  408. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  409. return _str_split($str, $split_length);
  410. }
  411. /**
  412. * Reverses a UTF-8 string.
  413. * @see http://php.net/strrev
  414. *
  415. * @author Harry Fuecks <hfuecks@gmail.com>
  416. *
  417. * @param string string to be reversed
  418. * @return string
  419. */
  420. public static function strrev($str)
  421. {
  422. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  423. return _strrev($str);
  424. }
  425. /**
  426. * Strips whitespace (or other UTF-8 characters) from the beginning and
  427. * end of a string.
  428. * @see http://php.net/trim
  429. *
  430. * @author Andreas Gohr <andi@splitbrain.org>
  431. *
  432. * @param string input string
  433. * @param string string of characters to remove
  434. * @return string
  435. */
  436. public static function trim($str, $charlist = NULL)
  437. {
  438. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  439. return _trim($str, $charlist);
  440. }
  441. /**
  442. * Strips whitespace (or other UTF-8 characters) from the beginning of a string.
  443. * @see http://php.net/ltrim
  444. *
  445. * @author Andreas Gohr <andi@splitbrain.org>
  446. *
  447. * @param string input string
  448. * @param string string of characters to remove
  449. * @return string
  450. */
  451. public static function ltrim($str, $charlist = NULL)
  452. {
  453. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  454. return _ltrim($str, $charlist);
  455. }
  456. /**
  457. * Strips whitespace (or other UTF-8 characters) from the end of a string.
  458. * @see http://php.net/rtrim
  459. *
  460. * @author Andreas Gohr <andi@splitbrain.org>
  461. *
  462. * @param string input string
  463. * @param string string of characters to remove
  464. * @return string
  465. */
  466. public static function rtrim($str, $charlist = NULL)
  467. {
  468. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  469. return _rtrim($str, $charlist);
  470. }
  471. /**
  472. * Returns the unicode ordinal for a character.
  473. * @see http://php.net/ord
  474. *
  475. * @author Harry Fuecks <hfuecks@gmail.com>
  476. *
  477. * @param string UTF-8 encoded character
  478. * @return integer
  479. */
  480. public static function ord($chr)
  481. {
  482. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  483. return _ord($chr);
  484. }
  485. /**
  486. * Takes an UTF-8 string and returns an array of ints representing the Unicode characters.
  487. * Astral planes are supported i.e. the ints in the output can be > 0xFFFF.
  488. * Occurrances of the BOM are ignored. Surrogates are not allowed.
  489. *
  490. * The Original Code is Mozilla Communicator client code.
  491. * The Initial Developer of the Original Code is Netscape Communications Corporation.
  492. * Portions created by the Initial Developer are Copyright (C) 1998 the Initial Developer.
  493. * Ported to PHP by Henri Sivonen <hsivonen@iki.fi>, see http://hsivonen.iki.fi/php-utf8/.
  494. * Slight modifications to fit with phputf8 library by Harry Fuecks <hfuecks@gmail.com>.
  495. *
  496. * @param string UTF-8 encoded string
  497. * @return array unicode code points
  498. * @return boolean FALSE if the string is invalid
  499. */
  500. public static function to_unicode($str)
  501. {
  502. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  503. return _to_unicode($str);
  504. }
  505. /**
  506. * Takes an array of ints representing the Unicode characters and returns a UTF-8 string.
  507. * Astral planes are supported i.e. the ints in the input can be > 0xFFFF.
  508. * Occurrances of the BOM are ignored. Surrogates are not allowed.
  509. *
  510. * The Original Code is Mozilla Communicator client code.
  511. * The Initial Developer of the Original Code is Netscape Communications Corporation.
  512. * Portions created by the Initial Developer are Copyright (C) 1998 the Initial Developer.
  513. * Ported to PHP by Henri Sivonen <hsivonen@iki.fi>, see http://hsivonen.iki.fi/php-utf8/.
  514. * Slight modifications to fit with phputf8 library by Harry Fuecks <hfuecks@gmail.com>.
  515. *
  516. * @param array unicode code points representing a string
  517. * @return string utf8 string of characters
  518. * @return boolean FALSE if a code point cannot be found
  519. */
  520. public static function from_unicode($arr)
  521. {
  522. require_once SYSPATH.'core/utf8/'.__FUNCTION__.EXT;
  523. return _from_unicode($arr);
  524. }
  525. } // End utf8