PageRenderTime 50ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/codeigniter/utf8.php

https://github.com/shivercube/kcPHP
PHP | 835 lines | 462 code | 85 blank | 288 comment | 59 complexity | c600af57edbdbbdb2b711587a28ab5fb MD5 | raw file
  1. <?php
  2. /**
  3. * kcPHP
  4. *
  5. * An open source application development framework for PHP 5.3.0 or newer
  6. *
  7. * @package kcPHP
  8. * @subpackage codeigniter
  9. * @author ExpressionEngine Dev Team
  10. * @modified ShiverCube - Removed PHP4 compatibility, and added a few framework tweaks
  11. * @copyright Copyright (c) 2008 - 2010, EllisLab, Inc.
  12. * @license http://codeigniter.com/user_guide/license.html
  13. * @link http://codeigniter.com
  14. * @since Version 1.0
  15. * @filesource
  16. */
  17. // ------------------------------------------------------------------------
  18. /**
  19. * CodeIgniter UTF-8 functions
  20. *
  21. * @author ShiverCube
  22. */
  23. // ------------------------------------------------------------------------
// Everything below is declared in the `utf8` namespace. Inside this file an
// unqualified call such as strlen() therefore resolves to the UTF-8 aware
// function defined here; the native function is reached with a leading
// backslash (\strlen()).
namespace utf8;
// Make UTF-8 the default encoding for mb_* calls that do not pass an encoding
// explicitly. This runs when the file is loaded and changes the setting for
// the whole process, not just for this namespace.
\mb_internal_encoding('UTF-8');
  26. /**
  27. * Recursively cleans arrays, objects, and strings. Removes ASCII control
  28. * codes and converts to UTF-8 while silently discarding incompatible
  29. * UTF-8 characters.
  30. *
  31. * @param string string to clean
  32. * @return string
  33. */
  34. function clean($str)
  35. {
  36. if (is_array($str) OR is_object($str))
  37. {
  38. foreach ($str as $key => $val)
  39. {
  40. // Recursion!
  41. $str[clean($key)] = clean($val);
  42. }
  43. }
  44. elseif (is_string($str) AND $str !== '')
  45. {
  46. // Remove control characters
  47. $str = strip_ascii_ctrl($str);
  48. if ( ! is_ascii($str))
  49. {
  50. // iconv is expensive, so it is only used when needed
  51. $str = @iconv('UTF-8', 'UTF-8//IGNORE', $str); // Disable notices with @
  52. }
  53. }
  54. return $str;
  55. }
  56. /**
  57. * Tests whether a string contains only 7bit ASCII bytes. This is used to
  58. * determine when to use native functions or UTF-8 functions.
  59. *
  60. * @param string string to check
  61. * @return bool
  62. */
  63. function is_ascii($str)
  64. {
  65. return ! preg_match('/[^\x00-\x7F]/S', $str);
  66. }
  67. /**
  68. * Returns the length of the given string
  69. *
  70. * @param string $str The string being measured for length
  71. * @return The length of the string on success, and 0 if the string is empty
  72. */
  73. function strlen($str)
  74. {
  75. return \mb_strlen($str);
  76. }
  77. /**
  78. * Strips out device control codes in the ASCII range.
  79. *
  80. * @param string string to clean
  81. * @return string
  82. */
  83. function strip_ascii_ctrl($str)
  84. {
  85. return preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S', '', $str);
  86. }
  87. /**
  88. * Strips out all non-7bit ASCII bytes.
  89. *
  90. * @param string string to clean
  91. * @return string
  92. */
  93. function strip_non_ascii($str)
  94. {
  95. return preg_replace('/[^\x00-\x7F]+/S', '', $str);
  96. }
  97. /**
  98. * Replaces special/accented UTF-8 characters by ASCII-7 'equivalents'.
  99. *
  100. * @author Andreas Gohr <andi@splitbrain.org>
  101. *
  102. * @param string string to transliterate
  103. * @param integer -1 lowercase only, +1 uppercase only, 0 both cases
  104. * @return string
  105. */
  106. function transliterate_to_ascii($str, $case = 0)
  107. {
  108. static $UTF8_LOWER_ACCENTS = NULL;
  109. static $UTF8_UPPER_ACCENTS = NULL;
  110. if ($case <= 0)
  111. {
  112. if ($UTF8_LOWER_ACCENTS === NULL)
  113. {
  114. $UTF8_LOWER_ACCENTS = array(
  115. 'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
  116. 'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
  117. 'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
  118. 'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
  119. 'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
  120. 'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
  121. 'ū' => 'u', 'č' => 'c', 'ö' => 'o', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
  122. 'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
  123. 'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
  124. 'ŗ' => 'r', 'ä' => 'a', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'u', 'ò' => 'o',
  125. 'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
  126. 'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
  127. 'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
  128. 'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
  129. 'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
  130. );
  131. }
  132. $str = str_replace(
  133. array_keys($UTF8_LOWER_ACCENTS),
  134. array_values($UTF8_LOWER_ACCENTS),
  135. $str
  136. );
  137. }
  138. if ($case >= 0)
  139. {
  140. if ($UTF8_UPPER_ACCENTS === NULL)
  141. {
  142. $UTF8_UPPER_ACCENTS = array(
  143. 'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  144. 'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K', 'Ĕ' => 'E',
  145. 'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  146. 'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  147. 'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  148. 'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  149. 'Ū' => 'U', 'Č' => 'C', 'Ö' => 'O', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  150. 'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  151. 'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  152. 'Ŗ' => 'R', 'Ä' => 'A', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'U', 'Ò' => 'O',
  153. 'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  154. 'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  155. 'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  156. 'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  157. 'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae',
  158. );
  159. }
  160. $str = str_replace(
  161. array_keys($UTF8_UPPER_ACCENTS),
  162. array_values($UTF8_UPPER_ACCENTS),
  163. $str
  164. );
  165. }
  166. return $str;
  167. }
  168. /**
  169. * Strips whitespace (or other UTF-8 characters) from the beginning of a string.
  170. * @see http://php.net/ltrim
  171. *
  172. * @author Andreas Gohr <andi@splitbrain.org>
  173. *
  174. * @param string input string
  175. * @param string string of characters to remove
  176. * @return string
  177. */
  178. function ltrim($str, $charlist = NULL)
  179. {
  180. if ($charlist === NULL)
  181. {
  182. return \ltrim($str);
  183. }
  184. if (is_ascii($charlist))
  185. {
  186. return \ltrim($str, $charlist);
  187. }
  188. $charlist = preg_replace('#[-\[\]:\\\\^/]#', '\\\\$0', $charlist);
  189. return preg_replace('/^['.$charlist.']+/u', '', $str);
  190. }
  191. /**
  192. * Strips whitespace (or other UTF-8 characters) from the end of a string.
  193. * @see http://php.net/rtrim
  194. *
  195. * @author Andreas Gohr <andi@splitbrain.org>
  196. *
  197. * @param string input string
  198. * @param string string of characters to remove
  199. * @return string
  200. */
  201. function rtrim($str, $charlist = NULL)
  202. {
  203. if ($charlist === NULL)
  204. {
  205. return \rtrim($str);
  206. }
  207. if (is_ascii($charlist))
  208. {
  209. return \rtrim($str, $charlist);
  210. }
  211. $charlist = preg_replace('#[-\[\]:\\\\^/]#', '\\\\$0', $charlist);
  212. return preg_replace('/['.$charlist.']++$/uD', '', $str);
  213. }
  214. /**
  215. * Strips whitespace (or other UTF-8 characters) from the beginning and
  216. * end of a string.
  217. * @see http://php.net/trim
  218. *
  219. * @author Andreas Gohr <andi@splitbrain.org>
  220. *
  221. * @param string input string
  222. * @param string string of characters to remove
  223. * @return string
  224. */
  225. function trim($str, $charlist = NULL)
  226. {
  227. if ($charlist === NULL)
  228. {
  229. return \trim($str);
  230. }
  231. return ltrim(rtrim($str, $charlist), $charlist);
  232. }
  233. /**
  234. * Returns part of a UTF-8 string.
  235. * @see http://php.net/substr
  236. *
  237. * @author Chris Smith <chris@jalakai.co.uk>
  238. *
  239. * @param string input string
  240. * @param integer offset
  241. * @param integer length limit
  242. * @return string
  243. */
  244. function substr($str, $offset, $length = NULL)
  245. {
  246. return ($length === NULL) ? \mb_substr($str, $offset) : \mb_substr($str, $offset, $length);
  247. }
  248. /**
  249. * Takes an UTF-8 string and returns an array of ints representing the Unicode characters.
  250. * Astral planes are supported i.e. the ints in the output can be > 0xFFFF.
  251. * Occurrances of the BOM are ignored. Surrogates are not allowed.
  252. *
  253. * The Original Code is Mozilla Communicator client code.
  254. * The Initial Developer of the Original Code is Netscape Communications Corporation.
  255. * Portions created by the Initial Developer are Copyright (C) 1998 the Initial Developer.
  256. * Ported to PHP by Henri Sivonen <hsivonen@iki.fi>, see http://hsivonen.iki.fi/php-utf8/.
  257. * Slight modifications to fit with phputf8 library by Harry Fuecks <hfuecks@gmail.com>.
  258. *
  259. * @param string UTF-8 encoded string
  260. * @return array unicode code points
  261. * @return boolean FALSE if the string is invalid
  262. */
  263. function to_unicode($str)
  264. {
  265. $mState = 0; // cached expected number of octets after the current octet until the beginning of the next
  266. // UTF8 character sequence
  267. $mUcs4 = 0; // cached Unicode character
  268. $mBytes = 1; // cached expected number of octets in the current sequence
  269. $out = array();
  270. $len = \strlen($str);
  271. for ($i = 0; $i < $len; $i++)
  272. {
  273. $in = \ord($str[$i]);
  274. if ($mState == 0)
  275. {
  276. // When mState is zero we expect either a US-ASCII character or a
  277. // multi-octet sequence.
  278. if (0 == (0x80 & $in))
  279. {
  280. // US-ASCII, pass straight through.
  281. $out[] = $in;
  282. $mBytes = 1;
  283. }
  284. elseif (0xC0 == (0xE0 & $in))
  285. {
  286. // First octet of 2 octet sequence
  287. $mUcs4 = $in;
  288. $mUcs4 = ($mUcs4 & 0x1F) << 6;
  289. $mState = 1;
  290. $mBytes = 2;
  291. }
  292. elseif (0xE0 == (0xF0 & $in))
  293. {
  294. // First octet of 3 octet sequence
  295. $mUcs4 = $in;
  296. $mUcs4 = ($mUcs4 & 0x0F) << 12;
  297. $mState = 2;
  298. $mBytes = 3;
  299. }
  300. elseif (0xF0 == (0xF8 & $in))
  301. {
  302. // First octet of 4 octet sequence
  303. $mUcs4 = $in;
  304. $mUcs4 = ($mUcs4 & 0x07) << 18;
  305. $mState = 3;
  306. $mBytes = 4;
  307. }
  308. elseif (0xF8 == (0xFC & $in))
  309. {
  310. // First octet of 5 octet sequence.
  311. //
  312. // This is illegal because the encoded codepoint must be either
  313. // (a) not the shortest form or
  314. // (b) outside the Unicode range of 0-0x10FFFF.
  315. // Rather than trying to resynchronize, we will carry on until the end
  316. // of the sequence and let the later error handling code catch it.
  317. $mUcs4 = $in;
  318. $mUcs4 = ($mUcs4 & 0x03) << 24;
  319. $mState = 4;
  320. $mBytes = 5;
  321. }
  322. elseif (0xFC == (0xFE & $in))
  323. {
  324. // First octet of 6 octet sequence, see comments for 5 octet sequence.
  325. $mUcs4 = $in;
  326. $mUcs4 = ($mUcs4 & 1) << 30;
  327. $mState = 5;
  328. $mBytes = 6;
  329. }
  330. else
  331. {
  332. // Current octet is neither in the US-ASCII range nor a legal first octet of a multi-octet sequence
  333. \trigger_error('to_unicode: Illegal sequence identifier in UTF-8 at byte '.$i,
  334. E_USER_WARNING);
  335. return FALSE;
  336. }
  337. }
  338. else
  339. {
  340. // When mState is non-zero, we expect a continuation of the multi-octet sequence
  341. if (0x80 == (0xC0 & $in))
  342. {
  343. // Legal continuation
  344. $shift = ($mState - 1) * 6;
  345. $tmp = $in;
  346. $tmp = ($tmp & 0x0000003F) << $shift;
  347. $mUcs4 |= $tmp;
  348. // End of the multi-octet sequence. mUcs4 now contains the final Unicode codepoint to be output
  349. if (0 == --$mState)
  350. {
  351. // Check for illegal sequences and codepoints
  352. // From Unicode 3.1, non-shortest form is illegal
  353. if (((2 == $mBytes) AND ($mUcs4 < 0x0080)) OR
  354. ((3 == $mBytes) AND ($mUcs4 < 0x0800)) OR
  355. ((4 == $mBytes) AND ($mUcs4 < 0x10000)) OR
  356. (4 < $mBytes) OR
  357. // From Unicode 3.2, surrogate characters are illegal
  358. (($mUcs4 & 0xFFFFF800) == 0xD800) OR
  359. // Codepoints outside the Unicode range are illegal
  360. ($mUcs4 > 0x10FFFF))
  361. {
  362. \trigger_error('to_unicode: Illegal sequence or codepoint in UTF-8 at byte '.$i,
  363. E_USER_WARNING);
  364. return FALSE;
  365. }
  366. if (0xFEFF != $mUcs4)
  367. {
  368. // BOM is legal but we don't want to output it
  369. $out[] = $mUcs4;
  370. }
  371. // Initialize UTF-8 cache
  372. $mState = 0;
  373. $mUcs4 = 0;
  374. $mBytes = 1;
  375. }
  376. }
  377. else
  378. {
  379. // ((0xC0 & (*in) != 0x80) AND (mState != 0))
  380. // Incomplete multi-octet sequence
  381. \trigger_error('to_unicode: Incomplete multi-octet sequence in UTF-8 at byte '.$i,
  382. E_USER_WARNING);
  383. return FALSE;
  384. }
  385. }
  386. }
  387. return $out;
  388. }
  389. /**
  390. * Converts an array of unicode characters to a string of HTML character entities
  391. *
  392. * @param array The unicode characters to convert
  393. * @return string The string of HTML entities
  394. */
  395. function to_entities($unicode)
  396. {
  397. $entities = '';
  398. foreach ($unicode as $value)
  399. {
  400. $entities .= "&#{$value};";
  401. }
  402. return $entities;
  403. }
  404. /**
  405. * Converts an array of unicode characters to a string of HTML character entities, preserving all existing ASCII
  406. * characters
  407. */
  408. function to_entities_preserving_ascii($unicode)
  409. {
  410. $entities = '';
  411. foreach ($unicode as $value)
  412. {
  413. if ($value > 127)
  414. {
  415. $entities .= "&#{$value};";
  416. }
  417. else
  418. {
  419. $entities .= chr($value);
  420. }
  421. }
  422. return $entities;
  423. }
  424. /**
  425. * Takes an array of ints representing the Unicode characters and returns a UTF-8 string.
  426. * Astral planes are supported i.e. the ints in the input can be > 0xFFFF.
  427. * Occurrances of the BOM are ignored. Surrogates are not allowed.
  428. *
  429. * The Original Code is Mozilla Communicator client code.
  430. * The Initial Developer of the Original Code is Netscape Communications Corporation.
  431. * Portions created by the Initial Developer are Copyright (C) 1998 the Initial Developer.
  432. * Ported to PHP by Henri Sivonen <hsivonen@iki.fi>, see http://hsivonen.iki.fi/php-utf8/.
  433. * Slight modifications to fit with phputf8 library by Harry Fuecks <hfuecks@gmail.com>.
  434. *
  435. * @param array unicode code points representing a string
  436. * @return string utf8 string of characters
  437. * @return boolean FALSE if a code point cannot be found
  438. */
  439. function from_unicode($arr)
  440. {
  441. ob_start();
  442. $keys = array_keys($arr);
  443. foreach ($keys as $k)
  444. {
  445. // ASCII range (including control chars)
  446. if (($arr[$k] >= 0) AND ($arr[$k] <= 0x007f))
  447. {
  448. echo chr($arr[$k]);
  449. }
  450. // 2 byte sequence
  451. elseif ($arr[$k] <= 0x07ff)
  452. {
  453. echo chr(0xc0 | ($arr[$k] >> 6));
  454. echo chr(0x80 | ($arr[$k] & 0x003f));
  455. }
  456. // Byte order mark (skip)
  457. elseif ($arr[$k] == 0xFEFF)
  458. {
  459. // nop -- zap the BOM
  460. }
  461. // Test for illegal surrogates
  462. elseif ($arr[$k] >= 0xD800 AND $arr[$k] <= 0xDFFF)
  463. {
  464. // Found a surrogate
  465. \trigger_error('from_unicode: Illegal surrogate at index: '.$k.', value: '.$arr[$k],
  466. E_USER_WARNING);
  467. return FALSE;
  468. }
  469. // 3 byte sequence
  470. elseif ($arr[$k] <= 0xffff)
  471. {
  472. echo chr(0xe0 | ($arr[$k] >> 12));
  473. echo chr(0x80 | (($arr[$k] >> 6) & 0x003f));
  474. echo chr(0x80 | ($arr[$k] & 0x003f));
  475. }
  476. // 4 byte sequence
  477. elseif ($arr[$k] <= 0x10ffff)
  478. {
  479. echo chr(0xf0 | ($arr[$k] >> 18));
  480. echo chr(0x80 | (($arr[$k] >> 12) & 0x3f));
  481. echo chr(0x80 | (($arr[$k] >> 6) & 0x3f));
  482. echo chr(0x80 | ($arr[$k] & 0x3f));
  483. }
  484. // Out of range
  485. else
  486. {
  487. \trigger_error('from_unicode: Codepoint out of Unicode range at index: '.$k.', value: '.$arr[$k],
  488. E_USER_WARNING);
  489. return FALSE;
  490. }
  491. }
  492. $result = ob_get_contents();
  493. ob_end_clean();
  494. return $result;
  495. }
  496. /**
  497. * Makes a UTF-8 string lowercase.
  498. * @see http://php.net/strtolower
  499. *
  500. * @author Andreas Gohr <andi@splitbrain.org>
  501. *
  502. * @param string mixed case string
  503. * @return string
  504. */
  505. function strtolower($str)
  506. {
  507. return \mb_strtolower($str);
  508. }
  509. /**
  510. * Makes a UTF-8 string uppercase.
  511. * @see http://php.net/strtoupper
  512. *
  513. * @author Andreas Gohr <andi@splitbrain.org>
  514. *
  515. * @param string mixed case string
  516. * @return string
  517. */
  518. function strtoupper($str)
  519. {
  520. return \mb_strtoupper($str);
  521. }
  522. /**
  523. * Replaces text within a portion of a UTF-8 string.
  524. * @see http://php.net/substr_replace
  525. *
  526. * @author Harry Fuecks <hfuecks@gmail.com>
  527. *
  528. * @param string input string
  529. * @param string replacement string
  530. * @param integer offset
  531. * @return string
  532. */
  533. function substr_replace($str, $replacement, $offset, $length = NULL)
  534. {
  535. if (is_ascii($str))
  536. {
  537. return ($length === NULL) ?
  538. \substr_replace($str, $replacement, $offset) : \substr_replace($str, $replacement, $offset, $length);
  539. }
  540. $length = ($length === NULL) ? strlen($str) : (int)$length;
  541. preg_match_all('/./us', $str, $str_array);
  542. preg_match_all('/./us', $replacement, $replacement_array);
  543. array_splice($str_array[0], $offset, $length, $replacement_array[0]);
  544. return implode('', $str_array[0]);
  545. }
  546. /**
  547. * Makes a UTF-8 string's first character uppercase.
  548. * @see http://php.net/ucfirst
  549. *
  550. * @author Harry Fuecks <hfuecks@gmail.com>
  551. *
  552. * @param string mixed case string
  553. * @return string
  554. */
  555. function ucfirst($str)
  556. {
  557. if (is_ascii($str))
  558. {
  559. return \ucfirst($str);
  560. }
  561. preg_match('/^(.?)(.*)$/us', $str, $matches);
  562. return strtoupper($matches[1]).$matches[2];
  563. }
  564. /**
  565. * Makes the first character of every word in a UTF-8 string uppercase.
  566. * @see http://php.net/ucwords
  567. *
  568. * @author Harry Fuecks <hfuecks@gmail.com>
  569. *
  570. * @param string mixed case string
  571. * @return string
  572. */
  573. function ucwords($str)
  574. {
  575. return \mb_convert_case($str, MB_CASE_TITLE);
  576. }
  577. /**
  578. * Case-insensitive UTF-8 string comparison.
  579. * @see http://php.net/strcasecmp
  580. *
  581. * @author Harry Fuecks <hfuecks@gmail.com>
  582. *
  583. * @param string string to compare
  584. * @param string string to compare
  585. * @return integer less than 0 if str1 is less than str2
  586. * @return integer greater than 0 if str1 is greater than str2
  587. * @return integer 0 if they are equal
  588. */
  589. function strcasecmp($str1, $str2)
  590. {
  591. if (is_ascii($str1) AND is_ascii($str2))
  592. {
  593. return \strcasecmp($str1, $str2);
  594. }
  595. $str1 = strtolower($str1);
  596. $str2 = strtolower($str2);
  597. return strcmp($str1, $str2);
  598. }
  599. /**
  600. * Finds the length of the initial segment not matching mask.
  601. * @see http://php.net/strcspn
  602. *
  603. * @author Harry Fuecks <hfuecks@gmail.com>
  604. *
  605. * @param string input string
  606. * @param string mask for search
  607. * @param integer start position of the string to examine
  608. * @param integer length of the string to examine
  609. * @return integer length of the initial segment that contains characters not in the mask
  610. */
  611. function strcspn($str, $mask, $offset = NULL, $length = NULL)
  612. {
  613. if ($str == '' OR $mask == '')
  614. {
  615. return 0;
  616. }
  617. if (is_ascii($str) AND is_ascii($mask))
  618. {
  619. return ($offset === NULL) ? \strcspn($str, $mask) :
  620. (($length === NULL) ? \strcspn($str, $mask, $offset) : \strcspn($str, $mask, $offset, $length));
  621. }
  622. if ($str !== NULL OR $length !== NULL)
  623. {
  624. $str = substr($str, $offset, $length);
  625. }
  626. // Escape these characters: - [ ] . : \ ^ /
  627. // The . and : are escaped to prevent possible warnings about POSIX regex elements
  628. $mask = preg_replace('#[-[\].:\\\\^/]#', '\\\\$0', $mask);
  629. preg_match('/^[^'.$mask.']+/u', $str, $matches);
  630. return isset($matches[0]) ? strlen($matches[0]) : 0;
  631. }
  632. /**
  633. * Converts a UTF-8 string to an array.
  634. * @see http://php.net/str_split
  635. *
  636. * @author Harry Fuecks <hfuecks@gmail.com>
  637. *
  638. * @param string input string
  639. * @param integer maximum length of each chunk
  640. * @return array
  641. */
  642. function str_split($str, $split_length = 1)
  643. {
  644. $split_length = (int) $split_length;
  645. if (is_ascii($str))
  646. {
  647. return \str_split($str, $split_length);
  648. }
  649. if ($split_length < 1)
  650. {
  651. return FALSE;
  652. }
  653. if (strlen($str) <= $split_length)
  654. {
  655. return array($str);
  656. }
  657. preg_match_all('/.{'.$split_length.'}|[^\x00]{1,'.$split_length.'}$/us', $str, $matches);
  658. return $matches[0];
  659. }
  660. /**
  661. * Reverses a UTF-8 string.
  662. * @see http://php.net/strrev
  663. *
  664. * @author Harry Fuecks <hfuecks@gmail.com>
  665. *
  666. * @param string string to be reversed
  667. * @return string
  668. */
  669. function strrev($str)
  670. {
  671. if (is_ascii($str))
  672. {
  673. return \strrev($str);
  674. }
  675. preg_match_all('/./us', $str, $matches);
  676. return implode('', array_reverse($matches[0]));
  677. }
  678. /**
  679. * Returns the unicode ordinal for a character.
  680. * @see http://php.net/ord
  681. *
  682. * @author Harry Fuecks <hfuecks@gmail.com>
  683. *
  684. * @param string UTF-8 encoded character
  685. * @return integer
  686. */
  687. function ord($chr)
  688. {
  689. $ord0 = \ord($chr);
  690. if ($ord0 >= 0 AND $ord0 <= 127)
  691. {
  692. return $ord0;
  693. }
  694. if ( ! isset($chr[1]))
  695. {
  696. \trigger_error('Short sequence - at least 2 bytes expected, only 1 seen', E_USER_WARNING);
  697. return FALSE;
  698. }
  699. $ord1 = \ord($chr[1]);
  700. if ($ord0 >= 192 AND $ord0 <= 223)
  701. {
  702. return ($ord0 - 192) * 64 + ($ord1 - 128);
  703. }
  704. if ( ! isset($chr[2]))
  705. {
  706. \trigger_error('Short sequence - at least 3 bytes expected, only 2 seen', E_USER_WARNING);
  707. return FALSE;
  708. }
  709. $ord2 = \ord($chr[2]);
  710. if ($ord0 >= 224 AND $ord0 <= 239)
  711. {
  712. return ($ord0 - 224) * 4096 + ($ord1 - 128) * 64 + ($ord2 - 128);
  713. }
  714. if ( ! isset($chr[3]))
  715. {
  716. \trigger_error('Short sequence - at least 4 bytes expected, only 3 seen', E_USER_WARNING);
  717. return FALSE;
  718. }
  719. $ord3 = \ord($chr[3]);
  720. if ($ord0 >= 240 AND $ord0 <= 247)
  721. {
  722. return ($ord0 - 240) * 262144 + ($ord1 - 128) * 4096 + ($ord2-128) * 64 + ($ord3 - 128);
  723. }
  724. if ( ! isset($chr[4]))
  725. {
  726. \trigger_error('Short sequence - at least 5 bytes expected, only 4 seen', E_USER_WARNING);
  727. return FALSE;
  728. }
  729. $ord4 = \ord($chr[4]);
  730. if ($ord0 >= 248 AND $ord0 <= 251)
  731. {
  732. return ($ord0 - 248) * 16777216 + ($ord1-128) * 262144 + ($ord2 - 128) * 4096 + ($ord3 - 128) * 64 + ($ord4 - 128);
  733. }
  734. if ( ! isset($chr[5]))
  735. {
  736. \trigger_error('Short sequence - at least 6 bytes expected, only 5 seen', E_USER_WARNING);
  737. return FALSE;
  738. }
  739. if ($ord0 >= 252 AND $ord0 <= 253)
  740. {
  741. return ($ord0 - 252) * 1073741824 + ($ord1 - 128) * 16777216 + ($ord2 - 128) * 262144 + ($ord3 - 128) * 4096 + ($ord4 - 128) * 64 + (ord($chr[5]) - 128);
  742. }
  743. if ($ord0 >= 254 AND $ord0 <= 255)
  744. {
  745. \trigger_error('Invalid UTF-8 with surrogate ordinal '.$ord0, E_USER_WARNING);
  746. return FALSE;
  747. }
  748. }
  749. /* End of file utf8.php */
  750. /* Location: ./codeigniter/utf8.php */