PageRenderTime 48ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/utf8.php

https://github.com/Xeoncross/php_utf8
PHP | 441 lines | 233 code | 47 blank | 161 comment | 21 complexity | 8ff72f4c373664364f1e170e34624691 MD5 | raw file
  1. <?php
  2. /**
  3. * php_utf8
  4. *
  5. * A simple collection of functions to provide a standardized framework for
  6. * working with multibyte strings (like UTF-8) in a variety of server
  7. * environments. Requires either mbstring or iconv to work!
  8. *
  9. * @author David Pennington <xeoncross.com>
  10. * @link http://sourceforge.net/projects/phputf8/
  11. * @link http://github.com/Xeoncross/php_utf8
  12. * @license http://opensource.org/licenses/mit-license.php MIT License
  13. */
  // Is PCRE compiled with UTF-8 support? Please say YES!!!!
  // preg_match() takes its third argument by reference, so a literal array()
  // cannot be passed there; the matches are not needed, so it is omitted.
  // "\xC3\xB1" is the UTF-8 encoding of the letter n-with-tilde, spelled as
  // bytes so the probe does not depend on how this file itself is saved.
  define('PCRE_SUPPORTS_UTF8', preg_match('/^.{1}$/u', "\xC3\xB1"));

  // Default to English UTF-8
  setlocale(LC_ALL, 'en_US.UTF8');

  if (extension_loaded('mbstring'))
  {
      // The MB_OVERLOAD_STRING constant does not exist on PHP versions that
      // dropped function overloading, so only test the flag when it is defined.
      if (defined('MB_OVERLOAD_STRING') && ((int) ini_get('mbstring.func_overload') & MB_OVERLOAD_STRING))
      {
          trigger_error
          (
              'The <a href="http://php.net/mbstring">mbstring</a> extension is overloading '.
              'PHP\'s native string functions. Disable this by setting mbstring.func_overload '.
              'to 0, 1, 4 or 5 in php.ini or a .htaccess file.',
              E_USER_ERROR
          );
      }

      // Set internal character encoding to UTF-8
      mb_internal_encoding("UTF-8");
  }
  elseif (extension_loaded('iconv'))
  {
      // Set internal character encoding to UTF-8, but only when it is not
      // UTF-8 already: iconv_set_encoding() raises a deprecation notice on
      // newer PHP versions, so skip the call when it would change nothing.
      if (strcasecmp((string) iconv_get_encoding('internal_encoding'), 'UTF-8') !== 0)
      {
          iconv_set_encoding("internal_encoding", "UTF-8");
      }
  }
  else
  {
      trigger_error
      (
          'Neither the <a href="http://php.net/iconv">iconv</a> nor <a href="http://'.
          'php.net/mbstring">mbstring</a> PHP extensions are loaded. Without one of '.
          'these, UTF-8 strings cannot be properly handled.',
          E_USER_ERROR
      );
  }
  // Enable basic multibyte string support if mbstring is not installed!
  // Every fallback is guarded on its own, so nothing is declared when the
  // mbstring extension (or another polyfill loaded earlier) already provides
  // the function. All fallbacks assume UTF-8 unless an encoding name is given.

  if ( ! function_exists('mb_strlen'))
  {
      /**
       * Unicode aware replacement for strlen(). Returns the number of characters
       * in the string (not the number of bytes).
       *
       * In UTF-8 every character is one lead byte followed by zero or more
       * continuation bytes (0x80-0xBF), so the character count is the byte count
       * minus the number of continuation bytes.
       *
       * @param string $string a valid UTF-8 string
       * @param string $encoding accepted for signature compatibility; ignored
       * @return int
       */
      function mb_strlen($string, $encoding = NULL)
      {
          $continuation_bytes = (int) preg_match_all('/[\x80-\xBF]/', $string, $ignored);

          return strlen($string) - $continuation_bytes;
      }
  }

  if ( ! function_exists('mb_substr'))
  {
      /**
       * UTF-8 aware alternative to substr
       * Return part of a string given character offset (and optionally length)
       *
       * @param string $string to parse
       * @param int $start the starting offset
       * @param int $length of part to return; NULL (default) means "to the end"
       * @param string $encoding defaults to UTF-8
       * @return string the requested part, or '' when iconv reports failure
       */
      function mb_substr($string, $start, $length = NULL, $encoding = NULL)
      {
          $charset = (is_string($encoding) && $encoding !== '') ? $encoding : 'UTF-8';

          if ($length === NULL)
          {
              // A string never has more characters than bytes, so the byte
              // length is a safe upper bound meaning "everything after $start".
              $length = strlen($string);
          }

          $part = iconv_substr($string, $start, $length, $charset);

          return ($part === FALSE) ? '' : $part;
      }
  }

  if ( ! function_exists('mb_strpos'))
  {
      /**
       * UTF-8 aware alternative to strpos
       * Find position of first occurrence of a string
       *
       * @param string $haystack to search
       * @param string $needle substring to look for
       * @param int $offset to start from
       * @param string $encoding defaults to UTF-8
       * @return int|bool character position, or FALSE when not found
       */
      function mb_strpos($haystack, $needle, $offset = 0, $encoding = NULL)
      {
          $charset = (is_string($encoding) && $encoding !== '') ? $encoding : 'UTF-8';

          return iconv_strpos($haystack, $needle, $offset, $charset);
      }
  }

  if ( ! function_exists('mb_strrpos'))
  {
      /**
       * UTF-8 aware alternative to strrpos
       * Finds the last occurrence of a needle within a haystack
       *
       * @param string $haystack to search
       * @param string $needle substring to look for
       * @param string $encoding defaults to UTF-8; non-string values are ignored
       * @return int|bool character position, or FALSE when not found
       */
      function mb_strrpos($haystack, $needle, $encoding = NULL)
      {
          $charset = (is_string($encoding) && $encoding !== '') ? $encoding : 'UTF-8';

          return iconv_strrpos($haystack, $needle, $charset);
      }
  }

  if ( ! function_exists('mb_strtolower'))
  {
      /**
       * Convert a UTF-8 string to lowercase
       *
       * Limited fallback: only the ASCII letters A-Z are converted. Bytes of
       * multibyte characters are never touched, so the string stays valid
       * UTF-8, but accented and other non-ASCII letters keep their case.
       *
       * @param string $string to convert
       * @param string $encoding accepted for signature compatibility; ignored
       * @return string
       */
      function mb_strtolower($string, $encoding = NULL)
      {
          return strtr($string, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz');
      }
  }

  if ( ! function_exists('mb_strtoupper'))
  {
      /**
       * Convert a UTF-8 string to uppercase
       *
       * Limited fallback: only the ASCII letters a-z are converted. Bytes of
       * multibyte characters are never touched, so the string stays valid
       * UTF-8, but accented and other non-ASCII letters keep their case.
       *
       * @param string $string to convert
       * @param string $encoding accepted for signature compatibility; ignored
       * @return string
       */
      function mb_strtoupper($string, $encoding = NULL)
      {
          return strtr($string, 'abcdefghijklmnopqrstuvwxyz', 'ABCDEFGHIJKLMNOPQRSTUVWXYZ');
      }
  }
  // Newer PHP versions ship mb_str_split() with the mbstring extension;
  // declaring it again would be a fatal "cannot redeclare" error, so the
  // fallback is only defined when the name is still free.
  if ( ! function_exists('mb_str_split'))
  {
      /**
       * UTF-8 aware alternative to str_split to convert a string to an array
       *
       * @param string $string to split
       * @param int $split_len number of characters per chunk (values below 1 are treated as 1)
       * @return array list of chunks; the last one may be shorter than $split_len
       */
      function mb_str_split($string, $split_len = 1)
      {
          // The length is interpolated into the pattern below, so force it to
          // a positive integer first.
          $split_len = max(1, (int) $split_len);

          if (mb_strlen($string) <= $split_len)
              return array($string);

          // Greedy {1,n}: full-size chunks first, then whatever is left over.
          // The "s" flag lets "." match newlines as well.
          preg_match_all('/.{1,'.$split_len.'}/us', $string, $chunks);

          // $chunks[0] is missing when matching failed (e.g. invalid UTF-8).
          return isset($chunks[0]) ? $chunks[0] : array();
      }
  }
  /**
   * UTF-8 aware substr_replace.
   *
   * Offsets and lengths are counted in characters and follow the same rules
   * as substr_replace(): a negative $start counts from the end of the string,
   * a NULL $length replaces everything up to the end, and a negative $length
   * stops that many characters before the end.
   *
   * @param string $string to process
   * @param string $replacement text
   * @param int $start offset
   * @param int $length to replace
   * @return string
   */
  function mb_substr_replace($string, $replacement, $start, $length = NULL )
  {
      $total = mb_strlen($string);

      // Normalise $start into the range 0..$total
      if ($start < 0)
      {
          $start = max(0, $total + $start);
      }
      elseif ($start > $total)
      {
          $start = $total;
      }

      // Work out where the untouched tail begins
      if ($length === NULL)
      {
          $tail_start = $total;
      }
      elseif ($length < 0)
      {
          $tail_start = max($start, $total + $length);
      }
      else
      {
          $tail_start = min($total, $start + $length);
      }

      // mb_substr() is always called with an explicit length so this also
      // works with implementations that do not default the third argument.
      $head = mb_substr($string, 0, $start);
      $tail = mb_substr($string, $tail_start, $total - $tail_start);

      return $head . $replacement . $tail;
  }
  /**
   * UTF-8 aware alternative to strrev
   * Reverses a string character by character instead of byte by byte.
   *
   * @param string $string to reverse
   * @return string
   */
  function mb_strrev($string)
  {
      // Break the text into single UTF-8 characters ("s" lets "." match newlines)
      preg_match_all('/./us', $string, $matches);

      $characters = $matches[0];
      $backwards = array_reverse($characters);

      return implode('', $backwards);
  }
  /**
   * Tests whether a string contains only 7bit ASCII bytes.
   *
   * @param string $string to check
   * @return bool TRUE when no byte above 0x7F is present
   */
  function is_ascii($string)
  {
      // Look for any byte outside the 7-bit range
      $has_high_byte = preg_match('/[^\x00-\x7F]/S', $string);

      return $has_high_byte ? FALSE : TRUE;
  }
  /**
   * Checks to see if a string is utf8 encoded.
   *
   * The check is purely structural: every lead byte must be followed by the
   * number of 10bbbbbb continuation bytes its bit pattern announces.
   *
   * NOTE: lead bytes announcing 5- and 6-byte sequences are accepted as well,
   * although UTF-8 sequences have a maximum length of 4 bytes.
   *
   * @author bmorel at ssi dot fr (modified)
   * @param string $str The string to be checked
   * @return bool
   */
  function seems_utf8($str)
  {
      // Each row: bit mask, expected bits after masking, continuation bytes that follow
      $lead_forms = array(
          array(0x80, 0x00, 0), // 0bbbbbbb
          array(0xE0, 0xC0, 1), // 110bbbbb
          array(0xF0, 0xE0, 2), // 1110bbbb
          array(0xF8, 0xF0, 3), // 11110bbb
          array(0xFC, 0xF8, 4), // 111110bb
          array(0xFE, 0xFC, 5), // 1111110b
      );

      $total = strlen($str);
      $pos = 0;

      while ($pos < $total)
      {
          $lead = ord($str[$pos]);

          // Find the form this lead byte belongs to
          $trailing = -1;
          foreach ($lead_forms as $form)
          {
              if (($lead & $form[0]) == $form[1])
              {
                  $trailing = $form[2];
                  break;
              }
          }

          // Does not match any model
          if ($trailing < 0)
              return false;

          // The announced number of 10bbbbbb bytes must follow
          while ($trailing-- > 0)
          {
              $pos++;

              if ($pos == $total)
                  return false;

              if ((ord($str[$pos]) & 0xC0) != 0x80)
                  return false;
          }

          $pos++;
      }

      return true;
  }
  /**
   * Converts most Latin accent characters to ASCII characters. If there are no
   * accent characters, then the string given is returned unchanged.
   *
   * Covered: the accented letters of Latin-1 Supplement, all of Latin
   * Extended-A (U+0100 - U+017F), the Euro sign (becomes "E") and the Pound
   * sign (removed). Any other character is left as it is.
   *
   * @author wordpress.org
   * @param string $string that might have accent characters
   * @return string
   */
  function remove_accents($string)
  {
      // UTF-8 byte sequence => ASCII replacement. Declared static so the
      // table is built once instead of on every call.
      static $chars = array(
          // Decompositions for Latin-1 Supplement (lead byte 0xC3)
          "\xC3\x80" => 'A', "\xC3\x81" => 'A', "\xC3\x82" => 'A', "\xC3\x83" => 'A',
          "\xC3\x84" => 'A', "\xC3\x85" => 'A',
          "\xC3\x87" => 'C',
          "\xC3\x88" => 'E', "\xC3\x89" => 'E', "\xC3\x8A" => 'E', "\xC3\x8B" => 'E',
          "\xC3\x8C" => 'I', "\xC3\x8D" => 'I', "\xC3\x8E" => 'I', "\xC3\x8F" => 'I',
          "\xC3\x91" => 'N',
          "\xC3\x92" => 'O', "\xC3\x93" => 'O', "\xC3\x94" => 'O', "\xC3\x95" => 'O',
          "\xC3\x96" => 'O',
          "\xC3\x99" => 'U', "\xC3\x9A" => 'U', "\xC3\x9B" => 'U', "\xC3\x9C" => 'U',
          "\xC3\x9D" => 'Y',
          "\xC3\x9F" => 's',
          "\xC3\xA0" => 'a', "\xC3\xA1" => 'a', "\xC3\xA2" => 'a', "\xC3\xA3" => 'a',
          "\xC3\xA4" => 'a', "\xC3\xA5" => 'a',
          "\xC3\xA7" => 'c',
          "\xC3\xA8" => 'e', "\xC3\xA9" => 'e', "\xC3\xAA" => 'e', "\xC3\xAB" => 'e',
          "\xC3\xAC" => 'i', "\xC3\xAD" => 'i', "\xC3\xAE" => 'i', "\xC3\xAF" => 'i',
          "\xC3\xB1" => 'n',
          "\xC3\xB2" => 'o', "\xC3\xB3" => 'o', "\xC3\xB4" => 'o', "\xC3\xB5" => 'o',
          "\xC3\xB6" => 'o',
          "\xC3\xB9" => 'u', "\xC3\xBA" => 'u', "\xC3\xBB" => 'u', "\xC3\xBC" => 'u',
          "\xC3\xBD" => 'y', "\xC3\xBF" => 'y',

          // Decompositions for Latin Extended-A, U+0100 - U+013F (lead byte 0xC4)
          "\xC4\x80" => 'A', "\xC4\x81" => 'a', "\xC4\x82" => 'A', "\xC4\x83" => 'a',
          "\xC4\x84" => 'A', "\xC4\x85" => 'a',
          "\xC4\x86" => 'C', "\xC4\x87" => 'c', "\xC4\x88" => 'C', "\xC4\x89" => 'c',
          "\xC4\x8A" => 'C', "\xC4\x8B" => 'c', "\xC4\x8C" => 'C', "\xC4\x8D" => 'c',
          "\xC4\x8E" => 'D', "\xC4\x8F" => 'd', "\xC4\x90" => 'D', "\xC4\x91" => 'd',
          "\xC4\x92" => 'E', "\xC4\x93" => 'e', "\xC4\x94" => 'E', "\xC4\x95" => 'e',
          "\xC4\x96" => 'E', "\xC4\x97" => 'e', "\xC4\x98" => 'E', "\xC4\x99" => 'e',
          "\xC4\x9A" => 'E', "\xC4\x9B" => 'e',
          "\xC4\x9C" => 'G', "\xC4\x9D" => 'g', "\xC4\x9E" => 'G', "\xC4\x9F" => 'g',
          "\xC4\xA0" => 'G', "\xC4\xA1" => 'g', "\xC4\xA2" => 'G', "\xC4\xA3" => 'g',
          "\xC4\xA4" => 'H', "\xC4\xA5" => 'h', "\xC4\xA6" => 'H', "\xC4\xA7" => 'h',
          "\xC4\xA8" => 'I', "\xC4\xA9" => 'i', "\xC4\xAA" => 'I', "\xC4\xAB" => 'i',
          "\xC4\xAC" => 'I', "\xC4\xAD" => 'i', "\xC4\xAE" => 'I', "\xC4\xAF" => 'i',
          "\xC4\xB0" => 'I', "\xC4\xB1" => 'i',
          "\xC4\xB2" => 'IJ', "\xC4\xB3" => 'ij',
          "\xC4\xB4" => 'J', "\xC4\xB5" => 'j',
          "\xC4\xB6" => 'K', "\xC4\xB7" => 'k', "\xC4\xB8" => 'k',
          "\xC4\xB9" => 'L', "\xC4\xBA" => 'l', "\xC4\xBB" => 'L', "\xC4\xBC" => 'l',
          "\xC4\xBD" => 'L', "\xC4\xBE" => 'l', "\xC4\xBF" => 'L',

          // Decompositions for Latin Extended-A, U+0140 - U+017F (lead byte 0xC5)
          "\xC5\x80" => 'l', "\xC5\x81" => 'L', "\xC5\x82" => 'l',
          "\xC5\x83" => 'N', "\xC5\x84" => 'n', "\xC5\x85" => 'N', "\xC5\x86" => 'n',
          "\xC5\x87" => 'N', "\xC5\x88" => 'n',
          // U+0149 is a small letter and U+014A / U+014B are capital / small
          // Eng, so the capital-then-small alternation shifts by one here.
          "\xC5\x89" => 'n', "\xC5\x8A" => 'N', "\xC5\x8B" => 'n',
          "\xC5\x8C" => 'O', "\xC5\x8D" => 'o', "\xC5\x8E" => 'O', "\xC5\x8F" => 'o',
          "\xC5\x90" => 'O', "\xC5\x91" => 'o',
          "\xC5\x92" => 'OE', "\xC5\x93" => 'oe',
          "\xC5\x94" => 'R', "\xC5\x95" => 'r', "\xC5\x96" => 'R', "\xC5\x97" => 'r',
          "\xC5\x98" => 'R', "\xC5\x99" => 'r',
          "\xC5\x9A" => 'S', "\xC5\x9B" => 's', "\xC5\x9C" => 'S', "\xC5\x9D" => 's',
          "\xC5\x9E" => 'S', "\xC5\x9F" => 's', "\xC5\xA0" => 'S', "\xC5\xA1" => 's',
          "\xC5\xA2" => 'T', "\xC5\xA3" => 't', "\xC5\xA4" => 'T', "\xC5\xA5" => 't',
          "\xC5\xA6" => 'T', "\xC5\xA7" => 't',
          "\xC5\xA8" => 'U', "\xC5\xA9" => 'u', "\xC5\xAA" => 'U', "\xC5\xAB" => 'u',
          "\xC5\xAC" => 'U', "\xC5\xAD" => 'u', "\xC5\xAE" => 'U', "\xC5\xAF" => 'u',
          "\xC5\xB0" => 'U', "\xC5\xB1" => 'u', "\xC5\xB2" => 'U', "\xC5\xB3" => 'u',
          "\xC5\xB4" => 'W', "\xC5\xB5" => 'w',
          "\xC5\xB6" => 'Y', "\xC5\xB7" => 'y', "\xC5\xB8" => 'Y',
          "\xC5\xB9" => 'Z', "\xC5\xBA" => 'z', "\xC5\xBB" => 'Z', "\xC5\xBC" => 'z',
          "\xC5\xBD" => 'Z', "\xC5\xBE" => 'z',
          "\xC5\xBF" => 's',

          // Euro Sign
          "\xE2\x82\xAC" => 'E',
          // GBP (Pound) Sign
          "\xC2\xA3" => '',
      );

      // Every key above contains a byte >= 0x80, so a string without any such
      // byte cannot match and is returned untouched.
      if ( ! preg_match('/[\x80-\xff]/', $string))
          return $string;

      return strtr($string, $chars);
  }
  /**
   * Filter a valid UTF-8 string so that it contains only words, numbers,
   * dashes, underscores, periods, and spaces - all of which are safe
   * characters to use in file names, URI, XML, JSON, and (X)HTML.
   *
   * @param string $string to clean
   * @param bool $remove_spaces if set to TRUE, spaces are turned into dashes
   * @return string
   */
  function sanitize($string, $remove_spaces = FALSE)
  {
      // Runs of a repeated separator and the single character that replaces them
      $collapse = array(
          '/\s\s+/' => ' ',
          '/\.\.+/' => '.',
          '/--+/'   => '-',
          '/__+/'   => '_',
      );

      // Anything that is not a word character, dash, period or space becomes a space
      $clean = preg_replace('/[^\w\-\. ]+/u', ' ', $string);

      // Squeeze repeated separators down to one
      $clean = preg_replace(array_keys($collapse), array_values($collapse), $clean);

      if ( ! $remove_spaces)
      {
          // Remove starting/ending symbols
          return trim($clean, '-._ ');
      }

      // Spaces become dashes, which can create new runs of dashes to squeeze
      $clean = str_replace(' ', '-', $clean);
      $clean = preg_replace('/--+/', '-', $clean);

      // Remove starting/ending symbols
      return trim($clean, '-._ ');
  }
  /**
   * Create a SEO friendly URL string from a valid UTF-8 string
   *
   * The string is sanitized with spaces turned into dashes, lowercased,
   * stripped of accents and finally URL-encoded.
   *
   * @param string $string to filter
   * @return string
   */
  function sanitize_url($string)
  {
      $slug = sanitize($string, TRUE);
      $slug = mb_strtolower($slug);
      $slug = remove_accents($slug);

      return urlencode($slug);
  }
  /**
   * Filter a valid UTF-8 string to be file name safe.
   *
   * Delegates to sanitize() with space removal switched on.
   *
   * @param string $string to filter
   * @return string
   */
  function sanitize_filename($string)
  {
      $remove_spaces = TRUE;

      return sanitize($string, $remove_spaces);
  }
  /**
   * Convert a string from one encoding to another encoding (Defaults to UTF-8)
   *
   * iconv is used when available (with //TRANSLIT, so characters that cannot
   * be represented may be approximated); otherwise mbstring does the
   * conversion. Both paths convert from $from_encoding.
   *
   * @param string $string to convert
   * @param string $to_encoding you want the string in
   * @param string $from_encoding that string is in
   * @return string|bool the converted string; FALSE when iconv() fails
   */
  function encode($string, $to_encoding = 'UTF-8', $from_encoding = 'UTF-8')
  {
      // ASCII-7 is valid UTF-8 already
      if ($to_encoding === 'UTF-8' AND is_ascii($string))
          return $string;

      if (function_exists('iconv'))
      {
          // Silence notices only: mask E_NOTICE out of the current level
          // rather than replacing the level, so error classes the application
          // has switched off are not switched on during the call.
          $previous_level = error_reporting();
          error_reporting($previous_level & ~E_NOTICE);

          $converted = iconv($from_encoding, $to_encoding.'//TRANSLIT', $string);

          // Turn notices back on
          error_reporting($previous_level);

          return $converted;
      }

      // Same contract as the iconv path: trust the caller's $from_encoding
      return mb_convert_encoding($string, $to_encoding, $from_encoding);
  }