PageRenderTime 79ms CodeModel.GetById 34ms RepoModel.GetById 8ms app.codeStats 0ms

/system/helper/mbstring.php

https://github.com/redpinata-dev/contao
PHP | 582 lines | 266 code | 121 blank | 195 comment | 86 complexity | 81c2c2983e2d6d5980cd6959880c1978 MD5 | raw file
Possible License(s): GPL-3.0, LGPL-3.0, LGPL-2.1, BSD-3-Clause
  1. <?php
  2. /**
  3. * Contao Open Source CMS
  4. *
  5. * Copyright (c) 2005-2014 Leo Feyer
  6. *
  7. * @package Core
  8. * @link https://contao.org
  9. * @license http://www.gnu.org/licenses/lgpl-3.0.html LGPL
  10. */
  11. /**
  12. * This file contains some UTF-8 helper functions that allow to run Contao
  13. * without the mbstring extension. It is based on the UTF-8 library written
  14. * by Andreas Gohr <andi@splitbrain.org> which is part of the DokuWiki project.
  15. * Visit http://www.splitbrain.org/projects/dokuwiki to get the original file.
  16. *
  17. * This library supports the following functions:
  18. * - utf8_chr
  19. * - utf8_ord
  20. * - utf8_convert_encoding
  21. * - utf8_decode_entities
  22. * - utf8_detect_encoding
  23. * - utf8_romanize
  24. * - utf8_strlen
  25. * - utf8_strpos
  26. * - utf8_strrchr
  27. * - utf8_strrpos
  28. * - utf8_strstr
  29. * - utf8_strtolower
  30. * - utf8_strtoupper
  31. * - utf8_substr
  32. * - utf8_ucfirst
  33. * - utf8_str_split
  34. *
  35. * A few functions are based on the UTF-8 library written by Niels Leenheer
  36. * and Andy Matsubara which is part of the Zen Photo web photo album project.
  37. * Visit http://www.zenphoto.org to get the original file.
  38. */
  39. /**
  40. * Check whether we can use mbstring
  41. */
  42. define('USE_MBSTRING', function_exists('mb_strlen'));
  43. if (USE_MBSTRING)
  44. mb_internal_encoding('UTF-8');
  45. /**
  46. * Return a specific character
  47. *
  48. * Unicode version of chr() that handles UTF-8 characters. It is basically
  49. * used as callback function for utf8_decode_entities().
  50. * @param integer
  51. * @return string
  52. */
  53. function utf8_chr($dec)
  54. {
  55. if ($dec < 128)
  56. return chr($dec);
  57. if ($dec < 2048)
  58. return chr(($dec >> 6) + 192) . chr(($dec & 63) + 128);
  59. if ($dec < 65536)
  60. return chr(($dec >> 12) + 224) . chr((($dec >> 6) & 63) + 128) . chr(($dec & 63) + 128);
  61. if ($dec < 2097152)
  62. return chr(($dec >> 18) + 240) . chr((($dec >> 12) & 63) + 128) . chr((($dec >> 6) & 63) + 128) . chr(($dec & 63) + 128);
  63. return '';
  64. }
  65. /**
  66. * Return the ASCII value of a character
  67. *
  68. * Unicode version of ord() that handles UTF-8 characters. The function has
  69. * been published by R. Rajesh Jeba Anbiah on php.net.
  70. * @param string
  71. * @return integer
  72. */
  73. function utf8_ord($str)
  74. {
  75. if (ord($str{0}) >= 0 && ord($str{0}) <= 127)
  76. return ord($str{0});
  77. if (ord($str{0}) >= 192 && ord($str{0}) <= 223)
  78. return (ord($str{0})-192)*64 + (ord($str{1})-128);
  79. if (ord($str{0}) >= 224 && ord($str{0}) <= 239)
  80. return (ord($str{0})-224)*4096 + (ord($str{1})-128)*64 + (ord($str{2})-128);
  81. if (ord($str{0}) >= 240 && ord($str{0}) <= 247)
  82. return (ord($str{0})-240)*262144 + (ord($str{1})-128)*4096 + (ord($str{2})-128)*64 + (ord($str{3})-128);
  83. if (ord($str{0}) >= 248 && ord($str{0}) <= 251)
  84. return (ord($str{0})-248)*16777216 + (ord($str{1})-128)*262144 + (ord($str{2})-128)*4096 + (ord($str{3})-128)*64 + (ord($str{4})-128);
  85. if (ord($str{0}) >= 252 && ord($str{0}) <= 253)
  86. return (ord($str{0})-252)*1073741824 + (ord($str{1})-128)*16777216 + (ord($str{2})-128)*262144 + (ord($str{3})-128)*4096 + (ord($str{4})-128)*64 + (ord($str{5})-128);
  87. if (ord($str{0}) >= 254 && ord($str{0}) <= 255) //error
  88. return false;
  89. return 0;
  90. }
  91. /**
  92. * Convert character encoding
  93. *
  94. * Use utf8_decode() to convert UTF-8 to ISO-8859-1, otherwise use iconv()
  95. * or mb_convert_encoding(). Return the original string if none of these
  96. * libraries is available.
  97. * @param string
  98. * @param string
  99. * @param string
  100. * @return string
  101. */
  102. function utf8_convert_encoding($str, $to, $from=null)
  103. {
  104. if (!$str)
  105. return '';
  106. if (!$from)
  107. $from = utf8_detect_encoding($str);
  108. if ($from == $to)
  109. return $str;
  110. if ($from == 'UTF-8' && $to == 'ISO-8859-1')
  111. return utf8_decode($str);
  112. if ($from == 'ISO-8859-1' && $to == 'UTF-8')
  113. return utf8_encode($str);
  114. if (USE_MBSTRING)
  115. {
  116. @mb_substitute_character('none');
  117. return @mb_convert_encoding($str, $to, $from);
  118. }
  119. if (function_exists('iconv'))
  120. {
  121. if (strlen($iconv = @iconv($from, $to . '//IGNORE', $str)))
  122. return $iconv;
  123. return @iconv($from, $to, $str);
  124. }
  125. return $str;
  126. }
  127. /**
  128. * Convert all unicode entities to their applicable characters
  129. *
  130. * Calls utf8_chr() to convert unicode entities. HTML entities like '&nbsp;'
  131. * or '&quot;' will not be decoded.
  132. * @param string
  133. * @return string
  134. */
  135. function utf8_decode_entities($str)
  136. {
  137. $str = preg_replace_callback('~&#x([0-9a-f]+);~i', 'utf8_hexchr_callback', $str);
  138. $str = preg_replace_callback('~&#([0-9]+);~', 'utf8_chr_callback', $str);
  139. return $str;
  140. }
  141. /**
  142. * Callback function for utf8_decode_entities
  143. * @param array
  144. * @return string
  145. */
  146. function utf8_chr_callback($matches)
  147. {
  148. return utf8_chr($matches[1]);
  149. }
  150. /**
  151. * Callback function for utf8_decode_entities
  152. * @param array
  153. * @return string
  154. */
  155. function utf8_hexchr_callback($matches)
  156. {
  157. return utf8_chr(hexdec($matches[1]));
  158. }
  159. /**
  160. * Detect the encoding of a string
  161. *
  162. * Use mb_detect_encoding() if available since it seems to be about 20 times
  163. * faster than using ereg() or preg_match().
  164. * @param string
  165. * @return string
  166. */
  167. function utf8_detect_encoding($str)
  168. {
  169. if (USE_MBSTRING)
  170. return mb_detect_encoding($str, array('ASCII', 'ISO-2022-JP', 'UTF-8', 'EUC-JP', 'ISO-8859-1'));
  171. if (!preg_match("/[\x80-\xFF]/", $str))
  172. {
  173. if (!preg_match("/\x1B/", $str))
  174. return 'ASCII';
  175. return 'ISO-2022-JP';
  176. }
  177. if (preg_match("/^([\x01-\x7F]|[\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF][\x80-\xBF])+$/", $str) == 1)
  178. return 'UTF-8';
  179. if (preg_match("/^([\x01-\x7F]|\x8E[\xA0-\xDF]|\x8F[xA1-\xFE][\xA1-\xFE]|[\xA1-\xFE][\xA1-\xFE])+$/", $str) == 1)
  180. return 'EUC-JP';
  181. return 'ISO-8859-1';
  182. }
  183. /**
  184. * Romanize a string
  185. *
  186. * Use the UTF-8 lookup table to replace non ascii characters with their
  187. * respective roman character.
  188. * @param string
  189. * @return string
  190. */
  191. function utf8_romanize($str)
  192. {
  193. global $UTF8_LOOKUP_TABLE;
  194. if (!is_array($UTF8_LOOKUP_TABLE))
  195. require_once TL_ROOT . '/system/helper/utf8_lookup.php';
  196. return strtr(utf8_convert_encoding($str, 'UTF-8'), $UTF8_LOOKUP_TABLE['romanize']);
  197. }
  198. /**
  199. * Determine the number of characters of a string
  200. *
  201. * Use mb_strlen() if available since it seems to be the fastes way to
  202. * determine the string length. Otherwise decode the string (will convert
  203. * non ISO-8859-1 characters to '?') and use strlen().
  204. * @param string
  205. * @return integer
  206. */
  207. function utf8_strlen($str)
  208. {
  209. if (USE_MBSTRING)
  210. return mb_strlen($str);
  211. return strlen(utf8_decode($str));
  212. }
  213. /**
  214. * Find the position of the first occurence of a string in another string
  215. *
  216. * Use mb_strpos() if available. Otherwise combine strpos() and utf8_strlen()
  217. * to detect the numeric position of the first occurrence.
  218. * @param string
  219. * @param string
  220. * @param integer
  221. * @return integer
  222. */
  223. function utf8_strpos($haystack, $needle, $offset=0)
  224. {
  225. if (USE_MBSTRING)
  226. {
  227. if ($offset === 0)
  228. return mb_strpos($haystack, $needle);
  229. return mb_strpos($haystack, $needle, $offset);
  230. }
  231. $comp = 0;
  232. $length = null;
  233. while ($length === null || $length < $offset)
  234. {
  235. $pos = strpos($haystack, $needle, $offset + $comp);
  236. if ($pos === false)
  237. return false;
  238. $length = utf8_strlen(substr($haystack, 0, $pos));
  239. if ($length < $offset)
  240. $comp = $pos - $length;
  241. }
  242. return $length;
  243. }
  244. /**
  245. * Find the last occurrence of a character in a string
  246. *
  247. * Use mb_strrchr() if available since it seems to be about eight times
  248. * faster than combining utf8_substr() and utf8_strrpos().
  249. * @param string
  250. * @param string
  251. * @return string
  252. */
  253. function utf8_strrchr($haystack, $needle)
  254. {
  255. if (USE_MBSTRING)
  256. return mb_strrchr($haystack, $needle);
  257. $pos = utf8_strrpos($haystack, $needle);
  258. if ($pos === false)
  259. return false;
  260. return utf8_substr($haystack, $pos);
  261. }
  262. /**
  263. * Find the position of the last occurrence of a string in another string
  264. *
  265. * Use mb_strrpos() if available since it is about twice as fast as our
  266. * workaround. Otherwise use utf8_strlen() to determine the position.
  267. * @param string
  268. * @param string
  269. * @return mixed
  270. */
  271. function utf8_strrpos($haystack, $needle)
  272. {
  273. if (USE_MBSTRING)
  274. return mb_strrpos($haystack, $needle);
  275. $pos = strrpos($haystack, $needle);
  276. if ($pos === false)
  277. return false;
  278. return utf8_strlen(substr($haystack, 0, $pos));
  279. }
  280. /**
  281. * Find the first occurrence of a string in another string
  282. *
  283. * Use mb_strstr() if available since it seems to be about eight times
  284. * faster than combining utf8_substr() and utf8_strpos().
  285. * @param string
  286. * @param string
  287. * @return string
  288. */
  289. function utf8_strstr($haystack, $needle)
  290. {
  291. if (USE_MBSTRING)
  292. return mb_strstr($haystack, $needle);
  293. $pos = utf8_strpos($haystack, $needle);
  294. if ($pos === false)
  295. return false;
  296. return utf8_substr($haystack, $pos);
  297. }
  298. /**
  299. * Make a string lowercase
  300. *
  301. * Use mb_strtolower() if available, although our workaround does not seem
  302. * to be significantly slower.
  303. * @param string
  304. * @return string
  305. */
  306. function utf8_strtolower($str)
  307. {
  308. if (USE_MBSTRING)
  309. return mb_strtolower($str, utf8_detect_encoding($str));
  310. global $UTF8_LOOKUP_TABLE;
  311. if (!is_array($UTF8_LOOKUP_TABLE))
  312. require_once TL_ROOT . '/system/helper/utf8_lookup.php';
  313. return strtr($str, $UTF8_LOOKUP_TABLE['strtolower']);
  314. }
  315. /**
  316. * Make a string uppercase
  317. *
  318. * Use mb_strtoupper() if available, although our workaround does not seem
  319. * to be significantly slower.
  320. * @param string
  321. * @return string
  322. */
  323. function utf8_strtoupper($str)
  324. {
  325. if (USE_MBSTRING)
  326. return mb_strtoupper($str, utf8_detect_encoding($str));
  327. global $UTF8_LOOKUP_TABLE;
  328. if (!is_array($UTF8_LOOKUP_TABLE))
  329. require_once TL_ROOT . '/system/helper/utf8_lookup.php';
  330. return strtr($str, $UTF8_LOOKUP_TABLE['strtoupper']);
  331. }
  332. /**
  333. * Return substring of a string
  334. *
  335. * Use mb_substr() if available since it is about three times faster than
  336. * our workaround. Otherwise, use PCRE regular expressions with 'u' flag.
  337. * Thanks to Andreas Gohr <andi@splitbrain.org> for this wonderful algorithm
  338. * which is the fastes workaround I could find on the internet.
  339. * @param string
  340. * @param integer
  341. * @param integer
  342. * @return string
  343. */
  344. function utf8_substr($str, $start, $length=null)
  345. {
  346. if (USE_MBSTRING)
  347. {
  348. if ($length === null)
  349. return mb_substr($str, $start);
  350. return mb_substr($str, $start, $length);
  351. }
  352. $str = (string) $str;
  353. $start = (int) $start;
  354. if ($length !== null)
  355. $length = (int) $length;
  356. // Handle trivial cases
  357. if ($length === 0)
  358. return '';
  359. if ($start < 0 && $length < 0 && $length < $start)
  360. return '';
  361. $start_pattern = '';
  362. $length_pattern = '';
  363. // Normalise -ve offsets
  364. if ($start < 0)
  365. {
  366. $strlen = strlen(utf8_decode($str));
  367. $start = $strlen + $start;
  368. if ($start < 0)
  369. $start = 0;
  370. }
  371. // Establish a pattern for offset
  372. if ($start > 0)
  373. {
  374. $Ox = (int) ($start / 65535);
  375. $Oy = $start % 65535;
  376. if ($Ox)
  377. $start_pattern = '(?:.{65535}){'.$Ox.'}';
  378. $start_pattern = '^(?:'.$start_pattern.'.{'.$Oy.'})';
  379. }
  380. // Anchor the pattern if offset == 0
  381. else
  382. {
  383. $start_pattern = '^';
  384. }
  385. // Establish a pattern for length
  386. if ($length === null)
  387. {
  388. $length_pattern = '(.*)$';
  389. }
  390. else
  391. {
  392. if (!isset($strlen))
  393. $strlen = strlen(utf8_decode($str));
  394. if ($start > $strlen)
  395. return '';
  396. if ($length > 0)
  397. {
  398. // Reduce any length that would go passed the end of the string
  399. $length = min($strlen-$start, $length);
  400. $Lx = (int) ($length / 65535);
  401. $Ly = $length % 65535;
  402. if ($Lx)
  403. $length_pattern = '(?:.{65535}){'.$Lx.'}';
  404. $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';
  405. }
  406. else if ($length < 0)
  407. {
  408. if ($length < ($start - $strlen))
  409. return '';
  410. $Lx = (int) ((-$length) / 65535);
  411. $Ly = (-$length) % 65535;
  412. if ($Lx)
  413. $length_pattern = '(?:.{65535}){'.$Lx.'}';
  414. $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
  415. }
  416. }
  417. $match = array();
  418. if (!preg_match('#'.$start_pattern.$length_pattern.'#us', $str, $match))
  419. return '';
  420. return $match[1];
  421. }
  422. /**
  423. * Make sure the first letter is uppercase
  424. *
  425. * @param string
  426. * @return string
  427. */
  428. function utf8_ucfirst($str)
  429. {
  430. return utf8_strtoupper(utf8_substr($str, 0, 1)) . utf8_substr($str, 1);
  431. }
  432. /**
  433. * Convert a string to an array
  434. *
  435. * Unicode version of str_split() that handles UTF-8 characters. The function
  436. * has been published by saeedco on php.net.
  437. * @param string
  438. * @return array
  439. */
  440. function utf8_str_split($str)
  441. {
  442. $array = array();
  443. for ($i=0; $i<strlen($str);)
  444. {
  445. $split = 1;
  446. $value = ord($str[$i]);
  447. $key = null;
  448. if($value >= 192 && $value <= 223)
  449. $split=2;
  450. elseif($value >= 224 && $value <= 239)
  451. $split=3;
  452. elseif($value >= 240 && $value <= 247)
  453. $split=4;
  454. for ($j=0; $j<$split; $j++,$i++)
  455. {
  456. $key .= $str[$i];
  457. }
  458. array_push($array, $key);
  459. }
  460. return $array;
  461. }