PageRenderTime 3ms CodeModel.GetById 833ms app.highlight 105ms RepoModel.GetById 187ms app.codeStats 0ms

/system/helper/mbstring.php

https://github.com/redpinata-dev/contao
PHP | 582 lines | 266 code | 121 blank | 195 comment | 86 complexity | 81c2c2983e2d6d5980cd6959880c1978 MD5 | raw file
  1<?php
  2
  3/**
  4 * Contao Open Source CMS
  5 *
  6 * Copyright (c) 2005-2014 Leo Feyer
  7 *
  8 * @package Core
  9 * @link    https://contao.org
 10 * @license http://www.gnu.org/licenses/lgpl-3.0.html LGPL
 11 */
 12
 13
 14/**
 15 * This file contains some UTF-8 helper functions that allow Contao to run
 16 * without the mbstring extension. It is based on the UTF-8 library written
 17 * by Andreas Gohr <andi@splitbrain.org> which is part of the DokuWiki project.
 18 * Visit http://www.splitbrain.org/projects/dokuwiki to get the original file.
 19 *
 20 * This library supports the following functions:
 21 * - utf8_chr
 22 * - utf8_ord
 23 * - utf8_convert_encoding
 24 * - utf8_decode_entities
 25 * - utf8_detect_encoding
 26 * - utf8_romanize
 27 * - utf8_strlen
 28 * - utf8_strpos
 29 * - utf8_strrchr
 30 * - utf8_strrpos
 31 * - utf8_strstr
 32 * - utf8_strtolower
 33 * - utf8_strtoupper
 34 * - utf8_substr
 35 * - utf8_ucfirst
 36 * - utf8_str_split
 37 *
 38 * A few functions are based on the UTF-8 library written by Niels Leenheer
 39 * and Andy Matsubara which is part of the Zen Photo web photo album project.
 40 * Visit http://www.zenphoto.org to get the original file.
 41 */
 42
 43
 44/**
 45 * Check whether we can use mbstring
 46 */
 47define('USE_MBSTRING', function_exists('mb_strlen'));
 48
 49if (USE_MBSTRING)
 50	mb_internal_encoding('UTF-8');
 51
 52
 53/**
 54 * Return a specific character
 55 *
 56 * Unicode version of chr() that handles UTF-8 characters. It is basically
 57 * used as callback function for utf8_decode_entities().
 58 * @param integer
 59 * @return string
 60 */
 61function utf8_chr($dec)
 62{
 63	if ($dec < 128)
 64		return chr($dec);
 65
 66    if ($dec < 2048)
 67    	return chr(($dec >> 6) + 192) . chr(($dec & 63) + 128);
 68
 69    if ($dec < 65536)
 70    	return chr(($dec >> 12) + 224) . chr((($dec >> 6) & 63) + 128) . chr(($dec & 63) + 128);
 71
 72    if ($dec < 2097152)
 73    	return chr(($dec >> 18) + 240) . chr((($dec >> 12) & 63) + 128) . chr((($dec >> 6) & 63) + 128) . chr(($dec & 63) + 128);
 74
 75    return '';
 76}
 77
 78
 79/**
 80 * Return the ASCII value of a character
 81 *
 82 * Unicode version of ord() that handles UTF-8 characters. The function has
 83 * been published by R. Rajesh Jeba Anbiah on php.net.
 84 * @param string
 85 * @return integer
 86 */
 87function utf8_ord($str)
 88{
 89	if (ord($str{0}) >= 0 && ord($str{0}) <= 127)
 90		return ord($str{0});
 91
 92	if (ord($str{0}) >= 192 && ord($str{0}) <= 223)
 93		return (ord($str{0})-192)*64 + (ord($str{1})-128);
 94
 95	if (ord($str{0}) >= 224 && ord($str{0}) <= 239)
 96		return (ord($str{0})-224)*4096 + (ord($str{1})-128)*64 + (ord($str{2})-128);
 97
 98	if (ord($str{0}) >= 240 && ord($str{0}) <= 247)
 99		return (ord($str{0})-240)*262144 + (ord($str{1})-128)*4096 + (ord($str{2})-128)*64 + (ord($str{3})-128);
100
101	if (ord($str{0}) >= 248 && ord($str{0}) <= 251)
102		return (ord($str{0})-248)*16777216 + (ord($str{1})-128)*262144 + (ord($str{2})-128)*4096 + (ord($str{3})-128)*64 + (ord($str{4})-128);
103
104	if (ord($str{0}) >= 252 && ord($str{0}) <= 253)
105		return (ord($str{0})-252)*1073741824 + (ord($str{1})-128)*16777216 + (ord($str{2})-128)*262144 + (ord($str{3})-128)*4096 + (ord($str{4})-128)*64 + (ord($str{5})-128);
106
107	if (ord($str{0}) >= 254 && ord($str{0}) <= 255) //error
108		return false;
109
110	return 0;
111}
112
113
/**
 * Convert character encoding
 *
 * Use utf8_decode() or utf8_encode() to convert between UTF-8 and ISO-8859-1,
 * otherwise use mb_convert_encoding() or iconv(). Return the original string
 * if none of these libraries is available.
 *
 * NOTE: utf8_decode() and utf8_encode() are deprecated as of PHP 8.2.
 *
 * @param string $str  The string to be converted
 * @param string $to   The target encoding
 * @param string $from The source encoding (detected automatically if empty)
 *
 * @return string The converted string
 */
function utf8_convert_encoding($str, $to, $from=null)
{
	// The string '0' is falsy but still a valid value, so do not discard it
	if (!$str && $str !== '0')
	{
		return '';
	}

	if (!$from)
	{
		$from = utf8_detect_encoding($str);
	}

	// Nothing to do
	if ($from == $to)
	{
		return $str;
	}

	if ($from == 'UTF-8' && $to == 'ISO-8859-1')
	{
		return utf8_decode($str);
	}

	if ($from == 'ISO-8859-1' && $to == 'UTF-8')
	{
		return utf8_encode($str);
	}

	if (USE_MBSTRING)
	{
		// Drop characters that cannot be represented in the target encoding
		@mb_substitute_character('none');

		return @mb_convert_encoding($str, $to, $from);
	}

	if (function_exists('iconv'))
	{
		// Try to skip unconvertible characters first
		$converted = @iconv($from, $to . '//IGNORE', $str);

		if ($converted !== false && $converted !== '')
		{
			return $converted;
		}

		return @iconv($from, $to, $str);
	}

	return $str;
}
158
159
/**
 * Replace numeric entities with the characters they stand for
 *
 * Hexadecimal entities are passed to utf8_hexchr_callback() and decimal
 * entities to utf8_chr_callback(). Named HTML entities like '&nbsp;' or
 * '&quot;' are left untouched.
 *
 * @param string $str The string containing the entities
 *
 * @return string The string with decoded numeric entities
 */
function utf8_decode_entities($str)
{
	// Hexadecimal notation first
	$decoded = preg_replace_callback('~&#x([0-9a-f]+);~i', 'utf8_hexchr_callback', $str);

	// Then the decimal notation
	return preg_replace_callback('~&#([0-9]+);~', 'utf8_chr_callback', $decoded);
}
175
176
/**
 * Callback function for the decimal entities in utf8_decode_entities()
 *
 * @param array $matches The regular expression matches, index 1 holds the decimal code point
 *
 * @return string The return value of utf8_chr() for that code point
 */
function utf8_chr_callback($matches)
{
	$codePoint = $matches[1];

	return utf8_chr($codePoint);
}
186
187
/**
 * Callback function for the hexadecimal entities in utf8_decode_entities()
 *
 * @param array $matches The regular expression matches, index 1 holds the hexadecimal code point
 *
 * @return string The return value of utf8_chr() for that code point
 */
function utf8_hexchr_callback($matches)
{
	$codePoint = hexdec($matches[1]);

	return utf8_chr($codePoint);
}
197
198
/**
 * Detect the encoding of a string
 *
 * Use mb_detect_encoding() if available. Otherwise check the string with
 * regular expressions in the order ASCII, ISO-2022-JP, UTF-8 and EUC-JP and
 * fall back to ISO-8859-1 if none of them matches.
 *
 * @param string $str The string to be checked
 *
 * @return string The name of the detected encoding
 */
function utf8_detect_encoding($str)
{
	if (USE_MBSTRING)
	{
		return mb_detect_encoding($str, array('ASCII', 'ISO-2022-JP', 'UTF-8', 'EUC-JP', 'ISO-8859-1'));
	}

	// Only 7-bit characters: ASCII unless an escape character is present
	if (!preg_match("/[\x80-\xFF]/", $str))
	{
		if (!preg_match("/\x1B/", $str))
		{
			return 'ASCII';
		}

		return 'ISO-2022-JP';
	}

	// One to four byte UTF-8 sequences (four byte sequences cover everything above U+FFFF)
	if (preg_match("/^([\x01-\x7F]|[\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF][\x80-\xBF]|[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF])+$/", $str) == 1)
	{
		return 'UTF-8';
	}

	// The byte following 0x8F must be in the range 0xA1 to 0xFE
	if (preg_match("/^([\x01-\x7F]|\x8E[\xA0-\xDF]|\x8F[\xA1-\xFE][\xA1-\xFE]|[\xA1-\xFE][\xA1-\xFE])+$/", $str) == 1)
	{
		return 'EUC-JP';
	}

	return 'ISO-8859-1';
}
228
229
/**
 * Romanize a string
 *
 * Convert the string to UTF-8 and replace its characters according to the
 * 'romanize' section of the global UTF-8 lookup table. The lookup table is
 * loaded from system/helper/utf8_lookup.php if it is not available yet.
 *
 * @param string $str The string to be romanized
 *
 * @return string The romanized string
 */
function utf8_romanize($str)
{
	global $UTF8_LOOKUP_TABLE;

	// Load the lookup table on demand
	if (!is_array($UTF8_LOOKUP_TABLE))
	{
		require_once TL_ROOT . '/system/helper/utf8_lookup.php';
	}

	$utf8 = utf8_convert_encoding($str, 'UTF-8');

	return strtr($utf8, $UTF8_LOOKUP_TABLE['romanize']);
}
247
248
/**
 * Determine the number of characters of a string
 *
 * Use mb_strlen() if available. Otherwise decode the string with
 * utf8_decode(), which converts non ISO-8859-1 characters to '?', and count
 * the resulting bytes with strlen().
 *
 * @param string $str The UTF-8 encoded string
 *
 * @return integer The number of characters
 */
function utf8_strlen($str)
{
	return USE_MBSTRING ? mb_strlen($str) : strlen(utf8_decode($str));
}
265
266
/**
 * Find the position of the first occurrence of a string in another string
 *
 * Use mb_strpos() if available. Otherwise search with strpos() and convert
 * the byte position into a character position with utf8_strlen(). The search
 * is repeated further to the right as long as the match is located before
 * the requested character offset.
 *
 * @param string  $haystack The string to search in
 * @param string  $needle   The string to search for
 * @param integer $offset   The character offset to start searching at
 *
 * @return integer|boolean The character position or false if the needle was not found
 */
function utf8_strpos($haystack, $needle, $offset=0)
{
	if (USE_MBSTRING)
	{
		return ($offset === 0) ? mb_strpos($haystack, $needle) : mb_strpos($haystack, $needle, $offset);
	}

	// Number of bytes by which the byte offset has to exceed the character offset
	$shift = 0;

	do
	{
		$bytePos = strpos($haystack, $needle, $offset + $shift);

		if ($bytePos === false)
		{
			return false;
		}

		// Number of characters in front of the match
		$charPos = utf8_strlen(substr($haystack, 0, $bytePos));

		// The match is still in front of the offset, so continue behind it
		if ($charPos < $offset)
		{
			$shift = $bytePos - $charPos;
		}
	}
	while ($charPos < $offset);

	return $charPos;
}
305
306
/**
 * Return the part of a string starting at the last occurrence of a needle
 *
 * Use mb_strrchr() if available. Otherwise determine the position with
 * utf8_strrpos() and cut the string with utf8_substr().
 *
 * @param string $haystack The string to search in
 * @param string $needle   The string to search for
 *
 * @return string|boolean The remaining string or false if the needle was not found
 */
function utf8_strrchr($haystack, $needle)
{
	if (!USE_MBSTRING)
	{
		$index = utf8_strrpos($haystack, $needle);

		return ($index === false) ? false : utf8_substr($haystack, $index);
	}

	return mb_strrchr($haystack, $needle);
}
328
329
/**
 * Find the position of the last occurrence of a string in another string
 *
 * Use mb_strrpos() if available. Otherwise search with strrpos() and count
 * the characters in front of the match with utf8_strlen().
 *
 * @param string $haystack The string to search in
 * @param string $needle   The string to search for
 *
 * @return integer|boolean The character position or false if the needle was not found
 */
function utf8_strrpos($haystack, $needle)
{
	if (USE_MBSTRING)
	{
		return mb_strrpos($haystack, $needle);
	}

	$bytePos = strrpos($haystack, $needle);

	// Convert the byte position into a character position
	return ($bytePos === false) ? false : utf8_strlen(substr($haystack, 0, $bytePos));
}
351
352
/**
 * Return the part of a string starting at the first occurrence of a needle
 *
 * Use mb_strstr() if available. Otherwise determine the position with
 * utf8_strpos() and cut the string with utf8_substr().
 *
 * @param string $haystack The string to search in
 * @param string $needle   The string to search for
 *
 * @return string|boolean The remaining string or false if the needle was not found
 */
function utf8_strstr($haystack, $needle)
{
	if (!USE_MBSTRING)
	{
		$index = utf8_strpos($haystack, $needle);

		return ($index === false) ? false : utf8_substr($haystack, $index);
	}

	return mb_strstr($haystack, $needle);
}
374
375
/**
 * Make a string lowercase
 *
 * Use mb_strtolower() with the encoding reported by utf8_detect_encoding()
 * if available. Otherwise replace the characters according to the
 * 'strtolower' section of the global UTF-8 lookup table, which is loaded
 * from system/helper/utf8_lookup.php if it is not available yet.
 *
 * @param string $str The string to be converted
 *
 * @return string The lowercase string
 */
function utf8_strtolower($str)
{
	global $UTF8_LOOKUP_TABLE;

	if (USE_MBSTRING)
	{
		$encoding = utf8_detect_encoding($str);

		return mb_strtolower($str, $encoding);
	}

	// Load the lookup table on demand
	if (!is_array($UTF8_LOOKUP_TABLE))
	{
		require_once TL_ROOT . '/system/helper/utf8_lookup.php';
	}

	return strtr($str, $UTF8_LOOKUP_TABLE['strtolower']);
}
396
397
/**
 * Make a string uppercase
 *
 * Use mb_strtoupper() with the encoding reported by utf8_detect_encoding()
 * if available. Otherwise replace the characters according to the
 * 'strtoupper' section of the global UTF-8 lookup table, which is loaded
 * from system/helper/utf8_lookup.php if it is not available yet.
 *
 * @param string $str The string to be converted
 *
 * @return string The uppercase string
 */
function utf8_strtoupper($str)
{
	global $UTF8_LOOKUP_TABLE;

	if (USE_MBSTRING)
	{
		$encoding = utf8_detect_encoding($str);

		return mb_strtoupper($str, $encoding);
	}

	// Load the lookup table on demand
	if (!is_array($UTF8_LOOKUP_TABLE))
	{
		require_once TL_ROOT . '/system/helper/utf8_lookup.php';
	}

	return strtr($str, $UTF8_LOOKUP_TABLE['strtoupper']);
}
418
419
/**
 * Return a part of a string
 *
 * Use mb_substr() if available. Otherwise build a PCRE regular expression
 * with the 'u' flag that skips $start characters and captures the requested
 * part. The algorithm has been written by Andreas Gohr <andi@splitbrain.org>.
 *
 * @param string  $str    The UTF-8 encoded string
 * @param integer $start  The character offset (counted from the end if negative)
 * @param integer $length The number of characters (omits characters at the end if negative)
 *
 * @return string The extracted part or an empty string
 */
function utf8_substr($str, $start, $length=null)
{
	if (USE_MBSTRING)
	{
		return ($length === null) ? mb_substr($str, $start) : mb_substr($str, $start, $length);
	}

	$str = (string) $str;
	$start = (int) $start;
	$hasLength = ($length !== null);

	if ($hasLength)
	{
		$length = (int) $length;
	}

	// Trivial cases
	if ($length === 0)
	{
		return '';
	}

	if ($start < 0 && $length < 0 && $length < $start)
	{
		return '';
	}

	// Matches exactly $count characters, split into chunks of 65535 characters
	$repeat = function ($count)
	{
		$chunks = (int) ($count / 65535);
		$rest = $count % 65535;
		$pattern = $chunks ? '(?:.{65535}){'.$chunks.'}' : '';

		return $pattern.'.{'.$rest.'}';
	};

	// The number of characters is only determined when it is needed
	$total = null;

	// Turn a negative offset into a positive one
	if ($start < 0)
	{
		$total = strlen(utf8_decode($str));
		$start = $total + $start;

		if ($start < 0)
		{
			$start = 0;
		}
	}

	// Skip the characters in front of the offset or just anchor the pattern
	$startPattern = ($start > 0) ? '^(?:'.$repeat($start).')' : '^';
	$lengthPattern = '';

	if (!$hasLength)
	{
		// Capture everything up to the end of the string
		$lengthPattern = '(.*)$';
	}
	else
	{
		if ($total === null)
		{
			$total = strlen(utf8_decode($str));
		}

		if ($start > $total)
		{
			return '';
		}

		if ($length > 0)
		{
			// Reduce any length that would go past the end of the string
			$length = min($total - $start, $length);
			$lengthPattern = '('.$repeat($length).')';
		}
		elseif ($length < 0)
		{
			if ($length < ($start - $total))
			{
				return '';
			}

			// Capture everything except the last -$length characters
			$lengthPattern = '(.*)(?:'.$repeat(-$length).')$';
		}
	}

	$match = array();

	if (!preg_match('#'.$startPattern.$lengthPattern.'#us', $str, $match))
	{
		return '';
	}

	return $match[1];
}
534
535
/**
 * Make sure the first letter is uppercase
 *
 * The first character is converted with utf8_strtoupper(), the rest of the
 * string remains unchanged.
 *
 * @param string $str The UTF-8 encoded string
 *
 * @return string The string with an uppercase first character
 */
function utf8_ucfirst($str)
{
	$first = utf8_substr($str, 0, 1);
	$rest = utf8_substr($str, 1);

	return utf8_strtoupper($first) . $rest;
}
546
547
/**
 * Convert a string to an array of characters
 *
 * Unicode version of str_split() that handles UTF-8 characters. The number
 * of bytes of each character is derived from its lead byte. The function is
 * based on code published by saeedco on php.net.
 *
 * @param string $str The UTF-8 encoded string
 *
 * @return array The characters of the string
 */
function utf8_str_split($str)
{
	$str = (string) $str;
	$chars = array();

	// Determine the length only once instead of in every iteration
	$total = strlen($str);

	for ($i=0; $i<$total; $i+=$width)
	{
		$lead = ord($str[$i]);

		if ($lead >= 192 && $lead <= 223)
		{
			$width = 2;
		}
		elseif ($lead >= 224 && $lead <= 239)
		{
			$width = 3;
		}
		elseif ($lead >= 240 && $lead <= 247)
		{
			$width = 4;
		}
		else
		{
			$width = 1;
		}

		// substr() stops at the end of the string, so a truncated trailing
		// sequence does not trigger reads beyond the last byte
		$chars[] = substr($str, $i, $width);
	}

	return $chars;
}