PageRenderTime 169ms CodeModel.GetById 60ms app.highlight 46ms RepoModel.GetById 55ms app.codeStats 0ms

/libraries/joomla/string/string.php

http://github.com/joomla/joomla-platform
PHP | 944 lines | 443 code | 66 blank | 435 comment | 76 complexity | f7fe206569e95e436e29fb7aa9f6f473 MD5 | raw file
  1<?php
  2/**
  3 * @package     Joomla.Platform
  4 * @subpackage  String
  5 *
  6 * @copyright   Copyright (C) 2005 - 2013 Open Source Matters, Inc. All rights reserved.
  7 * @license     GNU General Public License version 2 or later; see LICENSE
  8 */
  9
 10defined('JPATH_PLATFORM') or die;
 11
 12// PHP mbstring and iconv local configuration
 13
 14// Check if mbstring extension is loaded and attempt to load it if not present except for windows
 15if (extension_loaded('mbstring'))
 16{
 17	// Make sure to suppress the output in case ini_set is disabled
 18	@ini_set('mbstring.internal_encoding', 'UTF-8');
 19	@ini_set('mbstring.http_input', 'UTF-8');
 20	@ini_set('mbstring.http_output', 'UTF-8');
 21}
 22
 23// Same for iconv
 24if (function_exists('iconv'))
 25{
 26	// These are settings that can be set inside code
 27	iconv_set_encoding("internal_encoding", "UTF-8");
 28	iconv_set_encoding("input_encoding", "UTF-8");
 29	iconv_set_encoding("output_encoding", "UTF-8");
 30}
 31
 32/**
 33 * Include the utf8 package
 34 */
 35jimport('phputf8.utf8');
 36jimport('phputf8.strcasecmp');
 37
 38/**
 39 * String handling class for utf-8 data
 40 * Wraps the phputf8 library
 41 * All functions assume the validity of utf-8 strings.
 42 *
 43 * @package     Joomla.Platform
 44 * @subpackage  String
 45 * @since       11.1
 46 */
 47abstract class JString
 48{
	/**
	 * Increment styles, keyed by style name.
	 *
	 * Each style is a two element list: element 0 holds the regular expression(s) and
	 * element 1 holds the sprintf format(s). Either element may be a single value that
	 * serves both roles, or a pair:
	 *  - patterns: array(pattern used to detect a trailing number, pattern used to replace it)
	 *  - formats:  array(format used when appending a new number, format used when rewriting one)
	 *
	 * @var    array
	 * @since  11.3
	 */
	protected static $incrementStyles = array(
		'dash' => array(
			'#-(\d+)$#',
			'-%d'
		),
		'default' => array(
			array('#\((\d+)\)$#', '#\(\d+\)$#'),
			array(' (%d)', '(%d)'),
		),
	);

	/**
	 * Increments a trailing number in a string, or appends one when none is present.
	 *
	 * Handy for producing distinct labels when copying objects. Available styles:
	 *
	 * default: "Label" becomes "Label (2)"
	 * dash:    "Label" becomes "Label-2"
	 *
	 * An unrecognised style name falls back to the default style.
	 *
	 * @param   string   $string  The source string.
	 * @param   string   $style   The style (default|dash).
	 * @param   integer  $n       If supplied (non-empty), this number is used for the copy, otherwise it is the 'next' number.
	 *
	 * @return  string  The incremented string.
	 *
	 * @since   11.3
	 */
	public static function increment($string, $style = 'default', $n = 0)
	{
		// Start from the default style and override it only for a known style name.
		$spec = self::$incrementStyles['default'];

		if (isset(self::$incrementStyles[$style]))
		{
			$spec = self::$incrementStyles[$style];
		}

		list($patterns, $formats) = $spec;

		// A spec entry is either one value shared by both roles or a pair of values.
		$matchPattern   = is_array($patterns) ? $patterns[0] : $patterns;
		$replacePattern = is_array($patterns) ? $patterns[1] : $patterns;
		$appendFormat   = is_array($formats) ? $formats[0] : $formats;
		$rewriteFormat  = is_array($formats) ? $formats[1] : $formats;

		// No trailing number yet: append one, starting at 2 unless a number was requested.
		if (!preg_match($matchPattern, $string, $found))
		{
			return $string . sprintf($appendFormat, empty($n) ? 2 : $n);
		}

		// A trailing number exists: rewrite it with the requested or the next number.
		$number = empty($n) ? ($found[1] + 1) : $n;

		return preg_replace($replacePattern, sprintf($rewriteFormat, $number), $string);
	}
122
123	/**
124	 * UTF-8 aware alternative to strpos.
125	 *
126	 * Find position of first occurrence of a string.
127	 *
128	 * @param   string   $str     String being examined
129	 * @param   string   $search  String being searched for
130	 * @param   integer  $offset  Optional, specifies the position from which the search should be performed
131	 *
132	 * @return  mixed  Number of characters before the first match or FALSE on failure
133	 *
134	 * @see     http://www.php.net/strpos
135	 * @since   11.1
136	 */
137	public static function strpos($str, $search, $offset = false)
138	{
139		if ($offset === false)
140		{
141			return utf8_strpos($str, $search);
142		}
143		else
144		{
145			return utf8_strpos($str, $search, $offset);
146		}
147	}
148
149	/**
150	 * UTF-8 aware alternative to strrpos
151	 * Finds position of last occurrence of a string
152	 *
153	 * @param   string   $str     String being examined.
154	 * @param   string   $search  String being searched for.
155	 * @param   integer  $offset  Offset from the left of the string.
156	 *
157	 * @return  mixed  Number of characters before the last match or false on failure
158	 *
159	 * @see     http://www.php.net/strrpos
160	 * @since   11.1
161	 */
162	public static function strrpos($str, $search, $offset = 0)
163	{
164		return utf8_strrpos($str, $search, $offset);
165	}
166
167	/**
168	 * UTF-8 aware alternative to substr
169	 * Return part of a string given character offset (and optionally length)
170	 *
171	 * @param   string   $str     String being processed
172	 * @param   integer  $offset  Number of UTF-8 characters offset (from left)
173	 * @param   integer  $length  Optional length in UTF-8 characters from offset
174	 *
175	 * @return  mixed string or FALSE if failure
176	 *
177	 * @see     http://www.php.net/substr
178	 * @since   11.1
179	 */
180	public static function substr($str, $offset, $length = false)
181	{
182		if ($length === false)
183		{
184			return utf8_substr($str, $offset);
185		}
186		else
187		{
188			return utf8_substr($str, $offset, $length);
189		}
190	}
191
192	/**
193	 * UTF-8 aware alternative to strtlower
194	 *
195	 * Make a string lowercase
196	 * Note: The concept of a characters "case" only exists is some alphabets
197	 * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
198	 * not exist in the Chinese alphabet, for example. See Unicode Standard
199	 * Annex #21: Case Mappings
200	 *
201	 * @param   string  $str  String being processed
202	 *
203	 * @return  mixed  Either string in lowercase or FALSE is UTF-8 invalid
204	 *
205	 * @see http://www.php.net/strtolower
206	 * @since   11.1
207	 */
208	public static function strtolower($str)
209	{
210		return utf8_strtolower($str);
211	}
212
213	/**
214	 * UTF-8 aware alternative to strtoupper
215	 * Make a string uppercase
216	 * Note: The concept of a characters "case" only exists is some alphabets
217	 * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
218	 * not exist in the Chinese alphabet, for example. See Unicode Standard
219	 * Annex #21: Case Mappings
220	 *
221	 * @param   string  $str  String being processed
222	 *
223	 * @return  mixed  Either string in uppercase or FALSE is UTF-8 invalid
224	 *
225	 * @see     http://www.php.net/strtoupper
226	 * @since   11.1
227	 */
228	public static function strtoupper($str)
229	{
230		return utf8_strtoupper($str);
231	}
232
233	/**
234	 * UTF-8 aware alternative to strlen.
235	 *
236	 * Returns the number of characters in the string (NOT THE NUMBER OF BYTES),
237	 *
238	 * @param   string  $str  UTF-8 string.
239	 *
240	 * @return  integer  Number of UTF-8 characters in string.
241	 *
242	 * @see http://www.php.net/strlen
243	 * @since   11.1
244	 */
245	public static function strlen($str)
246	{
247		return utf8_strlen($str);
248	}
249
250	/**
251	 * UTF-8 aware alternative to str_ireplace
252	 * Case-insensitive version of str_replace
253	 *
254	 * @param   string   $search   String to search
255	 * @param   string   $replace  Existing string to replace
256	 * @param   string   $str      New string to replace with
257	 * @param   integer  $count    Optional count value to be passed by referene
258	 *
259	 * @return  string  UTF-8 String
260	 *
261	 * @see     http://www.php.net/str_ireplace
262	 * @since   11.1
263	 */
264	public static function str_ireplace($search, $replace, $str, $count = null)
265	{
266		jimport('phputf8.str_ireplace');
267
268		if ($count === false)
269		{
270			return utf8_ireplace($search, $replace, $str);
271		}
272		else
273		{
274			return utf8_ireplace($search, $replace, $str, $count);
275		}
276	}
277
278	/**
279	 * UTF-8 aware alternative to str_split
280	 * Convert a string to an array
281	 *
282	 * @param   string   $str        UTF-8 encoded string to process
283	 * @param   integer  $split_len  Number to characters to split string by
284	 *
285	 * @return  array
286	 *
287	 * @see     http://www.php.net/str_split
288	 * @since   11.1
289	 */
290	public static function str_split($str, $split_len = 1)
291	{
292		jimport('phputf8.str_split');
293
294		return utf8_str_split($str, $split_len);
295	}
296
297	/**
298	 * UTF-8/LOCALE aware alternative to strcasecmp
299	 * A case insensitive string comparison
300	 *
301	 * @param   string  $str1    string 1 to compare
302	 * @param   string  $str2    string 2 to compare
303	 * @param   mixed   $locale  The locale used by strcoll or false to use classical comparison
304	 *
305	 * @return  integer   < 0 if str1 is less than str2; > 0 if str1 is greater than str2, and 0 if they are equal.
306	 *
307	 * @see     http://www.php.net/strcasecmp
308	 * @see     http://www.php.net/strcoll
309	 * @see     http://www.php.net/setlocale
310	 * @since   11.1
311	 */
312	public static function strcasecmp($str1, $str2, $locale = false)
313	{
314		if ($locale)
315		{
316			// Get current locale
317			$locale0 = setlocale(LC_COLLATE, 0);
318
319			if (!$locale = setlocale(LC_COLLATE, $locale))
320			{
321				$locale = $locale0;
322			}
323
324			// See if we have successfully set locale to UTF-8
325			if (!stristr($locale, 'UTF-8') && stristr($locale, '_') && preg_match('~\.(\d+)$~', $locale, $m))
326			{
327				$encoding = 'CP' . $m[1];
328			}
329			elseif (stristr($locale, 'UTF-8') || stristr($locale, 'utf8'))
330			{
331				$encoding = 'UTF-8';
332			}
333			else
334			{
335				$encoding = 'nonrecodable';
336			}
337
338			// If we successfully set encoding it to utf-8 or encoding is sth weird don't recode
339			if ($encoding == 'UTF-8' || $encoding == 'nonrecodable')
340			{
341				return strcoll(utf8_strtolower($str1), utf8_strtolower($str2));
342			}
343			else
344			{
345				return strcoll(
346					self::transcode(utf8_strtolower($str1), 'UTF-8', $encoding),
347					self::transcode(utf8_strtolower($str2), 'UTF-8', $encoding)
348				);
349			}
350		}
351		else
352		{
353			return utf8_strcasecmp($str1, $str2);
354		}
355	}
356
357	/**
358	 * UTF-8/LOCALE aware alternative to strcmp
359	 * A case sensitive string comparison
360	 *
361	 * @param   string  $str1    string 1 to compare
362	 * @param   string  $str2    string 2 to compare
363	 * @param   mixed   $locale  The locale used by strcoll or false to use classical comparison
364	 *
365	 * @return  integer  < 0 if str1 is less than str2; > 0 if str1 is greater than str2, and 0 if they are equal.
366	 *
367	 * @see     http://www.php.net/strcmp
368	 * @see     http://www.php.net/strcoll
369	 * @see     http://www.php.net/setlocale
370	 * @since   11.1
371	 */
372	public static function strcmp($str1, $str2, $locale = false)
373	{
374		if ($locale)
375		{
376			// Get current locale
377			$locale0 = setlocale(LC_COLLATE, 0);
378
379			if (!$locale = setlocale(LC_COLLATE, $locale))
380			{
381				$locale = $locale0;
382			}
383
384			// See if we have successfully set locale to UTF-8
385			if (!stristr($locale, 'UTF-8') && stristr($locale, '_') && preg_match('~\.(\d+)$~', $locale, $m))
386			{
387				$encoding = 'CP' . $m[1];
388			}
389			elseif (stristr($locale, 'UTF-8') || stristr($locale, 'utf8'))
390			{
391				$encoding = 'UTF-8';
392			}
393			else
394			{
395				$encoding = 'nonrecodable';
396			}
397
398			// If we successfully set encoding it to utf-8 or encoding is sth weird don't recode
399			if ($encoding == 'UTF-8' || $encoding == 'nonrecodable')
400			{
401				return strcoll($str1, $str2);
402			}
403			else
404			{
405				return strcoll(self::transcode($str1, 'UTF-8', $encoding), self::transcode($str2, 'UTF-8', $encoding));
406			}
407		}
408		else
409		{
410			return strcmp($str1, $str2);
411		}
412	}
413
414	/**
415	 * UTF-8 aware alternative to strcspn
416	 * Find length of initial segment not matching mask
417	 *
418	 * @param   string   $str     The string to process
419	 * @param   string   $mask    The mask
420	 * @param   integer  $start   Optional starting character position (in characters)
421	 * @param   integer  $length  Optional length
422	 *
423	 * @return  integer  The length of the initial segment of str1 which does not contain any of the characters in str2
424	 *
425	 * @see     http://www.php.net/strcspn
426	 * @since   11.1
427	 */
428	public static function strcspn($str, $mask, $start = null, $length = null)
429	{
430		jimport('phputf8.strcspn');
431
432		if ($start === false && $length === false)
433		{
434			return utf8_strcspn($str, $mask);
435		}
436		elseif ($length === false)
437		{
438			return utf8_strcspn($str, $mask, $start);
439		}
440		else
441		{
442			return utf8_strcspn($str, $mask, $start, $length);
443		}
444	}
445
446	/**
447	 * UTF-8 aware alternative to stristr
448	 * Returns all of haystack from the first occurrence of needle to the end.
449	 * needle and haystack are examined in a case-insensitive manner
450	 * Find first occurrence of a string using case insensitive comparison
451	 *
452	 * @param   string  $str     The haystack
453	 * @param   string  $search  The needle
454	 *
455	 * @return string the sub string
456	 *
457	 * @see     http://www.php.net/stristr
458	 * @since   11.1
459	 */
460	public static function stristr($str, $search)
461	{
462		jimport('phputf8.stristr');
463
464		return utf8_stristr($str, $search);
465	}
466
467	/**
468	 * UTF-8 aware alternative to strrev
469	 * Reverse a string
470	 *
471	 * @param   string  $str  String to be reversed
472	 *
473	 * @return  string   The string in reverse character order
474	 *
475	 * @see     http://www.php.net/strrev
476	 * @since   11.1
477	 */
478	public static function strrev($str)
479	{
480		jimport('phputf8.strrev');
481
482		return utf8_strrev($str);
483	}
484
485	/**
486	 * UTF-8 aware alternative to strspn
487	 * Find length of initial segment matching mask
488	 *
489	 * @param   string   $str     The haystack
490	 * @param   string   $mask    The mask
491	 * @param   integer  $start   Start optional
492	 * @param   integer  $length  Length optional
493	 *
494	 * @return  integer
495	 *
496	 * @see     http://www.php.net/strspn
497	 * @since   11.1
498	 */
499	public static function strspn($str, $mask, $start = null, $length = null)
500	{
501		jimport('phputf8.strspn');
502
503		if ($start === null && $length === null)
504		{
505			return utf8_strspn($str, $mask);
506		}
507		elseif ($length === null)
508		{
509			return utf8_strspn($str, $mask, $start);
510		}
511		else
512		{
513			return utf8_strspn($str, $mask, $start, $length);
514		}
515	}
516
517	/**
518	 * UTF-8 aware substr_replace
519	 * Replace text within a portion of a string
520	 *
521	 * @param   string   $str     The haystack
522	 * @param   string   $repl    The replacement string
523	 * @param   integer  $start   Start
524	 * @param   integer  $length  Length (optional)
525	 *
526	 * @return  string
527	 *
528	 * @see     http://www.php.net/substr_replace
529	 * @since   11.1
530	 */
531	public static function substr_replace($str, $repl, $start, $length = null)
532	{
533		// Loaded by library loader
534		if ($length === false)
535		{
536			return utf8_substr_replace($str, $repl, $start);
537		}
538		else
539		{
540			return utf8_substr_replace($str, $repl, $start, $length);
541		}
542	}
543
544	/**
545	 * UTF-8 aware replacement for ltrim()
546	 *
547	 * Strip whitespace (or other characters) from the beginning of a string
548	 * You only need to use this if you are supplying the charlist
549	 * optional arg and it contains UTF-8 characters. Otherwise ltrim will
550	 * work normally on a UTF-8 string
551	 *
552	 * @param   string  $str       The string to be trimmed
553	 * @param   string  $charlist  The optional charlist of additional characters to trim
554	 *
555	 * @return  string  The trimmed string
556	 *
557	 * @see     http://www.php.net/ltrim
558	 * @since   11.1
559	 */
560	public static function ltrim($str, $charlist = false)
561	{
562		if (empty($charlist) && $charlist !== false)
563		{
564			return $str;
565		}
566
567		jimport('phputf8.trim');
568
569		if ($charlist === false)
570		{
571			return utf8_ltrim($str);
572		}
573		else
574		{
575			return utf8_ltrim($str, $charlist);
576		}
577	}
578
579	/**
580	 * UTF-8 aware replacement for rtrim()
581	 * Strip whitespace (or other characters) from the end of a string
582	 * You only need to use this if you are supplying the charlist
583	 * optional arg and it contains UTF-8 characters. Otherwise rtrim will
584	 * work normally on a UTF-8 string
585	 *
586	 * @param   string  $str       The string to be trimmed
587	 * @param   string  $charlist  The optional charlist of additional characters to trim
588	 *
589	 * @return  string  The trimmed string
590	 *
591	 * @see     http://www.php.net/rtrim
592	 * @since   11.1
593	 */
594	public static function rtrim($str, $charlist = false)
595	{
596		if (empty($charlist) && $charlist !== false)
597		{
598			return $str;
599		}
600
601		jimport('phputf8.trim');
602
603		if ($charlist === false)
604		{
605			return utf8_rtrim($str);
606		}
607		else
608		{
609			return utf8_rtrim($str, $charlist);
610		}
611	}
612
613	/**
614	 * UTF-8 aware replacement for trim()
615	 * Strip whitespace (or other characters) from the beginning and end of a string
616	 * Note: you only need to use this if you are supplying the charlist
617	 * optional arg and it contains UTF-8 characters. Otherwise trim will
618	 * work normally on a UTF-8 string
619	 *
620	 * @param   string  $str       The string to be trimmed
621	 * @param   string  $charlist  The optional charlist of additional characters to trim
622	 *
623	 * @return  string  The trimmed string
624	 *
625	 * @see     http://www.php.net/trim
626	 * @since   11.1
627	 */
628	public static function trim($str, $charlist = false)
629	{
630		if (empty($charlist) && $charlist !== false)
631		{
632			return $str;
633		}
634
635		jimport('phputf8.trim');
636
637		if ($charlist === false)
638		{
639			return utf8_trim($str);
640		}
641		else
642		{
643			return utf8_trim($str, $charlist);
644		}
645	}
646
647	/**
648	 * UTF-8 aware alternative to ucfirst
649	 * Make a string's first character uppercase or all words' first character uppercase
650	 *
651	 * @param   string  $str           String to be processed
652	 * @param   string  $delimiter     The words delimiter (null means do not split the string)
653	 * @param   string  $newDelimiter  The new words delimiter (null means equal to $delimiter)
654	 *
655	 * @return  string  If $delimiter is null, return the string with first character as upper case (if applicable)
656	 *                  else consider the string of words separated by the delimiter, apply the ucfirst to each words
657	 *                  and return the string with the new delimiter
658	 *
659	 * @see     http://www.php.net/ucfirst
660	 * @since   11.1
661	 */
662	public static function ucfirst($str, $delimiter = null, $newDelimiter = null)
663	{
664		jimport('phputf8.ucfirst');
665
666		if ($delimiter === null)
667		{
668			return utf8_ucfirst($str);
669		}
670		else
671		{
672			if ($newDelimiter === null)
673			{
674				$newDelimiter = $delimiter;
675			}
676			return implode($newDelimiter, array_map('utf8_ucfirst', explode($delimiter, $str)));
677		}
678	}
679
680	/**
681	 * UTF-8 aware alternative to ucwords
682	 * Uppercase the first character of each word in a string
683	 *
684	 * @param   string  $str  String to be processed
685	 *
686	 * @return  string  String with first char of each word uppercase
687	 *
688	 * @see     http://www.php.net/ucwords
689	 * @since   11.1
690	 */
691	public static function ucwords($str)
692	{
693		jimport('phputf8.ucwords');
694
695		return utf8_ucwords($str);
696	}
697
698	/**
699	 * Transcode a string.
700	 *
701	 * @param   string  $source         The string to transcode.
702	 * @param   string  $from_encoding  The source encoding.
703	 * @param   string  $to_encoding    The target encoding.
704	 *
705	 * @return  mixed  The transcoded string, or null if the source was not a string.
706	 *
707	 * @link    https://bugs.php.net/bug.php?id=48147
708	 *
709	 * @since   11.1
710	 */
711	public static function transcode($source, $from_encoding, $to_encoding)
712	{
713		if (is_string($source))
714		{
715			switch (ICONV_IMPL)
716			{
717				case 'glibc':
718				return @iconv($from_encoding, $to_encoding . '//TRANSLIT,IGNORE', $source);
719				case 'libiconv':
720				default:
721				return iconv($from_encoding, $to_encoding . '//IGNORE//TRANSLIT', $source);
722			}
723		}
724
725		return null;
726	}
727
728	/**
729	 * Tests a string as to whether it's valid UTF-8 and supported by the Unicode standard.
730	 *
731	 * Note: this function has been modified to simple return true or false.
732	 *
733	 * @param   string  $str  UTF-8 encoded string.
734	 *
735	 * @return  boolean  true if valid
736	 *
737	 * @author  <hsivonen@iki.fi>
738	 * @see     http://hsivonen.iki.fi/php-utf8/
739	 * @see     compliant
740	 * @since   11.1
741	 */
742	public static function valid($str)
743	{
744		// Cached expected number of octets after the current octet
745		// until the beginning of the next UTF8 character sequence
746		$mState = 0;
747
748		// Cached Unicode character
749		$mUcs4 = 0;
750
751		// Cached expected number of octets in the current sequence
752		$mBytes = 1;
753
754		$len = strlen($str);
755
756		for ($i = 0; $i < $len; $i++)
757		{
758			$in = ord($str{$i});
759
760			if ($mState == 0)
761			{
762				// When mState is zero we expect either a US-ASCII character or a
763				// multi-octet sequence.
764				if (0 == (0x80 & ($in)))
765				{
766					// US-ASCII, pass straight through.
767					$mBytes = 1;
768				}
769				elseif (0xC0 == (0xE0 & ($in)))
770				{
771					// First octet of 2 octet sequence
772					$mUcs4 = ($in);
773					$mUcs4 = ($mUcs4 & 0x1F) << 6;
774					$mState = 1;
775					$mBytes = 2;
776				}
777				elseif (0xE0 == (0xF0 & ($in)))
778				{
779					// First octet of 3 octet sequence
780					$mUcs4 = ($in);
781					$mUcs4 = ($mUcs4 & 0x0F) << 12;
782					$mState = 2;
783					$mBytes = 3;
784				}
785				elseif (0xF0 == (0xF8 & ($in)))
786				{
787					// First octet of 4 octet sequence
788					$mUcs4 = ($in);
789					$mUcs4 = ($mUcs4 & 0x07) << 18;
790					$mState = 3;
791					$mBytes = 4;
792				}
793				elseif (0xF8 == (0xFC & ($in)))
794				{
795					/* First octet of 5 octet sequence.
796					 *
797					 * This is illegal because the encoded codepoint must be either
798					 * (a) not the shortest form or
799					 * (b) outside the Unicode range of 0-0x10FFFF.
800					 * Rather than trying to resynchronize, we will carry on until the end
801					 * of the sequence and let the later error handling code catch it.
802					 */
803					$mUcs4 = ($in);
804					$mUcs4 = ($mUcs4 & 0x03) << 24;
805					$mState = 4;
806					$mBytes = 5;
807				}
808				elseif (0xFC == (0xFE & ($in)))
809				{
810					// First octet of 6 octet sequence, see comments for 5 octet sequence.
811					$mUcs4 = ($in);
812					$mUcs4 = ($mUcs4 & 1) << 30;
813					$mState = 5;
814					$mBytes = 6;
815
816				}
817				else
818				{
819					/* Current octet is neither in the US-ASCII range nor a legal first
820					 * octet of a multi-octet sequence.
821					 */
822					return false;
823				}
824			}
825			else
826			{
827				// When mState is non-zero, we expect a continuation of the multi-octet
828				// sequence
829				if (0x80 == (0xC0 & ($in)))
830				{
831					// Legal continuation.
832					$shift = ($mState - 1) * 6;
833					$tmp = $in;
834					$tmp = ($tmp & 0x0000003F) << $shift;
835					$mUcs4 |= $tmp;
836
837					/**
838					 * End of the multi-octet sequence. mUcs4 now contains the final
839					 * Unicode codepoint to be output
840					 */
841					if (0 == --$mState)
842					{
843						/*
844						 * Check for illegal sequences and codepoints.
845						 */
846						// From Unicode 3.1, non-shortest form is illegal
847						if (((2 == $mBytes) && ($mUcs4 < 0x0080)) || ((3 == $mBytes) && ($mUcs4 < 0x0800)) || ((4 == $mBytes) && ($mUcs4 < 0x10000))
848							|| (4 < $mBytes)
849							|| (($mUcs4 & 0xFFFFF800) == 0xD800) // From Unicode 3.2, surrogate characters are illegal
850							|| ($mUcs4 > 0x10FFFF)) // Codepoints outside the Unicode range are illegal
851						{
852							return false;
853						}
854
855						// Initialize UTF8 cache.
856						$mState = 0;
857						$mUcs4 = 0;
858						$mBytes = 1;
859					}
860				}
861				else
862				{
863					/**
864					 *((0xC0 & (*in) != 0x80) && (mState != 0))
865					 * Incomplete multi-octet sequence.
866					 */
867					return false;
868				}
869			}
870		}
871		return true;
872	}
873
874	/**
875	 * Tests whether a string complies as UTF-8. This will be much
876	 * faster than utf8_is_valid but will pass five and six octet
877	 * UTF-8 sequences, which are not supported by Unicode and
878	 * so cannot be displayed correctly in a browser. In other words
879	 * it is not as strict as utf8_is_valid but it's faster. If you use
880	 * it to validate user input, you place yourself at the risk that
881	 * attackers will be able to inject 5 and 6 byte sequences (which
882	 * may or may not be a significant risk, depending on what you are
883	 * are doing)
884	 *
885	 * @param   string  $str  UTF-8 string to check
886	 *
887	 * @return  boolean  TRUE if string is valid UTF-8
888	 *
889	 * @see     valid
890	 * @see     http://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805
891	 * @since   11.1
892	 */
893	public static function compliant($str)
894	{
895		if (strlen($str) == 0)
896		{
897			return true;
898		}
899
900		/*
901		 * If even just the first character can be matched, when the /u
902		 * modifier is used, then it's valid UTF-8. If the UTF-8 is somehow
903		 * invalid, nothing at all will match, even if the string contains
904		 * some valid sequences
905		 */
906		return (preg_match('/^.{1}/us', $str, $ar) == 1);
907	}
908
909	/**
910	 * Does a UTF-8 safe version of PHP parse_url function
911	 *
912	 * @param   string  $url  URL to parse
913	 *
914	 * @return  mixed  Associative array or false if badly formed URL.
915	 *
916	 * @see     http://us3.php.net/manual/en/function.parse-url.php
917	 * @since   11.1
918	 */
919	public static function parse_url($url)
920	{
921		$result = false;
922
923		// Build arrays of values we need to decode before parsing
924		$entities = array('%21', '%2A', '%27', '%28', '%29', '%3B', '%3A', '%40', '%26', '%3D', '%24', '%2C', '%2F', '%3F', '%23', '%5B', '%5D');
925		$replacements = array('!', '*', "'", "(", ")", ";", ":", "@", "&", "=", "$", ",", "/", "?", "#", "[", "]");
926
927		// Create encoded URL with special URL characters decoded so it can be parsed
928		// All other characters will be encoded
929		$encodedURL = str_replace($entities, $replacements, urlencode($url));
930
931		// Parse the encoded URL
932		$encodedParts = parse_url($encodedURL);
933
934		// Now, decode each value of the resulting array
935		if ($encodedParts)
936		{
937			foreach ($encodedParts as $key => $value)
938			{
939				$result[$key] = urldecode(str_replace($replacements, $entities, $value));
940			}
941		}
942		return $result;
943	}
944}