PageRenderTime 81ms CodeModel.GetById 58ms app.highlight 17ms RepoModel.GetById 1ms app.codeStats 0ms

/libraries/joomla/string/string.php

https://bitbucket.org/asosso/joomla25
PHP | 976 lines | 449 code | 54 blank | 473 comment | 77 complexity | bb820c85860c51d199655215b29f18e8 MD5 | raw file
  1<?php
  2/**
  3 * @package     Joomla.Platform
  4 * @subpackage  String
  5 *
  6 * @copyright   Copyright (C) 2005 - 2013 Open Source Matters, Inc. All rights reserved.
  7 * @license     GNU General Public License version 2 or later; see LICENSE
  8 */
  9
 10defined('JPATH_PLATFORM') or die;
 11
 12//
 13// PHP mbstring and iconv local configuration
 14//
 15// Check if mbstring extension is loaded and attempt to load it if not present except for windows
 16if (extension_loaded('mbstring') || ((!strtoupper(substr(PHP_OS, 0, 3)) === 'WIN' && dl('mbstring.so'))))
 17{
 18	// Make sure to suppress the output in case ini_set is disabled
 19	@ini_set('mbstring.internal_encoding', 'UTF-8');
 20	@ini_set('mbstring.http_input', 'UTF-8');
 21	@ini_set('mbstring.http_output', 'UTF-8');
 22}
 23
 24// Same for iconv
 25if (function_exists('iconv') || ((!strtoupper(substr(PHP_OS, 0, 3)) === 'WIN' && dl('iconv.so'))))
 26{
 27	// These are settings that can be set inside code
 28	iconv_set_encoding("internal_encoding", "UTF-8");
 29	iconv_set_encoding("input_encoding", "UTF-8");
 30	iconv_set_encoding("output_encoding", "UTF-8");
 31}
 32
 33/**
 34 * Include the utf8 package
 35 */
 36jimport('phputf8.utf8');
 37jimport('phputf8.strcasecmp');
 38
 39/**
 40 * String handling class for utf-8 data
 41 * Wraps the phputf8 library
 42 * All functions assume the validity of utf-8 strings.
 43 *
 44 * @package     Joomla.Platform
 45 * @subpackage  String
 46 * @since       11.1
 47 */
 48abstract class JString
 49{
 50	/**
 51	 * Increment styles.
 52	 *
 53	 * @var    array
 54	 * @since  11.3
 55	 */
 56	protected static $incrementStyles = array(
 57		'dash' => array(
 58			'#-(\d+)$#',
 59			'-%d'
 60		),
 61		'default' => array(
 62			array('#\((\d+)\)$#', '#\(\d+\)$#'),
 63			array(' (%d)', '(%d)'),
 64		),
 65	);
 66
 67	/**
 68	 * Split a string in camel case format
 69	 *
 70	 * "FooBarABCDef"            becomes  array("Foo", "Bar", "ABC", "Def");
 71	 * "JFooBar"                 becomes  array("J", "Foo", "Bar");
 72	 * "J001FooBar002"           becomes  array("J001", "Foo", "Bar002");
 73	 * "abcDef"                  becomes  array("abc", "Def");
 74	 * "abc_defGhi_Jkl"          becomes  array("abc_def", "Ghi_Jkl");
 75	 * "ThisIsA_NASAAstronaut"   becomes  array("This", "Is", "A_NASA", "Astronaut")),
 76	 * "JohnFitzgerald_Kennedy"  becomes  array("John", "Fitzgerald_Kennedy")),
 77	 *
 78	 * @param   string  $string  The source string.
 79	 *
 80	 * @return  array   The split string.
 81	 *
 82	 * @since   11.3
 83	 */
	public static function splitCamelCase($string)
	{
		// Split, without consuming any character, at two kinds of boundaries:
		//  1. after a character that is neither an upper case letter nor "_", before an upper case letter;
		//  2. between two upper case letters when the second one is followed by a character
		//     that is neither an upper case letter nor "_" (i.e. it starts a new word).
		$boundaryPattern = '/(?<=[^A-Z_])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][^A-Z_])/x';
		$segments = preg_split($boundaryPattern, $string);

		return $segments;
	}
 88
 89	/**
 90	 * Increments a trailing number in a string.
 91	 *
 92	 * Used to easily create distinct labels when copying objects. The method has the following styles:
 93	 *
 94	 * default: "Label" becomes "Label (2)"
 95	 * dash:    "Label" becomes "Label-2"
 96	 *
 97	 * @param   string   $string  The source string.
 98	 * @param   string   $style   The style (default|dash).
 99	 * @param   integer  $n       If supplied, this number is used for the copy, otherwise it is the 'next' number.
100	 *
101	 * @return  string  The incremented string.
102	 *
103	 * @since   11.3
104	 */
105	public static function increment($string, $style = 'default', $n = 0)
106	{
107		$styleSpec = isset(self::$incrementStyles[$style]) ? self::$incrementStyles[$style] : self::$incrementStyles['default'];
108
109		// Regular expression search and replace patterns.
110		if (is_array($styleSpec[0]))
111		{
112			$rxSearch = $styleSpec[0][0];
113			$rxReplace = $styleSpec[0][1];
114		}
115		else
116		{
117			$rxSearch = $rxReplace = $styleSpec[0];
118		}
119
120		// New and old (existing) sprintf formats.
121		if (is_array($styleSpec[1]))
122		{
123			$newFormat = $styleSpec[1][0];
124			$oldFormat = $styleSpec[1][1];
125		}
126		else
127		{
128			$newFormat = $oldFormat = $styleSpec[1];
129		}
130
131		// Check if we are incrementing an existing pattern, or appending a new one.
132		if (preg_match($rxSearch, $string, $matches))
133		{
134			$n = empty($n) ? ($matches[1] + 1) : $n;
135			$string = preg_replace($rxReplace, sprintf($oldFormat, $n), $string);
136		}
137		else
138		{
139			$n = empty($n) ? 2 : $n;
140			$string .= sprintf($newFormat, $n);
141		}
142
143		return $string;
144	}
145
146	/**
147	 * UTF-8 aware alternative to strpos.
148	 *
149	 * Find position of first occurrence of a string.
150	 *
151	 * @param   string   $str     String being examined
152	 * @param   string   $search  String being searched for
153	 * @param   integer  $offset  Optional, specifies the position from which the search should be performed
154	 *
155	 * @return  mixed  Number of characters before the first match or FALSE on failure
156	 *
157	 * @see     http://www.php.net/strpos
158	 * @since   11.1
159	 */
160	public static function strpos($str, $search, $offset = false)
161	{
162		if ($offset === false)
163		{
164			return utf8_strpos($str, $search);
165		}
166		else
167		{
168			return utf8_strpos($str, $search, $offset);
169		}
170	}
171
172	/**
173	 * UTF-8 aware alternative to strrpos
174	 * Finds position of last occurrence of a string
175	 *
176	 * @param   string   $str     String being examined.
177	 * @param   string   $search  String being searched for.
178	 * @param   integer  $offset  Offset from the left of the string.
179	 *
180	 * @return  mixed  Number of characters before the last match or false on failure
181	 *
182	 * @see     http://www.php.net/strrpos
183	 * @since   11.1
184	 */
185	public static function strrpos($str, $search, $offset = 0)
186	{
187		return utf8_strrpos($str, $search, $offset);
188	}
189
190	/**
191	 * UTF-8 aware alternative to substr
192	 * Return part of a string given character offset (and optionally length)
193	 *
194	 * @param   string   $str     String being processed
195	 * @param   integer  $offset  Number of UTF-8 characters offset (from left)
196	 * @param   integer  $length  Optional length in UTF-8 characters from offset
197	 *
198	 * @return  mixed string or FALSE if failure
199	 *
200	 * @see     http://www.php.net/substr
201	 * @since   11.1
202	 */
203	public static function substr($str, $offset, $length = false)
204	{
205		if ($length === false)
206		{
207			return utf8_substr($str, $offset);
208		}
209		else
210		{
211			return utf8_substr($str, $offset, $length);
212		}
213	}
214
215	/**
216	 * UTF-8 aware alternative to strtolower
217	 *
218	 * Make a string lowercase
219	 * Note: The concept of a character's "case" only exists in some alphabets
220	 * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
221	 * not exist in the Chinese alphabet, for example. See Unicode Standard
222	 * Annex #21: Case Mappings
223	 *
224	 * @param   string  $str  String being processed
225	 *
226	 * @return  mixed  Either string in lowercase or FALSE if UTF-8 invalid
227	 *
228	 * @see http://www.php.net/strtolower
229	 * @since   11.1
230	 */
231	public static function strtolower($str)
232	{
233		return utf8_strtolower($str);
234	}
235
236	/**
237	 * UTF-8 aware alternative to strtoupper
238	 * Make a string uppercase
239	 * Note: The concept of a character's "case" only exists in some alphabets
240	 * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
241	 * not exist in the Chinese alphabet, for example. See Unicode Standard
242	 * Annex #21: Case Mappings
243	 *
244	 * @param   string  $str  String being processed
245	 *
246	 * @return  mixed  Either string in uppercase or FALSE if UTF-8 invalid
247	 *
248	 * @see     http://www.php.net/strtoupper
249	 * @since   11.1
250	 */
251	public static function strtoupper($str)
252	{
253		return utf8_strtoupper($str);
254	}
255
256	/**
257	 * UTF-8 aware alternative to strlen.
258	 *
259	 * Returns the number of characters in the string (NOT THE NUMBER OF BYTES),
260	 *
261	 * @param   string  $str  UTF-8 string.
262	 *
263	 * @return  integer  Number of UTF-8 characters in string.
264	 *
265	 * @see http://www.php.net/strlen
266	 * @since   11.1
267	 */
268	public static function strlen($str)
269	{
270		return utf8_strlen($str);
271	}
272
273	/**
274	 * UTF-8 aware alternative to str_ireplace
275	 * Case-insensitive version of str_replace
276	 *
277	 * @param   string   $search   String to search
278	 * @param   string   $replace  String to replace each match with
279	 * @param   string   $str      String in which the search and replace is performed
280	 * @param   integer  $count    Optional count value forwarded to utf8_ireplace (passed by value, not by reference)
281	 *
282	 * @return  string  UTF-8 String
283	 *
284	 * @see     http://www.php.net/str_ireplace
285	 * @since   11.1
286	 */
287	public static function str_ireplace($search, $replace, $str, $count = null)
288	{
289		jimport('phputf8.str_ireplace');
290		if ($count === false)
291		{
292			return utf8_ireplace($search, $replace, $str);
293		}
294		else
295		{
296			return utf8_ireplace($search, $replace, $str, $count);
297		}
298	}
299
300	/**
301	 * UTF-8 aware alternative to str_split
302	 * Convert a string to an array
303	 *
304	 * @param   string   $str        UTF-8 encoded string to process
305	 * @param   integer  $split_len  Number of characters to split string by
306	 *
307	 * @return  array
308	 *
309	 * @see     http://www.php.net/str_split
310	 * @since   11.1
311	 */
312	public static function str_split($str, $split_len = 1)
313	{
314		jimport('phputf8.str_split');
315
316		return utf8_str_split($str, $split_len);
317	}
318
319	/**
320	 * UTF-8/LOCALE aware alternative to strcasecmp
321	 * A case insensitive string comparison
322	 *
323	 * @param   string  $str1    string 1 to compare
324	 * @param   string  $str2    string 2 to compare
325	 * @param   mixed   $locale  The locale used by strcoll or false to use classical comparison
326	 *
327	 * @return  integer   < 0 if str1 is less than str2; > 0 if str1 is greater than str2, and 0 if they are equal.
328	 *
329	 * @see     http://www.php.net/strcasecmp
330	 * @see     http://www.php.net/strcoll
331	 * @see     http://www.php.net/setlocale
332	 * @since   11.1
333	 */
334	public static function strcasecmp($str1, $str2, $locale = false)
335	{
336		if ($locale)
337		{
338			// Get current locale
339			$locale0 = setlocale(LC_COLLATE, 0);
340			if (!$locale = setlocale(LC_COLLATE, $locale))
341			{
342				$locale = $locale0;
343			}
344
345			// See if we have successfully set locale to UTF-8
346			if (!stristr($locale, 'UTF-8') && stristr($locale, '_') && preg_match('~\.(\d+)$~', $locale, $m))
347			{
348				$encoding = 'CP' . $m[1];
349			}
350			elseif (stristr($locale, 'UTF-8'))
351			{
352				$encoding = 'UTF-8';
353			}
354			else
355			{
356				$encoding = 'nonrecodable';
357			}
358
359			// if we successfully set encoding it to utf-8 or encoding is sth weird don't recode
360			if ($encoding == 'UTF-8' || $encoding == 'nonrecodable')
361			{
362				return strcoll(utf8_strtolower($str1), utf8_strtolower($str2));
363			}
364			else
365			{
366				return strcoll(
367					self::transcode(utf8_strtolower($str1), 'UTF-8', $encoding),
368					self::transcode(utf8_strtolower($str2), 'UTF-8', $encoding)
369				);
370			}
371		}
372		else
373		{
374			return utf8_strcasecmp($str1, $str2);
375		}
376	}
377
378	/**
379	 * UTF-8/LOCALE aware alternative to strcmp
380	 * A case sensitive string comparison
381	 *
382	 * @param   string  $str1    string 1 to compare
383	 * @param   string  $str2    string 2 to compare
384	 * @param   mixed   $locale  The locale used by strcoll or false to use classical comparison
385	 *
386	 * @return  integer  < 0 if str1 is less than str2; > 0 if str1 is greater than str2, and 0 if they are equal.
387	 *
388	 * @see     http://www.php.net/strcmp
389	 * @see     http://www.php.net/strcoll
390	 * @see     http://www.php.net/setlocale
391	 * @since   11.1
392	 */
393	public static function strcmp($str1, $str2, $locale = false)
394	{
395		if ($locale)
396		{
397			// Get current locale
398			$locale0 = setlocale(LC_COLLATE, 0);
399			if (!$locale = setlocale(LC_COLLATE, $locale))
400			{
401				$locale = $locale0;
402			}
403
404			// See if we have successfully set locale to UTF-8
405			if (!stristr($locale, 'UTF-8') && stristr($locale, '_') && preg_match('~\.(\d+)$~', $locale, $m))
406			{
407				$encoding = 'CP' . $m[1];
408			}
409			elseif (stristr($locale, 'UTF-8'))
410			{
411				$encoding = 'UTF-8';
412			}
413			else
414			{
415				$encoding = 'nonrecodable';
416			}
417
418			// If we successfully set encoding it to utf-8 or encoding is sth weird don't recode
419			if ($encoding == 'UTF-8' || $encoding == 'nonrecodable')
420			{
421				return strcoll($str1, $str2);
422			}
423			else
424			{
425				return strcoll(self::transcode($str1, 'UTF-8', $encoding), self::transcode($str2, 'UTF-8', $encoding));
426			}
427		}
428		else
429		{
430			return strcmp($str1, $str2);
431		}
432	}
433
434	/**
435	 * UTF-8 aware alternative to strcspn
436	 * Find length of initial segment not matching mask
437	 *
438	 * @param   string   $str     The string to process
439	 * @param   string   $mask    The mask
440	 * @param   integer  $start   Optional starting character position (in characters)
441	 * @param   integer  $length  Optional length
442	 *
443	 * @return  integer  The length of the initial segment of str1 which does not contain any of the characters in str2
444	 *
445	 * @see     http://www.php.net/strcspn
446	 * @since   11.1
447	 */
448	public static function strcspn($str, $mask, $start = null, $length = null)
449	{
450		jimport('phputf8.strcspn');
451		if ($start === false && $length === false)
452		{
453			return utf8_strcspn($str, $mask);
454		}
455		elseif ($length === false)
456		{
457			return utf8_strcspn($str, $mask, $start);
458		}
459		else
460		{
461			return utf8_strcspn($str, $mask, $start, $length);
462		}
463	}
464
465	/**
466	 * UTF-8 aware alternative to stristr
467	 * Returns all of haystack from the first occurrence of needle to the end.
468	 * needle and haystack are examined in a case-insensitive manner
469	 * Find first occurrence of a string using case insensitive comparison
470	 *
471	 * @param   string  $str     The haystack
472	 * @param   string  $search  The needle
473	 *
474	 * @return string the sub string
475	 *
476	 * @see     http://www.php.net/stristr
477	 * @since   11.1
478	 */
479	public static function stristr($str, $search)
480	{
481		jimport('phputf8.stristr');
482		return utf8_stristr($str, $search);
483	}
484
485	/**
486	 * UTF-8 aware alternative to strrev
487	 * Reverse a string
488	 *
489	 * @param   string  $str  String to be reversed
490	 *
491	 * @return  string   The string in reverse character order
492	 *
493	 * @see     http://www.php.net/strrev
494	 * @since   11.1
495	 */
496	public static function strrev($str)
497	{
498		jimport('phputf8.strrev');
499
500		return utf8_strrev($str);
501	}
502
503	/**
504	 * UTF-8 aware alternative to strspn
505	 * Find length of initial segment matching mask
506	 *
507	 * @param   string   $str     The haystack
508	 * @param   string   $mask    The mask
509	 * @param   integer  $start   Start optional
510	 * @param   integer  $length  Length optional
511	 *
512	 * @return  integer
513	 *
514	 * @see     http://www.php.net/strspn
515	 * @since   11.1
516	 */
517	public static function strspn($str, $mask, $start = null, $length = null)
518	{
519		jimport('phputf8.strspn');
520		if ($start === null && $length === null)
521		{
522			return utf8_strspn($str, $mask);
523		}
524		elseif ($length === null)
525		{
526			return utf8_strspn($str, $mask, $start);
527		}
528		else
529		{
530			return utf8_strspn($str, $mask, $start, $length);
531		}
532	}
533
534	/**
535	 * UTF-8 aware substr_replace
536	 * Replace text within a portion of a string
537	 *
538	 * @param   string   $str     The haystack
539	 * @param   string   $repl    The replacement string
540	 * @param   integer  $start   Start
541	 * @param   integer  $length  Length (optional)
542	 *
543	 * @return  string
544	 *
545	 * @see     http://www.php.net/substr_replace
546	 * @since   11.1
547	 */
548	public static function substr_replace($str, $repl, $start, $length = null)
549	{
550		// loaded by library loader
551		if ($length === false)
552		{
553			return utf8_substr_replace($str, $repl, $start);
554		}
555		else
556		{
557			return utf8_substr_replace($str, $repl, $start, $length);
558		}
559	}
560
561	/**
562	 * UTF-8 aware replacement for ltrim()
563	 *
564	 * Strip whitespace (or other characters) from the beginning of a string
565	 * You only need to use this if you are supplying the charlist
566	 * optional arg and it contains UTF-8 characters. Otherwise ltrim will
567	 * work normally on a UTF-8 string
568	 *
569	 * @param   string  $str       The string to be trimmed
570	 * @param   string  $charlist  The optional charlist of additional characters to trim
571	 *
572	 * @return  string  The trimmed string
573	 *
574	 * @see     http://www.php.net/ltrim
575	 * @since   11.1
576	 */
577	public static function ltrim($str, $charlist = false)
578	{
579		if (empty($charlist) && $charlist !== false)
580		{
581			return $str;
582		}
583
584		jimport('phputf8.trim');
585		if ($charlist === false)
586		{
587			return utf8_ltrim($str);
588		}
589		else
590		{
591			return utf8_ltrim($str, $charlist);
592		}
593	}
594
595	/**
596	 * UTF-8 aware replacement for rtrim()
597	 * Strip whitespace (or other characters) from the end of a string
598	 * You only need to use this if you are supplying the charlist
599	 * optional arg and it contains UTF-8 characters. Otherwise rtrim will
600	 * work normally on a UTF-8 string
601	 *
602	 * @param   string  $str       The string to be trimmed
603	 * @param   string  $charlist  The optional charlist of additional characters to trim
604	 *
605	 * @return  string  The trimmed string
606	 *
607	 * @see     http://www.php.net/rtrim
608	 * @since   11.1
609	 */
610	public static function rtrim($str, $charlist = false)
611	{
612		if (empty($charlist) && $charlist !== false)
613		{
614			return $str;
615		}
616
617		jimport('phputf8.trim');
618		if ($charlist === false)
619		{
620			return utf8_rtrim($str);
621		}
622		else
623		{
624			return utf8_rtrim($str, $charlist);
625		}
626	}
627
628	/**
629	 * UTF-8 aware replacement for trim()
630	 * Strip whitespace (or other characters) from the beginning and end of a string
631	 * Note: you only need to use this if you are supplying the charlist
632	 * optional arg and it contains UTF-8 characters. Otherwise trim will
633	 * work normally on a UTF-8 string
634	 *
635	 * @param   string  $str       The string to be trimmed
636	 * @param   string  $charlist  The optional charlist of additional characters to trim
637	 *
638	 * @return  string  The trimmed string
639	 *
640	 * @see     http://www.php.net/trim
641	 * @since   11.1
642	 */
643	public static function trim($str, $charlist = false)
644	{
645		if (empty($charlist) && $charlist !== false)
646		{
647			return $str;
648		}
649
650		jimport('phputf8.trim');
651		if ($charlist === false)
652		{
653			return utf8_trim($str);
654		}
655		else
656		{
657			return utf8_trim($str, $charlist);
658		}
659	}
660
661	/**
662	 * UTF-8 aware alternative to ucfirst
663	 * Make a string's first character uppercase or all words' first character uppercase
664	 *
665	 * @param   string  $str           String to be processed
666	 * @param   string  $delimiter     The words delimiter (null means do not split the string)
667	 * @param   string  $newDelimiter  The new words delimiter (null means equal to $delimiter)
668	 *
669	 * @return  string  If $delimiter is null, return the string with first character as upper case (if applicable)
670	 *                  else consider the string of words separated by the delimiter, apply the ucfirst to each words
671	 *                  and return the string with the new delimiter
672	 *
673	 * @see     http://www.php.net/ucfirst
674	 * @since   11.1
675	 */
676	public static function ucfirst($str, $delimiter = null, $newDelimiter = null)
677	{
678		jimport('phputf8.ucfirst');
679		if ($delimiter === null)
680		{
681			return utf8_ucfirst($str);
682		}
683		else
684		{
685			if ($newDelimiter === null)
686			{
687				$newDelimiter = $delimiter;
688			}
689			return implode($newDelimiter, array_map('utf8_ucfirst', explode($delimiter, $str)));
690		}
691	}
692
693	/**
694	 * UTF-8 aware alternative to ucwords
695	 * Uppercase the first character of each word in a string
696	 *
697	 * @param   string  $str  String to be processed
698	 *
699	 * @return  string  String with first char of each word uppercase
700	 *
701	 * @see     http://www.php.net/ucwords
702	 * @since   11.1
703	 */
704	public static function ucwords($str)
705	{
706		jimport('phputf8.ucwords');
707		return utf8_ucwords($str);
708	}
709
710	/**
711	 * Catch an error and throw an exception.
712	 *
713	 * @param   integer  $number   Error level
714	 * @param   string   $message  Error message
715	 *
716	 * @return  void
717	 *
718	 * @link    https://bugs.php.net/bug.php?id=48147
719	 *
720	 * @throws  ErrorException
721	 */
722	private static function _iconvErrorHandler($number, $message)
723	{
724		throw new ErrorException($message, 0, $number);
725	}
726
727	/**
728	 * Transcode a string.
729	 *
730	 * @param   string  $source         The string to transcode.
731	 * @param   string  $from_encoding  The source encoding.
732	 * @param   string  $to_encoding    The target encoding.
733	 *
734	 * @return  mixed  The transcoded string, or null if the source was not a string.
735	 *
736	 * @link    https://bugs.php.net/bug.php?id=48147
737	 *
738	 * @since   11.1
739	 */
740	public static function transcode($source, $from_encoding, $to_encoding)
741	{
742		if (is_string($source))
743		{
744			set_error_handler(array(__CLASS__, '_iconvErrorHandler'), E_NOTICE);
745			try
746			{
747				/*
748				 * "//TRANSLIT//IGNORE" is appended to the $to_encoding to ensure that when iconv comes
749				 * across a character that cannot be represented in the target charset, it can
750				 * be approximated through one or several similarly looking characters or ignored.
751				 */
752				$iconv = iconv($from_encoding, $to_encoding . '//TRANSLIT//IGNORE', $source);
753			}
754			catch (ErrorException $e)
755			{
756				/*
757				 * "//IGNORE" is appended to the $to_encoding to ensure that when iconv comes
758				 * across a character that cannot be represented in the target charset, it is ignored.
759				 */
760				$iconv = iconv($from_encoding, $to_encoding . '//IGNORE', $source);
761			}
762			restore_error_handler();
763			return $iconv;
764		}
765
766		return null;
767	}
768
769	/**
770	 * Tests a string as to whether it's valid UTF-8 and supported by the Unicode standard.
771	 *
772	 * Note: this function has been modified to simply return true or false.
773	 *
774	 * @param   string  $str  UTF-8 encoded string.
775	 *
776	 * @return  boolean  true if valid
777	 *
778	 * @author  <hsivonen@iki.fi>
779	 * @see     http://hsivonen.iki.fi/php-utf8/
780	 * @see     compliant
781	 * @since   11.1
782	 */
783	public static function valid($str)
784	{
785		// Cached expected number of octets after the current octet
786		// until the beginning of the next UTF8 character sequence
787		$mState = 0;
788
789		// Cached Unicode character
790		$mUcs4 = 0;
791
792		// Cached expected number of octets in the current sequence
793		$mBytes = 1;
794
795		$len = strlen($str);
796
797		for ($i = 0; $i < $len; $i++)
798		{
799			$in = ord($str{$i});
800
801			if ($mState == 0)
802			{
803				// When mState is zero we expect either a US-ASCII character or a
804				// multi-octet sequence.
805				if (0 == (0x80 & ($in)))
806				{
807					// US-ASCII, pass straight through.
808					$mBytes = 1;
809				}
810				elseif (0xC0 == (0xE0 & ($in)))
811				{
812					// First octet of 2 octet sequence
813					$mUcs4 = ($in);
814					$mUcs4 = ($mUcs4 & 0x1F) << 6;
815					$mState = 1;
816					$mBytes = 2;
817				}
818				elseif (0xE0 == (0xF0 & ($in)))
819				{
820					// First octet of 3 octet sequence
821					$mUcs4 = ($in);
822					$mUcs4 = ($mUcs4 & 0x0F) << 12;
823					$mState = 2;
824					$mBytes = 3;
825				}
826				elseif (0xF0 == (0xF8 & ($in)))
827				{
828					// First octet of 4 octet sequence
829					$mUcs4 = ($in);
830					$mUcs4 = ($mUcs4 & 0x07) << 18;
831					$mState = 3;
832					$mBytes = 4;
833				}
834				elseif (0xF8 == (0xFC & ($in)))
835				{
836					/* First octet of 5 octet sequence.
837					 *
838					 * This is illegal because the encoded codepoint must be either
839					 * (a) not the shortest form or
840					 * (b) outside the Unicode range of 0-0x10FFFF.
841					 * Rather than trying to resynchronize, we will carry on until the end
842					 * of the sequence and let the later error handling code catch it.
843					 */
844					$mUcs4 = ($in);
845					$mUcs4 = ($mUcs4 & 0x03) << 24;
846					$mState = 4;
847					$mBytes = 5;
848				}
849				elseif (0xFC == (0xFE & ($in)))
850				{
851					// First octet of 6 octet sequence, see comments for 5 octet sequence.
852					$mUcs4 = ($in);
853					$mUcs4 = ($mUcs4 & 1) << 30;
854					$mState = 5;
855					$mBytes = 6;
856
857				}
858				else
859				{
860					/* Current octet is neither in the US-ASCII range nor a legal first
861					 * octet of a multi-octet sequence.
862					 */
863					return false;
864				}
865			}
866			else
867			{
868				// When mState is non-zero, we expect a continuation of the multi-octet
869				// sequence
870				if (0x80 == (0xC0 & ($in)))
871				{
872					// Legal continuation.
873					$shift = ($mState - 1) * 6;
874					$tmp = $in;
875					$tmp = ($tmp & 0x0000003F) << $shift;
876					$mUcs4 |= $tmp;
877
878					/**
879					 * End of the multi-octet sequence. mUcs4 now contains the final
880					 * Unicode codepoint to be output
881					 */
882					if (0 == --$mState)
883					{
884						/*
885						 * Check for illegal sequences and codepoints.
886						 */
887						// From Unicode 3.1, non-shortest form is illegal
888						if (((2 == $mBytes) && ($mUcs4 < 0x0080)) || ((3 == $mBytes) && ($mUcs4 < 0x0800)) || ((4 == $mBytes) && ($mUcs4 < 0x10000))
889							|| (4 < $mBytes)
890							|| (($mUcs4 & 0xFFFFF800) == 0xD800) // From Unicode 3.2, surrogate characters are illegal
891							|| ($mUcs4 > 0x10FFFF)) // Codepoints outside the Unicode range are illegal
892						{
893							return false;
894						}
895
896						// Initialize UTF8 cache.
897						$mState = 0;
898						$mUcs4 = 0;
899						$mBytes = 1;
900					}
901				}
902				else
903				{
904					/**
905					 *((0xC0 & (*in) != 0x80) && (mState != 0))
906					 * Incomplete multi-octet sequence.
907					 */
908					return false;
909				}
910			}
911		}
912		return true;
913	}
914
915	/**
916	 * Tests whether a string complies as UTF-8. This will be much
917	 * faster than utf8_is_valid but will pass five and six octet
918	 * UTF-8 sequences, which are not supported by Unicode and
919	 * so cannot be displayed correctly in a browser. In other words
920	 * it is not as strict as utf8_is_valid but it's faster. If you use
921	 * it to validate user input, you place yourself at the risk that
922	 * attackers will be able to inject 5 and 6 byte sequences (which
923	 * may or may not be a significant risk, depending on what you
924	 * are doing)
925	 *
926	 * @param   string  $str  UTF-8 string to check
927	 *
928	 * @return  boolean  TRUE if string is valid UTF-8
929	 *
930	 * @see     valid
931	 * @see     http://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805
932	 * @since   11.1
933	 */
934	public static function compliant($str)
935	{
936		if (strlen($str) == 0)
937		{
938			return true;
939		}
940		// If even just the first character can be matched, when the /u
941		// modifier is used, then it's valid UTF-8. If the UTF-8 is somehow
942		// invalid, nothing at all will match, even if the string contains
943		// some valid sequences
944		return (preg_match('/^.{1}/us', $str, $ar) == 1);
945	}
946
947	/**
948	 * Does a UTF-8 safe version of PHP parse_url function
949	 *
950	 * @param   string  $url  URL to parse
951	 *
952	 * @return  mixed  Associative array or false if badly formed URL.
953	 *
954	 * @see     http://us3.php.net/manual/en/function.parse-url.php
955	 * @since   11.1
956	 */
957	public static function parse_url($url)
958	{
959		$result = array();
960		// Build arrays of values we need to decode before parsing
961		$entities = array('%21', '%2A', '%27', '%28', '%29', '%3B', '%3A', '%40', '%26', '%3D', '%24', '%2C', '%2F', '%3F', '%25', '%23', '%5B',
962			'%5D');
963		$replacements = array('!', '*', "'", "(", ")", ";", ":", "@", "&", "=", "$", ",", "/", "?", "%", "#", "[", "]");
964		// Create encoded URL with special URL characters decoded so it can be parsed
965		// All other characters will be encoded
966		$encodedURL = str_replace($entities, $replacements, urlencode($url));
967		// Parse the encoded URL
968		$encodedParts = parse_url($encodedURL);
969		// Now, decode each value of the resulting array
970		foreach ($encodedParts as $key => $value)
971		{
972			$result[$key] = urldecode($value);
973		}
974		return $result;
975	}
976}