PageRenderTime 148ms CodeModel.GetById 61ms app.highlight 32ms RepoModel.GetById 48ms app.codeStats 0ms

/libraries/joomla/utilities/string.php

https://bitbucket.org/joomla/joomla-platform/
PHP | 698 lines | 304 code | 42 blank | 352 comment | 99 complexity | c48502aaadcc10f8d3d9015a8f052b6c MD5 | raw file
  1<?php
  2/**
  3 * @package     Joomla.Platform
  4 * @subpackage  Utilities
  5 *
  6 * @copyright   Copyright (C) 2005 - 2011 Open Source Matters, Inc. All rights reserved.
  7 * @license     GNU General Public License version 2 or later; see LICENSE
  8 */
  9
 10defined('JPATH_PLATFORM') or die;
 11
 12/**
 13 * PHP mbstring and iconv local configuration
 14 */
 15// Check if mbstring extension is loaded and attempt to load it if not present except for windows
 16if (extension_loaded('mbstring') || ((!strtoupper(substr(PHP_OS, 0, 3)) === 'WIN' && dl('mbstring.so')))) {
 17	// Make sure to surpress the output in case ini_set is disabled
 18	@ini_set('mbstring.internal_encoding', 'UTF-8');
 19	@ini_set('mbstring.http_input', 'UTF-8');
 20	@ini_set('mbstring.http_output', 'UTF-8');
 21}
 22
 23// Same for iconv
 24if (function_exists('iconv') || ((!strtoupper(substr(PHP_OS, 0, 3)) === 'WIN' && dl('iconv.so')))) {
 25	// These are settings that can be set inside code
 26	iconv_set_encoding("internal_encoding", "UTF-8");
 27	iconv_set_encoding("input_encoding", "UTF-8");
 28	iconv_set_encoding("output_encoding", "UTF-8");
 29}
 30
 31/**
 32 * Include the utf8 package
 33 */
 34jimport('phputf8.utf8');
 35jimport('phputf8.strcasecmp');
 36
 37/**
 38 * String handling class for utf-8 data
 39 * Wraps the phputf8 library
 40 * All functions assume the validity of utf-8 strings.
 41 *
 42 * @static
 43 * @package		Joomla.Platform
 44 * @subpackage	Utilities
 45 * @since		11.1
 46 */
 47abstract class JString
 48{
 49	/**
 50	 * UTF-8 aware alternative to strpos
 51	 * Find position of first occurrence of a string
 52	 *
 53	 * @param $str - string String being examined
 54	 * @param $search - string String being searced for
 55	 * @param $offset - int Optional, specifies the position from which the search should be performed
 56	 * 
 57	 * @return mixed Number of characters before the first match or FALSE on failure
 58	 * @see http://www.php.net/strpos
 59	 */
 60	public static function strpos($str, $search, $offset = FALSE)
 61	{
 62		if ( $offset === FALSE ) {
 63			return utf8_strpos($str, $search);
 64		} else {
 65			return utf8_strpos($str, $search, $offset);
 66		}
 67	}
 68
 69	/**
 70	 * UTF-8 aware alternative to strrpos
 71	 * Finds position of last occurrence of a string
 72	 *
 73	 * @param $str - string String being examined
 74	 * @param $search - string String being searced for
 75	 * @return mixed Number of characters before the last match or FALSE on failure
 76	 * @see http://www.php.net/strrpos
 77	 */
 78	public static function strrpos($str, $search, $offset = false)
 79	{
 80		return utf8_strrpos($str, $search);
 81	}
 82
 83	/**
 84	 * UTF-8 aware alternative to substr
 85	 * Return part of a string given character offset (and optionally length)
 86	 *
 87	 * @param string
 88	 * @param integer number of UTF-8 characters offset (from left)
 89	 * @param integer (optional) length in UTF-8 characters from offset
 90	 * 
 91	 * @return mixed string or FALSE if failure
 92	 * @see http://www.php.net/substr
 93	 */
 94	public static function substr($str, $offset, $length = FALSE)
 95	{
 96		if ($length === FALSE) {
 97			return utf8_substr($str, $offset);
 98		} else {
 99			return utf8_substr($str, $offset, $length);
100		}
101	}
102
103	/**
104	 * UTF-8 aware alternative to strtlower
105	 * Make a string lowercase
106	 * Note: The concept of a characters "case" only exists is some alphabets
107	 * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
108	 * not exist in the Chinese alphabet, for example. See Unicode Standard
109	 * Annex #21: Case Mappings
110	 *
111	 * @param string
112	 * 
113	 * @return mixed either string in lowercase or FALSE is UTF-8 invalid
114	 * @see http://www.php.net/strtolower
115	 */
116	public static function strtolower($str){
117		return utf8_strtolower($str);
118	}
119
120	/**
121	 * UTF-8 aware alternative to strtoupper
122	 * Make a string uppercase
123	 * Note: The concept of a characters "case" only exists is some alphabets
124	 * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
125	 * not exist in the Chinese alphabet, for example. See Unicode Standard
126	 * Annex #21: Case Mappings
127	 *
128	 * @param string
129	 * 
130	 * @return mixed either string in uppercase or FALSE is UTF-8 invalid
131	 * @see http://www.php.net/strtoupper
132	 */
133	public static function strtoupper($str){
134		return utf8_strtoupper($str);
135	}
136
137	/**
138	 * UTF-8 aware alternative to strlen
139	 * Returns the number of characters in the string (NOT THE NUMBER OF BYTES),
140	 *
141	 * @param string UTF-8 string
142	 * 
143	 * @return int number of UTF-8 characters in string
144	 * @see http://www.php.net/strlen
145	 */
146	public static function strlen($str){
147		return utf8_strlen($str);
148	}
149
150	/**
151	 * UTF-8 aware alternative to str_ireplace
152	 * Case-insensitive version of str_replace
153	 *
154	 * @param string string to search
155	 * @param string existing string to replace
156	 * @param string new string to replace with
157	 * @param int optional count value to be passed by referene
158	 * @see http://www.php.net/str_ireplace
159	 */
160	public static function str_ireplace($search, $replace, $str, $count = NULL)
161	{
162		jimport('phputf8.str_ireplace');
163		if ( $count === FALSE ) {
164			return utf8_ireplace($search, $replace, $str);
165		} else {
166			return utf8_ireplace($search, $replace, $str, $count);
167		}
168	}
169
170	/**
171	 * UTF-8 aware alternative to str_split
172	 * Convert a string to an array
173	 *
174	 * @param string UTF-8 encoded
175	 * @param int number to characters to split string by
176	 * 
177	 * @return array
178	 * @see http://www.php.net/str_split
179	*/
180	public static function str_split($str, $split_len = 1)
181	{
182		jimport('phputf8.str_split');
183		return utf8_str_split($str, $split_len);
184	}
185
186	/**
187	 * UTF-8/LOCALE aware alternative to strcasecmp
188	 * A case insensivite string comparison
189	 *
190	 * @param string string 1 to compare
191	 * @param string string 2 to compare
192	 * @param mixed The locale used by strcoll or false to use classical comparison
193	 * 
194	 * @return int < 0 if str1 is less than str2; > 0 if str1 is greater than str2, and 0 if they are equal.
195	 * @see http://www.php.net/strcasecmp
196	 * @see http://www.php.net/strcoll
197	 * @see http://www.php.net/setlocale
198	 */
199	public static function strcasecmp($str1, $str2, $locale = false)
200	{
201		if ($locale)
202		{
203			// Get current locale
204			$locale0 = setlocale(LC_COLLATE, 0);
205			if (!$locale = setlocale(LC_COLLATE, $locale)) {
206				$locale = $locale0;
207			}
208
209			// See if we have successfully set locale to UTF-8
210			if(!stristr($locale, 'UTF-8') && stristr($locale, '_') && preg_match('~\.(\d+)$~', $locale, $m)) {
211				$encoding = 'CP' . $m[1];
212			}
213			else if(stristr($locale, 'UTF-8')){
214				$encoding = 'UTF-8';
215			}
216			else {
217				$encoding = 'nonrecodable';
218			}
219
220			// if we sucesfuly set encoding it to utf-8 or encoding is sth weird don't recode
221			if ($encoding == 'UTF-8' || $encoding == 'nonrecodable') {
222				return strcoll(utf8_strtolower($str1), utf8_strtolower($str2));
223			} else {
224				return strcoll(self::transcode(utf8_strtolower($str1),'UTF-8', $encoding), self::transcode(utf8_strtolower($str2),'UTF-8', $encoding));
225			}
226		}
227		else
228		{
229			return utf8_strcasecmp($str1, $str2);
230		}
231	}
232
233	/**
234	 * UTF-8/LOCALE aware alternative to strcmp
235	 * A case sensitive string comparison
236	 *
237	 * @param string string 1 to compare
238	 * @param string string 2 to compare
239	 * @param mixed The locale used by strcoll or false to use classical comparison
240	 * 
241	 * @return int < 0 if str1 is less than str2; > 0 if str1 is greater than str2, and 0 if they are equal.
242	 * @see http://www.php.net/strcmp
243	 * @see http://www.php.net/strcoll
244	 * @see http://www.php.net/setlocale
245	 */
246	public static function strcmp($str1, $str2, $locale = false)
247	{
248		if ($locale)
249		{
250			// Get current locale
251			$locale0 = setlocale(LC_COLLATE, 0);
252			if (!$locale = setlocale(LC_COLLATE, $locale)) {
253				$locale = $locale0;
254			}
255
256			// See if we have successfully set locale to UTF-8
257			if(!stristr($locale, 'UTF-8') && stristr($locale, '_') && preg_match('~\.(\d+)$~', $locale, $m)) {
258				$encoding = 'CP' . $m[1];
259			}
260			else if(stristr($locale, 'UTF-8')){
261				$encoding = 'UTF-8';
262			}
263			else {
264				$encoding = 'nonrecodable';
265			}
266
267			// If we sucesfuly set encoding it to utf-8 or encoding is sth weird don't recode
268			if ($encoding == 'UTF-8' || $encoding == 'nonrecodable') {
269				return strcoll($str1, $str2);
270			}
271			else {
272				return strcoll(self::transcode($str1,'UTF-8', $encoding), self::transcode($str2,'UTF-8', $encoding));
273			}
274		}
275		else
276		{
277			return strcmp($str1, $str2);
278		}
279	}
280
281	/**
282	 * UTF-8 aware alternative to strcspn
283	 * Find length of initial segment not matching mask
284	 *
285	 * @param string
286	 * @param string the mask
287	 * @param int Optional starting character position (in characters)
288	 * @param int Optional length
289	 *
290	 * @return int the length of the initial segment of str1 which does not contain any of the characters in str2
291	 * @see http://www.php.net/strcspn
292	 */
293	public static function strcspn($str, $mask, $start = NULL, $length = NULL)
294	{
295		jimport('phputf8.strcspn');
296		if ( $start === FALSE && $length === FALSE ) {
297			return utf8_strcspn($str, $mask);
298		} else if ( $length === FALSE ) {
299			return utf8_strcspn($str, $mask, $start);
300		} else {
301			return utf8_strcspn($str, $mask, $start, $length);
302		}
303	}
304
305	/**
306	 * UTF-8 aware alternative to stristr
307	 * Returns all of haystack from the first occurrence of needle to the end.
308	 * needle and haystack are examined in a case-insensitive manner
309	 * Find first occurrence of a string using case insensitive comparison
310	 *
311	 * @param string the haystack
312	 * @param string the needle
313	 * 
314	 * @return string the sub string
315	 * @see http://www.php.net/stristr
316	*/
317	public static function stristr($str, $search)
318	{
319		jimport('phputf8.stristr');
320		return utf8_stristr($str, $search);
321	}
322
323	/**
324	 * UTF-8 aware alternative to strrev
325	 * Reverse a string
326	 *
327	 * @param string String to be reversed
328	 * @return string The string in reverse character order
329	 * @see http://www.php.net/strrev
330	*/
331	public static function strrev($str)
332	{
333		jimport('phputf8.strrev');
334		
335		return utf8_strrev($str);
336	}
337
338	/**
339	 * UTF-8 aware alternative to strspn
340	 * Find length of initial segment matching mask
341	 *
342	 * @param string the haystack
343	 * @param string the mask
344	 * @param int start optional
345	 * @param int length optional
346	 * 
347	 * @return int
348	 * @see http://www.php.net/strspn
349	*/
350	public static function strspn($str, $mask, $start = NULL, $length = NULL)
351	{
352		jimport('phputf8.strspn');
353		if ( $start === NULL && $length === NULL ) {
354			return utf8_strspn($str, $mask);
355		} else if ( $length === NULL ) {
356			return utf8_strspn($str, $mask, $start);
357		} else {
358			return utf8_strspn($str, $mask, $start, $length);
359		}
360	}
361
362	/**
363	 * UTF-8 aware substr_replace
364	 * Replace text within a portion of a string
365	 *
366	 * @param string the haystack
367	 * @param string the replacement string
368	 * @param int start
369	 * @param int length (optional)
370	 * 
371	 * @retufrn string
372	 * @see http://www.php.net/substr_replace
373	*/
374	public static function substr_replace($str, $repl, $start, $length = NULL)
375	{
376		// loaded by library loader
377		if ( $length === FALSE ) {
378			return utf8_substr_replace($str, $repl, $start);
379		} else {
380			return utf8_substr_replace($str, $repl, $start, $length);
381		}
382	}
383
384	/**
385	 * UTF-8 aware replacement for ltrim()
386	 * Strip whitespace (or other characters) from the beginning of a string
387	 * Note: you only need to use this if you are supplying the charlist
388	 * optional arg and it contains UTF-8 characters. Otherwise ltrim will
389	 * work normally on a UTF-8 string
390	 *
391	 * @param string the string to be trimmed
392	 * @param string the optional charlist of additional characters to trim
393	 * 
394	 * @return string the trimmed string
395	 * @see http://www.php.net/ltrim
396	*/
397	public static function ltrim($str, $charlist = FALSE)
398	{
399		if (empty($charlist) && $charlist !== false) {
400			return $str;
401		}
402
403		jimport('phputf8.trim');
404		if ( $charlist === FALSE ) {
405			return utf8_ltrim( $str );
406		} else {
407			return utf8_ltrim( $str, $charlist );
408		}
409	}
410
411	/**
412	 * UTF-8 aware replacement for rtrim()
413	 * Strip whitespace (or other characters) from the end of a string
414	 * Note: you only need to use this if you are supplying the charlist
415	 * optional arg and it contains UTF-8 characters. Otherwise rtrim will
416	 * work normally on a UTF-8 string
417	 *
418	 * @param string the string to be trimmed
419	 * @param string the optional charlist of additional characters to trim
420	 * 
421	 * @return string the trimmed string
422	 * @see http://www.php.net/rtrim
423	 */
424	public static function rtrim($str, $charlist = FALSE)
425	{
426		if (empty($charlist) && $charlist !== false) {
427			return $str;
428		}
429
430		jimport('phputf8.trim');
431		if ( $charlist === FALSE ) {
432			return utf8_rtrim($str);
433		} else {
434			return utf8_rtrim( $str, $charlist );
435		}
436	}
437
438	/**
439	 * UTF-8 aware replacement for trim()
440	 * Strip whitespace (or other characters) from the beginning and end of a string
441	 * Note: you only need to use this if you are supplying the charlist
442	 * optional arg and it contains UTF-8 characters. Otherwise trim will
443	 * work normally on a UTF-8 string
444	 *
445	 * @param string the string to be trimmed
446	 * @param string the optional charlist of additional characters to trim
447	 * 
448	 * @return string the trimmed string
449	 * @see http://www.php.net/trim
450	*/
451	public static function trim($str, $charlist = FALSE)
452	{
453		if (empty($charlist) && $charlist !== false) {
454			return $str;
455		}
456
457		jimport('phputf8.trim');
458		if ( $charlist === FALSE ) {
459			return utf8_trim( $str );
460		} else {
461			return utf8_trim( $str, $charlist );
462		}
463	}
464
465	/**
466	 * UTF-8 aware alternative to ucfirst
467	 * Make a string's first character uppercase
468	 *
469	 * @param string
470	 * @return string with first character as upper case (if applicable)
471	 * @see http://www.php.net/ucfirst
472	*/
473	public static function ucfirst($str)
474	{
475		jimport('phputf8.ucfirst');
476		return utf8_ucfirst($str);
477	}
478
479	/**
480	 * UTF-8 aware alternative to ucwords
481	 * Uppercase the first character of each word in a string
482	 *
483	 * @param string
484	 * @return string with first char of each word uppercase
485	 * @see http://www.php.net/ucwords
486	*/
487	public static function ucwords($str)
488	{
489		jimport('phputf8.ucwords');
490		return utf8_ucwords($str);
491	}
492
493	/**
494	 * Transcode a string.
495	 *
496	 * @param string $source The string to transcode.
497	 * @param string $from_encoding The source encoding.
498	 * @param string $to_encoding The target encoding.
499	 * 
500	 * @return string Transcoded string
501	 * @since   11.1
502	 */
503	public static function transcode($source, $from_encoding, $to_encoding)
504	{
505		if (is_string($source)) {
506			/*
507			 * "//TRANSLIT" is appended to the $to_encoding to ensure that when iconv comes
508			 * across a character that cannot be represented in the target charset, it can
509			 * be approximated through one or several similarly looking characters.
510			 */
511			return iconv($from_encoding, $to_encoding.'//TRANSLIT', $source);
512		}
513	}
514
515	/**
516	 * Tests a string as to whether it's valid UTF-8 and supported by the
517	 * Unicode standard
518	 * Note: this function has been modified to simple return true or false
519	 * @author <hsivonen@iki.fi>
520	 * @param string UTF-8 encoded string
521	 * 
522	 * @return boolean true if valid
523	 * @since 11.1
524	 * @see http://hsivonen.iki.fi/php-utf8/
525	 * @see compliant
526	 */
527	public static function valid($str)
528	{
529		$mState = 0;	// cached expected number of octets after the current octet
530						// until the beginning of the next UTF8 character sequence
531		$mUcs4  = 0;	// cached Unicode character
532		$mBytes = 1;	// cached expected number of octets in the current sequence
533
534		$len = strlen($str);
535
536		for ($i = 0; $i < $len; $i++)
537		{
538			$in = ord($str{$i});
539
540			if ($mState == 0)
541			{
542				// When mState is zero we expect either a US-ASCII character or a
543				// multi-octet sequence.
544				if (0 == (0x80 & ($in))) {
545					// US-ASCII, pass straight through.
546					$mBytes = 1;
547				} else if (0xC0 == (0xE0 & ($in))) {
548					// First octet of 2 octet sequence
549					$mUcs4 = ($in);
550					$mUcs4 = ($mUcs4 & 0x1F) << 6;
551					$mState = 1;
552					$mBytes = 2;
553				} else if (0xE0 == (0xF0 & ($in))) {
554					// First octet of 3 octet sequence
555					$mUcs4 = ($in);
556					$mUcs4 = ($mUcs4 & 0x0F) << 12;
557					$mState = 2;
558					$mBytes = 3;
559				} else if (0xF0 == (0xF8 & ($in))) {
560					// First octet of 4 octet sequence
561					$mUcs4 = ($in);
562					$mUcs4 = ($mUcs4 & 0x07) << 18;
563					$mState = 3;
564					$mBytes = 4;
565				} else if (0xF8 == (0xFC & ($in))) {
566					/* First octet of 5 octet sequence.
567					 *
568					 * This is illegal because the encoded codepoint must be either
569					 * (a) not the shortest form or
570					 * (b) outside the Unicode range of 0-0x10FFFF.
571					 * Rather than trying to resynchronize, we will carry on until the end
572					 * of the sequence and let the later error handling code catch it.
573					 */
574					$mUcs4 = ($in);
575					$mUcs4 = ($mUcs4 & 0x03) << 24;
576					$mState = 4;
577					$mBytes = 5;
578				} else if (0xFC == (0xFE & ($in))) {
579					// First octet of 6 octet sequence, see comments for 5 octet sequence.
580					$mUcs4 = ($in);
581					$mUcs4 = ($mUcs4 & 1) << 30;
582					$mState = 5;
583					$mBytes = 6;
584
585				} else {
586					/* Current octet is neither in the US-ASCII range nor a legal first
587					 * octet of a multi-octet sequence.
588					 */
589					return FALSE;
590				}
591			}
592			else
593			{
594				// When mState is non-zero, we expect a continuation of the multi-octet
595				// sequence
596				if (0x80 == (0xC0 & ($in)))
597				{
598					// Legal continuation.
599					$shift = ($mState - 1) * 6;
600					$tmp = $in;
601					$tmp = ($tmp & 0x0000003F) << $shift;
602					$mUcs4 |= $tmp;
603
604					/**
605					 * End of the multi-octet sequence. mUcs4 now contains the final
606					 * Unicode codepoint to be output
607					 */
608					if (0 == --$mState)
609					{
610						/*
611						 * Check for illegal sequences and codepoints.
612						 */
613						// From Unicode 3.1, non-shortest form is illegal
614						if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
615							((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
616							((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
617							(4 < $mBytes) ||
618							// From Unicode 3.2, surrogate characters are illegal
619							(($mUcs4 & 0xFFFFF800) == 0xD800) ||
620							// Codepoints outside the Unicode range are illegal
621							($mUcs4 > 0x10FFFF)) {
622								return FALSE;
623							}
624
625						// Initialize UTF8 cache.
626						$mState = 0;
627						$mUcs4  = 0;
628						$mBytes = 1;
629					}
630				}
631				else
632				{
633					/**
634					 *((0xC0 & (*in) != 0x80) && (mState != 0))
635					 * Incomplete multi-octet sequence.
636					 */
637					return FALSE;
638				}
639			}
640		}
641		return TRUE;
642	}
643
644	/**
645	 * Tests whether a string complies as UTF-8. This will be much
646	 * faster than utf8_is_valid but will pass five and six octet
647	 * UTF-8 sequences, which are not supported by Unicode and
648	 * so cannot be displayed correctly in a browser. In other words
649	 * it is not as strict as utf8_is_valid but it's faster. If you use
650	 * it to validate user input, you place yourself at the risk that
651	 * attackers will be able to inject 5 and 6 byte sequences (which
652	 * may or may not be a significant risk, depending on what you are
653	 * are doing)
654	 * @see valid
655	 * @see http://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805
656	 * @param string UTF-8 string to check
657	 * 
658	 * @return boolean TRUE if string is valid UTF-8
659	 * @since 11.1
660	 */
661	public static function compliant($str)
662	{
663		if (strlen($str) == 0) {
664			return TRUE;
665		}
666		// If even just the first character can be matched, when the /u
667		// modifier is used, then it's valid UTF-8. If the UTF-8 is somehow
668		// invalid, nothing at all will match, even if the string contains
669		// some valid sequences
670		return (preg_match('/^.{1}/us',$str,$ar) == 1);
671	}
672
673	/**
674	 * Does a UTF-8 safe version of PHP parse_url function
675	 * @see http://us3.php.net/manual/en/function.parse-url.php
676	 *
677	 * @param string URL to parse
678	 * 
679	 * @return associative array or false if badly formed URL.
680	 * @since 11.1
681	 */
682	public static function parse_url($url) {
683		$result = array();
684		// Build arrays of values we need to decode before parsing
685		$entities = array('%21', '%2A', '%27', '%28', '%29', '%3B', '%3A', '%40', '%26', '%3D', '%24', '%2C', '%2F', '%3F', '%25', '%23', '%5B', '%5D');
686		$replacements = array('!', '*', "'", "(", ")", ";", ":", "@", "&", "=", "$", ",", "/", "?", "%", "#", "[", "]");
687		// Create encoded URL with special URL characters decoded so it can be parsed
688		// All other charcters will be encoded
689		$encodedURL = str_replace($entities, $replacements, urlencode($url));
690		// Parse the encoded URL
691		$encodedParts = parse_url($encodedURL);
692		// Now, decode each value of the resulting array
693		foreach ($encodedParts as $key => $value) {
694			$result[$key] = urldecode($value);
695		}
696		return $result;
697	}
698}