utf8.php | searchcode

/utf8.php

https://github.com/Xeoncross/php_utf8
PHP | 441 lines | 233 code | 47 blank | 161 comment | 21 complexity | 8ff72f4c373664364f1e170e34624691 MD5 | raw file

<?php
/**
 * php_utf8
 *
 * A simple collection of functions to provide a standardized framework for
 * working with multibyte strings (like UTF-8) in a variety of server
 * environments. Requires either mbstring or iconv to work!
 *
 * @author David Pennington <xeoncross.com>
 * @link http://sourceforge.net/projects/phputf8/
 * @link http://github.com/Xeoncross/php_utf8
 * @license http://opensource.org/licenses/mit-license.php MIT License
 */

// Is PCRE compiled with UTF-8 support? Please say YES!!!!
define('PCRE_SUPPORTS_UTF8', preg_match('/^.{1}$/u',"Ã±", array()));

// Default to English UTF-8
setlocale(LC_ALL, 'en_US.UTF8');

if(extension_loaded('mbstring'))
{
	if (ini_get('mbstring.func_overload') & MB_OVERLOAD_STRING)
	{
		trigger_error
		(
			'The <a href="http://php.net/mbstring">mbstring</a> extension is overloading '.
			'PHP\'s native string functions. Disable this by setting mbstring.func_overload '.
			'to 0, 1, 4 or 5 in php.ini or a .htaccess file.',
			E_USER_ERROR
		);
	}

	// Set internal character encoding to UTF-8
	mb_internal_encoding("UTF-8");

}
elseif (extension_loaded('iconv'))
{
	// Set internal character encoding to UTF-8
	iconv_set_encoding("internal_encoding", "UTF-8");
}
else
{
	trigger_error
	(
		'Neither the <a href="http://php.net/iconv">iconv</a> nor <a href="http://'.
		'php.net/mbstring">mbstring</a> PHP extensions are loaded. Without one of '.
		'these, UTF-8 strings cannot be properly handled.',
		E_USER_ERROR
	);
}


// Enable basic multibyte string support if mbstring is not installed!
if( ! extension_loaded('mbstring'))
{
	/**
	 * Unicode aware replacement for strlen(). Returns the number of characters
	 * in the string (not the number of bytes), replacing multibyte characters
	 * with a single byte equivalent utf8_decode() converts characters that are
	 * not in ISO-8859-1 to '?', which, for the purpose of counting, is alright
	 * - It's much faster than iconv_strlen.
	 *
	 * Note: this function does not count bad UTF-8 bytes in the string
	 *
	 * @author <chernyshevsky at hotmail dot com>
	 * @param string $string a valid UTF-8 string
	 * @return int
	 */
	function mb_strlen($string)
	{
		return strlen(utf8_decode($string));
	}


	/**
	 * UTF-8 aware alternative to substr
	 * Return part of a string given character offset (and optionally length)
	 *
	 * @param string $string to parse
	 * @param int $start the starting offset
	 * @param int $length of part to return
	 * @param string $encoding defaults to UTF-8
	 * @return string
	 */
	function mb_substr($string, $start, $length, $encoding = NULL)
	{
		return iconv_substr($string, $start, $length);
	}


	/**
	 * UTF-8 aware alternative to strpos
	 * Find position of first occurrence of a string
	 *
	 * @param string $haystack to search
	 * @param string $needle substring to look for
	 * @param int $offset to start from
	 * @param string $encoding defaults to UTF-8
	 * @return int
	 */
	function mb_strpos($haystack, $needle, $offset = 0, $encoding = NULL)
	{
		return iconv_strpos($haystack, $needel, $offset);
	}


	/**
	 * UTF-8 aware alternative to strrpos
	 * Finds the last occurrence of a needle within a haystack
	 *
	 * @param string $haystack to search
	 * @param string $needle substring to look for
	 * @param string $encoding defaults to UTF-8
	 * @return int
	 */
	function mb_strrpos($haystack, $needle, $encoding = NULL)
	{
		return iconv_strrpos($haystack, $needle);
	}


	/**
	 * Convert a UTF-8 string to lowercase
	 *
	 * @param string $string to convert
	 * @param string $encoding defaults to UTF-8
	 * @return string
	 */
	function mb_strtolower($string, $encoding)
	{
		return $string;
	}


	/**
	 * Convert a UTF-8 string to uppercase
	 *
	 * @param string $string to convert
	 * @param string $encoding defaults to UTF-8
	 * @return string
	 */
	function mb_strtoupper($string, $encoding)
	{
		return $string;
	}
}


/**
 * UTF-8 aware alternative to str_split to convert a string to an array
 *
 * @param string $string to split
 * @param int $split_len of characters to split string by
 * @return string
 */
function mb_str_split($string, $split_len = 1)
{
	if (mb_strlen($string) <= $split_len)
		return array($string);

	preg_match_all('/.{'.$split_len.'}|[^\x00]{1,'.$split_len.'}$/us', $string, $array);
		return $array[0];
}


/**
 * UTF-8 aware substr_replace.
 *
 * @param string $string to process
 * @param string $replacement text
 * @param int $start offset
 * @param int $length to replace
 * @return string
 */
function mb_substr_replace($string, $replacement, $start, $length = NULL )
{
	return mb_substr($str, 0, $start) . $replacement . mb_substr($str, $length + 1);
}


/**
 * UTF-8 aware alternative to strrev
 * Reverse a string
 *
 * @param string $string to reverse
 * @return string
 */
function mb_strrev($string)
{
	preg_match_all('/./us', $string, $ar);
	return join('',array_reverse($ar[0]));
}


/**
 * Tests whether a string contains only 7bit ASCII bytes.
 *
 * @param string $string to check
 * @return bool
 */
function is_ascii($string)
{
	return ! preg_match('/[^\x00-\x7F]/S', $string);
}


/**
 * Checks to see if a string is utf8 encoded.
 *
 * NOTE: This function checks for 5-Byte sequences, UTF8
 *       has Bytes Sequences with a maximum length of 4.
 *
 * @author bmorel at ssi dot fr (modified)
 * @param string $str The string to be checked
 * @return bool
 */
function seems_utf8($str)
{
	$length = strlen($str);
	for ($i=0; $i < $length; $i++) {
		$c = ord($str[$i]);
		if ($c < 0x80) $n = 0; # 0bbbbbbb
		elseif (($c & 0xE0) == 0xC0) $n=1; # 110bbbbb
		elseif (($c & 0xF0) == 0xE0) $n=2; # 1110bbbb
		elseif (($c & 0xF8) == 0xF0) $n=3; # 11110bbb
		elseif (($c & 0xFC) == 0xF8) $n=4; # 111110bb
		elseif (($c & 0xFE) == 0xFC) $n=5; # 1111110b
		else return false; # Does not match any model
		for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
			if ((++$i == $length) || ((ord($str[$i]) & 0xC0) != 0x80))
			return false;
		}
	}
	return true;
}


/**
 * Converts most Latin accent characters to ASCII characters. If there are no
 * accent characters, then the string given is returned unchanged.
 *
 * @author wordpress.org
 * @param string $string that might have accent characters
 * @return string
 */
function remove_accents($string)
{
	// We only need to translate from U+0080 to U+00FF
	if ( ! preg_match('/[\x80-\xff]/', $string))
		return $string;

	$chars = array(
	// Decompositions for Latin-1 Supplement
	chr(195).chr(128) => 'A', chr(195).chr(129) => 'A',
	chr(195).chr(130) => 'A', chr(195).chr(131) => 'A',
	chr(195).chr(132) => 'A', chr(195).chr(133) => 'A',
	chr(195).chr(135) => 'C', chr(195).chr(136) => 'E',
	chr(195).chr(137) => 'E', chr(195).chr(138) => 'E',
	chr(195).chr(139) => 'E', chr(195).chr(140) => 'I',
	chr(195).chr(141) => 'I', chr(195).chr(142) => 'I',
	chr(195).chr(143) => 'I', chr(195).chr(145) => 'N',
	chr(195).chr(146) => 'O', chr(195).chr(147) => 'O',
	chr(195).chr(148) => 'O', chr(195).chr(149) => 'O',
	chr(195).chr(150) => 'O', chr(195).chr(153) => 'U',
	chr(195).chr(154) => 'U', chr(195).chr(155) => 'U',
	chr(195).chr(156) => 'U', chr(195).chr(157) => 'Y',
	chr(195).chr(159) => 's', chr(195).chr(160) => 'a',
	chr(195).chr(161) => 'a', chr(195).chr(162) => 'a',
	chr(195).chr(163) => 'a', chr(195).chr(164) => 'a',
	chr(195).chr(165) => 'a', chr(195).chr(167) => 'c',
	chr(195).chr(168) => 'e', chr(195).chr(169) => 'e',
	chr(195).chr(170) => 'e', chr(195).chr(171) => 'e',
	chr(195).chr(172) => 'i', chr(195).chr(173) => 'i',
	chr(195).chr(174) => 'i', chr(195).chr(175) => 'i',
	chr(195).chr(177) => 'n', chr(195).chr(178) => 'o',
	chr(195).chr(179) => 'o', chr(195).chr(180) => 'o',
	chr(195).chr(181) => 'o', chr(195).chr(182) => 'o',
	chr(195).chr(182) => 'o', chr(195).chr(185) => 'u',
	chr(195).chr(186) => 'u', chr(195).chr(187) => 'u',
	chr(195).chr(188) => 'u', chr(195).chr(189) => 'y',
	chr(195).chr(191) => 'y',
	// Decompositions for Latin Extended-A
	chr(196).chr(128) => 'A', chr(196).chr(129) => 'a',
	chr(196).chr(130) => 'A', chr(196).chr(131) => 'a',
	chr(196).chr(132) => 'A', chr(196).chr(133) => 'a',
	chr(196).chr(134) => 'C', chr(196).chr(135) => 'c',
	chr(196).chr(136) => 'C', chr(196).chr(137) => 'c',
	chr(196).chr(138) => 'C', chr(196).chr(139) => 'c',
	chr(196).chr(140) => 'C', chr(196).chr(141) => 'c',
	chr(196).chr(142) => 'D', chr(196).chr(143) => 'd',
	chr(196).chr(144) => 'D', chr(196).chr(145) => 'd',
	chr(196).chr(146) => 'E', chr(196).chr(147) => 'e',
	chr(196).chr(148) => 'E', chr(196).chr(149) => 'e',
	chr(196).chr(150) => 'E', chr(196).chr(151) => 'e',
	chr(196).chr(152) => 'E', chr(196).chr(153) => 'e',
	chr(196).chr(154) => 'E', chr(196).chr(155) => 'e',
	chr(196).chr(156) => 'G', chr(196).chr(157) => 'g',
	chr(196).chr(158) => 'G', chr(196).chr(159) => 'g',
	chr(196).chr(160) => 'G', chr(196).chr(161) => 'g',
	chr(196).chr(162) => 'G', chr(196).chr(163) => 'g',
	chr(196).chr(164) => 'H', chr(196).chr(165) => 'h',
	chr(196).chr(166) => 'H', chr(196).chr(167) => 'h',
	chr(196).chr(168) => 'I', chr(196).chr(169) => 'i',
	chr(196).chr(170) => 'I', chr(196).chr(171) => 'i',
	chr(196).chr(172) => 'I', chr(196).chr(173) => 'i',
	chr(196).chr(174) => 'I', chr(196).chr(175) => 'i',
	chr(196).chr(176) => 'I', chr(196).chr(177) => 'i',
	chr(196).chr(178) => 'IJ',chr(196).chr(179) => 'ij',
	chr(196).chr(180) => 'J', chr(196).chr(181) => 'j',
	chr(196).chr(182) => 'K', chr(196).chr(183) => 'k',
	chr(196).chr(184) => 'k', chr(196).chr(185) => 'L',
	chr(196).chr(186) => 'l', chr(196).chr(187) => 'L',
	chr(196).chr(188) => 'l', chr(196).chr(189) => 'L',
	chr(196).chr(190) => 'l', chr(196).chr(191) => 'L',
	chr(197).chr(128) => 'l', chr(197).chr(129) => 'L',
	chr(197).chr(130) => 'l', chr(197).chr(131) => 'N',
	chr(197).chr(132) => 'n', chr(197).chr(133) => 'N',
	chr(197).chr(134) => 'n', chr(197).chr(135) => 'N',
	chr(197).chr(136) => 'n', chr(197).chr(137) => 'N',
	chr(197).chr(138) => 'n', chr(197).chr(139) => 'N',
	chr(197).chr(140) => 'O', chr(197).chr(141) => 'o',
	chr(197).chr(142) => 'O', chr(197).chr(143) => 'o',
	chr(197).chr(144) => 'O', chr(197).chr(145) => 'o',
	chr(197).chr(146) => 'OE',chr(197).chr(147) => 'oe',
	chr(197).chr(148) => 'R',chr(197).chr(149) => 'r',
	chr(197).chr(150) => 'R',chr(197).chr(151) => 'r',
	chr(197).chr(152) => 'R',chr(197).chr(153) => 'r',
	chr(197).chr(154) => 'S',chr(197).chr(155) => 's',
	chr(197).chr(156) => 'S',chr(197).chr(157) => 's',
	chr(197).chr(158) => 'S',chr(197).chr(159) => 's',
	chr(197).chr(160) => 'S', chr(197).chr(161) => 's',
	chr(197).chr(162) => 'T', chr(197).chr(163) => 't',
	chr(197).chr(164) => 'T', chr(197).chr(165) => 't',
	chr(197).chr(166) => 'T', chr(197).chr(167) => 't',
	chr(197).chr(168) => 'U', chr(197).chr(169) => 'u',
	chr(197).chr(170) => 'U', chr(197).chr(171) => 'u',
	chr(197).chr(172) => 'U', chr(197).chr(173) => 'u',
	chr(197).chr(174) => 'U', chr(197).chr(175) => 'u',
	chr(197).chr(176) => 'U', chr(197).chr(177) => 'u',
	chr(197).chr(178) => 'U', chr(197).chr(179) => 'u',
	chr(197).chr(180) => 'W', chr(197).chr(181) => 'w',
	chr(197).chr(182) => 'Y', chr(197).chr(183) => 'y',
	chr(197).chr(184) => 'Y', chr(197).chr(185) => 'Z',
	chr(197).chr(186) => 'z', chr(197).chr(187) => 'Z',
	chr(197).chr(188) => 'z', chr(197).chr(189) => 'Z',
	chr(197).chr(190) => 'z', chr(197).chr(191) => 's',
	// Euro Sign
	chr(226).chr(130).chr(172) => 'E',
	// GBP (Pound) Sign
	chr(194).chr(163) => ''
	);

	return strtr($string, $chars);
}


/**
 * Filter a valid UTF-8 string so that it contains only words, numbers,
 * dashes, underscores, periods, and spaces - all of which are safe
 * characters to use in file names, URI, XML, JSON, and (X)HTML.
 *
 * @param string $string to clean
 * @param bool $remove_spaces if set to TRUE
 * @return string
 */
function sanitize($string, $remove_spaces = FALSE)
{
	// Only allow words (letters or numbers) and a couple other characters
	$string = preg_replace('/[^\w\-\. ]+/u', ' ', $string);

	// Remove doubles of all non-word characters
	$string = preg_replace(array('/\s\s+/', '/\.\.+/', '/--+/', '/__+/'), array(' ', '.', '-', '_'), $string);

	// Remove spaces?
	if($remove_spaces)
	{
		$string = preg_replace('/--+/', '-', str_replace(' ', '-', $string));
	}

	// Remove starting/ending symbols
	return trim($string, '-._ ');
}


/**
 * Create a SEO friendly URL string from a valid UTF-8 string
 *
 * @param string $string to filter
 * @return string
 */
function sanitize_url($string)
{
	return urlencode(remove_accents(mb_strtolower(sanitize($string, TRUE))));
}


/**
 * Filter a valid UTF-8 string to be file name safe.
 *
 * @param string $string to filter
 * @return string
 */
function sanitize_filename($string)
{
	return sanitize($string, TRUE);
}


/**
 * Convert a string from one encoding to another encoding (Defaults to UTF-8)
 *
 * @param string $string to convert
 * @param string $to_encoding you want the string in
 * @param string $from_encoding that string is in
 * @return string
 */
function encode($string, $to_encoding = 'UTF-8', $from_encoding = 'UTF-8')
{
	// ASCII-7 is valid UTF-8 already
	if ($to_encoding === 'UTF-8' AND is_ascii($string))
		return $string;

	if(function_exists('iconv'))
	{
		// Disable notices
		$ER = error_reporting(~E_NOTICE);

		$string = iconv($from_encoding, $to_encoding.'//TRANSLIT', $string);

		// Turn notices back on
		error_reporting($ER);

		return $string;
	}
	else
	{
		return mb_convert_encoding($string, $to_encoding, mb_detect_encoding($string, "auto", TRUE));
	}
}