/bitrix/modules/main/lib/text/encoding.php
PHP | 528 lines | 424 code | 57 blank | 47 comment | 108 complexity | 6484e30334b7d926a9a36f9c8f96ddfc MD5 | raw file
Possible License(s): Apache-2.0
- <?php
- namespace Bitrix\Main\Text;
- use Bitrix\Main\Loader;
- use Bitrix\Main\Application;
- use Bitrix\Main\Config\Configuration;
- use Bitrix\Main\ErrorCollection;
- use Bitrix\Main\Error;
/**
 * Character set conversion helpers.
 *
 * A string is converted with the first backend that yields a non-empty string:
 * mbstring, then iconv/libiconv, then the plain-text conversion tables stored
 * under PATH_TO_CONVERT_TABLES.
 */
class Encoding
{
	const PATH_TO_CONVERT_TABLES = "/bitrix/modules/main/cvtables/";

	/** @var ErrorCollection Errors reported by the converters of this instance. */
	protected $errors;

	/**
	 * Instances are only created internally (see convertEncoding()) to collect
	 * the errors of a single string conversion.
	 */
	protected function __construct()
	{
		$this->errors = new ErrorCollection();
	}

	/**
	 * Converts data from a source encoding to a target encoding.
	 *
	 * Arrays and \SplFixedArray instances are converted recursively, keys included.
	 * Values that are neither strings nor arrays are returned untouched.
	 *
	 * @param string|array|\SplFixedArray $data The data to convert. From main 16.0.10 data can be an array.
	 * @param string $charsetFrom The source encoding.
	 * @param string $charsetTo The target encoding.
	 * @param string $errorMessage Reference to a variable that collected error messages are appended to.
	 * @return string|array|\SplFixedArray|bool Returns converted data or false on error.
	 */
	public static function convertEncoding($data, $charsetFrom, $charsetTo, &$errorMessage = "")
	{
		if (strcasecmp($charsetFrom, $charsetTo) == 0)
		{
			// Same charset (case-insensitive): nothing to convert.
			return $data;
		}

		if (is_array($data) || $data instanceof \SplFixedArray)
		{
			foreach ($data as $key => $value)
			{
				$newKey = self::convertEncoding($key, $charsetFrom, $charsetTo, $errorMessage);
				$newValue = self::convertEncoding($value, $charsetFrom, $charsetTo, $errorMessage);

				$data[$newKey] = $newValue;
				if ($newKey !== $key)
				{
					// The key itself was re-encoded: drop the entry stored under the old key.
					unset($data[$key]);
				}
			}

			return $data;
		}

		if (!is_string($data))
		{
			return $data;
		}

		if ($data == '')
		{
			return '';
		}

		$cvt = new static;

		// Try the backends in order; an empty or non-string result means "not converted".
		$res = $cvt->convertByMbstring($data, $charsetFrom, $charsetTo);
		if (!is_string($res) || $res === '')
		{
			$res = $cvt->convertByIconv($data, $charsetFrom, $charsetTo);
			if (!is_string($res) || $res === '')
			{
				$res = $cvt->convertByTables($data, $charsetFrom, $charsetTo);
			}
		}

		$errors = $cvt->getErrors();
		if (!empty($errors))
		{
			$errorMessage .= implode("\n", $errors);
		}

		return $res;
	}

	/**
	 * Alias of convertEncoding(), kept for backward compatibility.
	 *
	 * @deprecated Deprecated in main 16.0.10. Use Encoding::convertEncoding().
	 * @param string|array|\SplFixedArray $data
	 * @param string $charsetFrom
	 * @param string $charsetTo
	 * @param string $errorMessage
	 * @return mixed
	 */
	public static function convertEncodingArray($data, $charsetFrom, $charsetTo, &$errorMessage = "")
	{
		return self::convertEncoding($data, $charsetFrom, $charsetTo, $errorMessage);
	}

	/**
	 * Converts a string to or from UTF-8 when its detected encoding (see detectUtf8())
	 * does not match Application::isUtfMode().
	 *
	 * The non-UTF charset is taken from the current culture when a UTF-8 string has to
	 * be converted in non-UTF mode, then from the "default_charset" configuration value,
	 * and finally falls back to "Windows-1251".
	 *
	 * @param string $string
	 * @return bool|string The (possibly converted) string; false when conversion failed.
	 */
	public static function convertEncodingToCurrent($string)
	{
		$isUtf8String = self::detectUtf8($string);
		$isUtf8Config = Application::isUtfMode();

		$currentCharset = null;
		if (!$isUtf8Config && $isUtf8String)
		{
			$context = Application::getInstance()->getContext();
			if ($context != null)
			{
				$culture = $context->getCulture();
				if ($culture != null)
				{
					$currentCharset = $culture->getCharset();
				}
			}
		}

		if ($currentCharset == null)
		{
			$currentCharset = Configuration::getValue("default_charset");
		}

		if ($currentCharset == null)
		{
			$currentCharset = "Windows-1251";
		}

		// Both stay empty when the string already matches the configured mode.
		$fromCp = "";
		$toCp = "";
		if ($isUtf8Config && !$isUtf8String)
		{
			$fromCp = $currentCharset;
			$toCp = "UTF-8";
		}
		elseif (!$isUtf8Config && $isUtf8String)
		{
			$fromCp = "UTF-8";
			$toCp = $currentCharset;
		}

		if ($fromCp !== $toCp)
		{
			$string = self::convertEncoding($string, $fromCp, $toCp);
		}

		return $string;
	}

	/**
	 * Heuristically checks whether a string looks like UTF-8.
	 *
	 * When the string contains %XX escapes, only the bytes encoded by those escapes
	 * are examined. Every byte pair is then scored by its two highest bits:
	 * a 10xxxxxx byte right after a 11xxxxxx byte adds one point, a 10xxxxxx byte
	 * right after a 0xxxxxxx byte or a 11xxxxxx byte that is not followed by a
	 * 10xxxxxx byte subtracts one point.
	 *
	 * @param string $string
	 * @return bool True when the score is positive.
	 */
	public static function detectUtf8($string)
	{
		//http://mail.nl.linux.org/linux-utf8/1999-09/msg00110.html
		if (preg_match_all("/(?:%)([0-9A-F]{2})/i", $string, $match))
		{
			$string = pack("H*", strtr(implode('', $match[1]), 'abcdef', 'ABCDEF'));
		}

		//valid UTF-8 octet sequences
		//0xxxxxxx
		//110xxxxx 10xxxxxx
		//1110xxxx 10xxxxxx 10xxxxxx
		//11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		$prevBits8and7 = 0;
		$isUtf = 0;
		foreach (unpack("C*", $string) as $byte)
		{
			$hiBits8and7 = $byte & 0xC0;
			if ($hiBits8and7 == 0x80)
			{
				if ($prevBits8and7 == 0xC0)
				{
					$isUtf++;
				}
				elseif (($prevBits8and7 & 0x80) == 0x00)
				{
					$isUtf--;
				}
			}
			elseif ($prevBits8and7 == 0xC0)
			{
				$isUtf--;
			}
			$prevBits8and7 = $hiBits8and7;
		}

		return ($isUtf > 0);
	}

	/**
	 * Converts a string with the mbstring extension.
	 *
	 * @param string $data
	 * @param string $charsetFrom
	 * @param string $charsetTo
	 * @return string|bool Converted string; an empty string when mbstring is not loaded
	 * or does not know one of the encodings.
	 */
	protected function convertByMbstring($data, $charsetFrom, $charsetTo)
	{
		if (!extension_loaded("mbstring"))
		{
			return '';
		}

		try
		{
			// Before PHP 8 mb_encoding_aliases() emits an E_WARNING and returns false
			// for an unknown encoding; since PHP 8 it throws \ValueError, which "@"
			// cannot silence. Both cases must fall through to the next backend.
			$supported = @mb_encoding_aliases($charsetFrom) !== false
				&& @mb_encoding_aliases($charsetTo) !== false;
		}
		catch (\ValueError $e)
		{
			$supported = false;
		}

		if (!$supported)
		{
			return '';
		}

		if (strtoupper($charsetFrom) != "UTF-16")
		{
			return mb_convert_encoding($data, $charsetTo, $charsetFrom);
		}

		// For UTF-16 the byte order has to be detected: mbstring defaults to
		// big endian, little endian has to be requested explicitly.
		$bom = substr($data, 0, 2);
		if ($bom === "\xFF\xFE")
		{
			// Little endian BOM: cut it off and name the byte order explicitly.
			return mb_convert_encoding(substr($data, 2), $charsetTo, "UTF-16LE");
		}

		if ($bom === "\xFE\xFF")
		{
			// Big endian BOM: just remove it.
			return mb_convert_encoding(substr($data, 2), $charsetTo, $charsetFrom);
		}

		// No BOM: assume little endian.
		return mb_convert_encoding($data, $charsetTo, "UTF-16LE");
	}

	/**
	 * Converts a string with iconv() or, when it is unavailable, libiconv().
	 *
	 * Skipped entirely when the "disable_iconv" configuration value is true.
	 *
	 * @param string $data
	 * @param string $charsetFrom
	 * @param string $charsetTo
	 * @return string|bool Converted string, false on failure, an empty string when no
	 * iconv implementation was used.
	 */
	protected function convertByIconv($data, $charsetFrom, $charsetTo)
	{
		$res = '';

		if (Configuration::getValue("disable_iconv") === true)
		{
			return $res;
		}

		$input = $data;
		if (strtoupper($charsetFrom) == "UTF-16")
		{
			// Only data without any BOM gets a little endian BOM prepended;
			// data that already carries one is passed through unchanged.
			$bom = substr($data, 0, 2);
			if ($bom !== "\xFF\xFE" && $bom !== "\xFE\xFF")
			{
				$input = "\xFF\xFE".$data;
			}
		}

		if (function_exists('iconv'))
		{
			$res = iconv($charsetFrom, $charsetTo."//IGNORE", $input);
			// Compare explicitly: a converted "0" is a valid result, not a failure.
			if ($res === false || $res === '')
			{
				$this->errors[] = new Error("Iconv reported failure while converting string to requested character encoding.");
			}
		}
		elseif (function_exists('libiconv'))
		{
			$res = libiconv($charsetFrom, $charsetTo, $input);
			if ($res === false || $res === '')
			{
				$this->errors[] = new Error("Libiconv reported failure while converting string to requested character encoding.");
			}
		}

		return $res;
	}

	/**
	 * Loads the conversion tables named by the arguments (one file name per argument)
	 * into a per-request static cache.
	 *
	 * Every non-comment line is split on whitespace/commas; the second token (without
	 * "0x", upper-cased) becomes the key and the first token the value.
	 *
	 * @return array|bool The whole table cache indexed by file name, or false when a
	 * table file is missing, is not a file or cannot be opened.
	 */
	protected function buildConvertTable()
	{
		static $cvTables = array();

		for ($i = 0, $cnt = func_num_args(); $i < $cnt; $i++)
		{
			$fileName = func_get_arg($i);
			if (isset($cvTables[$fileName]))
			{
				continue;
			}

			$pathToTable = Loader::getDocumentRoot().self::PATH_TO_CONVERT_TABLES.$fileName;
			if (!file_exists($pathToTable))
			{
				$this->errors[] = new Error(str_replace("#FILE#", $pathToTable, "File #FILE# is not found."));
				return false;
			}

			if (!is_file($pathToTable))
			{
				$this->errors[] = new Error(str_replace("#FILE#", $pathToTable, "File #FILE# is not a file."));
				return false;
			}

			if (!($hFile = fopen($pathToTable, "r")))
			{
				$this->errors[] = new Error(str_replace("#FILE#", $pathToTable, "Can not open file #FILE# for reading."));
				return false;
			}

			$cvTables[$fileName] = array();
			while (!feof($hFile))
			{
				$line = trim(fgets($hFile, 1024));
				if (!$line || substr($line, 0, 1) == "#")
				{
					// Blank line or comment.
					continue;
				}

				$hexValue = preg_split("/[\\s,]+/", $line, 3);
				if (!isset($hexValue[1]) || substr($hexValue[1], 0, 1) == "#")
				{
					// No second column (or it is already the trailing comment).
					continue;
				}

				$key = strtoupper(str_replace("0x", "", $hexValue[1]));
				$value = strtoupper(str_replace("0x", "", $hexValue[0]));
				$cvTables[$fileName][$key] = $value;
			}
			fclose($hFile);
		}

		return $cvTables;
	}

	/**
	 * Encodes a code point given as a hexadecimal string into UTF-8 bytes.
	 *
	 * @param string $utfCharInHex Hexadecimal code point, e.g. "041F".
	 * @return string One to four bytes; an empty string for values of 0x200000 and above.
	 */
	protected function hexToUtf($utfCharInHex)
	{
		$result = "";
		$utfCharInDec = hexdec($utfCharInHex);

		if ($utfCharInDec < 128)
		{
			$result .= chr($utfCharInDec);
		}
		elseif ($utfCharInDec < 2048)
		{
			$result .= chr(($utfCharInDec >> 6) + 192)
				.chr(($utfCharInDec & 63) + 128);
		}
		elseif ($utfCharInDec < 65536)
		{
			$result .= chr(($utfCharInDec >> 12) + 224)
				.chr((($utfCharInDec >> 6) & 63) + 128)
				.chr(($utfCharInDec & 63) + 128);
		}
		elseif ($utfCharInDec < 2097152)
		{
			// "+" binds tighter than ">>" and "&", so every term needs its own parentheses.
			$result .= chr(($utfCharInDec >> 18) + 240)
				.chr((($utfCharInDec >> 12) & 63) + 128)
				.chr((($utfCharInDec >> 6) & 63) + 128)
				.chr(($utfCharInDec & 63) + 128);
		}

		return $result;
	}

	/**
	 * Converts a string with the conversion tables loaded by buildConvertTable().
	 *
	 * Handles four cases: "ucs-2" and "utf-16" sources (two bytes per character,
	 * read high byte first and low byte first respectively), single-byte sources,
	 * and a "utf-8" source.
	 *
	 * @param string $sourceString
	 * @param string $charsetFrom
	 * @param string $charsetTo
	 * @return bool|string Converted string, or false when a charset is empty or a
	 * table could not be loaded.
	 */
	protected function convertByTables($sourceString, $charsetFrom, $charsetTo)
	{
		if ($charsetFrom == '')
		{
			$this->errors[] = new Error("Source charset is not set.");
			return false;
		}

		if ($charsetTo == '')
		{
			$this->errors[] = new Error("Destination charset is not set.");
			return false;
		}

		$charsetFrom = strtolower($charsetFrom);
		$charsetTo = strtolower($charsetTo);
		$resultString = "";

		if ($charsetFrom == "ucs-2" || $charsetFrom == "utf-16")
		{
			$convertTable = $this->buildConvertTable($charsetTo);
			if (!$convertTable)
			{
				return false;
			}

			$lowByteFirst = ($charsetFrom == "utf-16");
			$len = strlen($sourceString);

			// A trailing single byte is an incomplete code unit and is skipped.
			for ($i = 0; $i + 1 < $len; $i += 2)
			{
				$first = ord($sourceString[$i]);
				$second = ord($sourceString[$i + 1]);

				// Both bytes are zero-padded so that e.g. 04 05 yields "0405", not "0045".
				$hexChar = $lowByteFirst
					? sprintf("%02X%02X", $second, $first)
					: sprintf("%02X%02X", $first, $second);

				if (empty($convertTable[$charsetTo][$hexChar]))
				{
					// No mapping for this code unit: it is left out of the result.
					continue;
				}

				$mapped = $convertTable[$charsetTo][$hexChar];
				if ($charsetTo != "utf-8")
				{
					$resultString .= chr(hexdec($mapped));
				}
				else
				{
					$resultString .= $this->hexToUtf($mapped);
				}
			}
		}
		elseif ($charsetFrom != "utf-8")
		{
			if ($charsetTo != "utf-8")
			{
				$convertTable = $this->buildConvertTable($charsetFrom, $charsetTo);
			}
			else
			{
				$convertTable = $this->buildConvertTable($charsetFrom);
			}

			if (!$convertTable)
			{
				return false;
			}

			// Reverse map (source byte hex => Unicode hex) built once instead of
			// scanning the table for every byte; the first table entry wins.
			$byteToUnicode = array();
			foreach ($convertTable[$charsetFrom] as $unicodeHexChar => $tableHexChar)
			{
				if (!isset($byteToUnicode[$tableHexChar]))
				{
					$byteToUnicode[$tableHexChar] = (string)$unicodeHexChar;
				}
			}

			$stringLength = BinaryString::getLength($sourceString);
			for ($i = 0; $i < $stringLength; $i++)
			{
				$hexChar = sprintf("%02X", ord($sourceString[$i]));

				if ($charsetFrom == "gsm0338" && $hexChar === '1B' && $i + 1 < $stringLength)
				{
					// Escape byte: the table key is made of the escape and the next byte.
					$i++;
					$hexChar .= sprintf("%02X", ord($sourceString[$i]));
				}

				if (!isset($byteToUnicode[$hexChar]))
				{
					$this->errors[] = new Error(str_replace("#CHAR#", $sourceString[$i], "Cannot find matching char \"#CHAR#\" in source encoding table."));
					continue;
				}

				// A table key may hold several code points joined with "+".
				foreach (explode("+", $byteToUnicode[$hexChar]) as $unicodeHex)
				{
					if ($charsetTo == "utf-8")
					{
						$resultString .= $this->hexToUtf($unicodeHex);
					}
					elseif (array_key_exists($unicodeHex, $convertTable[$charsetTo]))
					{
						$resultString .= chr(hexdec($convertTable[$charsetTo][$unicodeHex]));
					}
					else
					{
						$this->errors[] = new Error(str_replace("#CHAR#", $sourceString[$i], "Cannot find matching char \"#CHAR#\" in destination encoding table."));
					}
				}
			}
		}
		else
		{
			$convertTable = $this->buildConvertTable($charsetTo);
			if (!$convertTable)
			{
				return false;
			}

			$replacePairs = array();
			foreach ($convertTable[$charsetTo] as $unicodeHexChar => $hexChar)
			{
				$utfChar = $this->hexToUtf($unicodeHexChar);
				if ($utfChar !== '')
				{
					$replacePairs[$utfChar] = chr(hexdec($hexChar));
				}
			}

			// strtr() replaces in a single pass, so a byte produced by one replacement
			// can never be picked up as part of another character's UTF-8 sequence.
			$resultString = empty($replacePairs) ? $sourceString : strtr($sourceString, $replacePairs);
		}

		return $resultString;
	}

	/**
	 * Returns the errors collected by this instance.
	 *
	 * @return Error[]
	 */
	public function getErrors()
	{
		return $this->errors->toArray();
	}
}