PageRenderTime 30ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/bitrix/modules/main/lib/text/encoding.php

https://bitbucket.org/alex_poluektov/itech_test
PHP | 528 lines | 424 code | 57 blank | 47 comment | 108 complexity | 6484e30334b7d926a9a36f9c8f96ddfc MD5 | raw file
Possible License(s): Apache-2.0
  1. <?php
  2. namespace Bitrix\Main\Text;
  3. use Bitrix\Main\Loader;
  4. use Bitrix\Main\Application;
  5. use Bitrix\Main\Config\Configuration;
  6. use Bitrix\Main\ErrorCollection;
  7. use Bitrix\Main\Error;
  8. class Encoding
  9. {
  10. const PATH_TO_CONVERT_TABLES = "/bitrix/modules/main/cvtables/";
  11. /** @var ErrorCollection */
  12. protected $errors;
  /**
   * Creates a converter instance that starts with an empty error collection.
   *
   * The constructor is protected; inside this class instances are created
   * with `new static` from convertEncoding().
   */
  protected function __construct()
  {
      $this->errors = new ErrorCollection;
  }
  /**
   * Converts data from a source encoding to a target encoding.
   *
   * Strings are converted with mbstring first, then iconv, then the bundled
   * conversion tables; each next engine is tried only when the previous one
   * did not produce a non-empty string. Arrays and \SplFixedArray instances
   * are processed recursively (array keys are converted as well). Any other
   * value is returned untouched.
   *
   * @param string|array|\SplFixedArray $data The data to convert. From main 16.0.10 data can be an array.
   * @param string $charsetFrom The source encoding.
   * @param string $charsetTo The target encoding.
   * @param string $errorMessage Reference to a variable; error messages are appended to it.
   * @return string|array|\SplFixedArray|bool Returns converted data or false on error.
   */
  public static function convertEncoding($data, $charsetFrom, $charsetTo, &$errorMessage = "")
  {
      if (strcasecmp($charsetFrom, $charsetTo) === 0)
      {
          // Same encoding (case-insensitive): nothing to convert
          return $data;
      }

      if (is_array($data))
      {
          // A fresh array is built so that elements whose key gets re-encoded
          // keep their original position instead of being moved to the end.
          $converted = array();
          foreach ($data as $key => $value)
          {
              $newKey = self::convertEncoding($key, $charsetFrom, $charsetTo, $errorMessage);
              if (!is_string($newKey) && !is_int($newKey))
              {
                  // Key conversion failed: keep the original key rather than
                  // letting `false` silently turn into index 0.
                  $newKey = $key;
              }
              $converted[$newKey] = self::convertEncoding($value, $charsetFrom, $charsetTo, $errorMessage);
          }

          return $converted;
      }

      if ($data instanceof \SplFixedArray)
      {
          // Indexes are integers, so only the values need converting (in place)
          for ($index = 0, $size = $data->getSize(); $index < $size; $index++)
          {
              $data[$index] = self::convertEncoding($data[$index], $charsetFrom, $charsetTo, $errorMessage);
          }

          return $data;
      }

      if (!is_string($data))
      {
          return $data;
      }

      if ($data === '')
      {
          return '';
      }

      $cvt = new static;

      $res = $cvt->convertByMbstring($data, $charsetFrom, $charsetTo);
      if (!is_string($res) || $res === '')
      {
          $res = $cvt->convertByIconv($data, $charsetFrom, $charsetTo);
          if (!is_string($res) || $res === '')
          {
              $res = $cvt->convertByTables($data, $charsetFrom, $charsetTo);
          }
      }

      $errors = $cvt->getErrors();
      if (!empty($errors))
      {
          $errorMessage .= implode("\n", $errors);
      }

      return $res;
  }
  /**
   * Legacy alias: forwards all arguments to convertEncoding() and returns its result.
   *
   * @deprecated Deprecated in main 16.0.10. Use Encoding::convertEncoding().
   * @param mixed $data The data to convert.
   * @param string $charsetFrom The source encoding.
   * @param string $charsetTo The target encoding.
   * @param string $errorMessage Reference passed through to convertEncoding().
   * @return mixed
   */
  public static function convertEncodingArray($data, $charsetFrom, $charsetTo, &$errorMessage = "")
  {
      $converted = self::convertEncoding($data, $charsetFrom, $charsetTo, $errorMessage);

      return $converted;
  }
  /**
   * Converts a string to the encoding the application currently works in.
   *
   * detectUtf8() decides whether the string looks like UTF-8 and
   * Application::isUtfMode() tells whether the application runs in UTF-8.
   * A conversion happens only when the two disagree.
   *
   * The non-UTF-8 charset is taken from the culture of the current context
   * (looked up only for a UTF-8 string in a non-UTF-8 application), then from
   * the "default_charset" configuration value, then defaults to "Windows-1251".
   *
   * @param string $string
   * @return bool|string
   */
  public static function convertEncodingToCurrent($string)
  {
      $stringIsUtf8 = self::detectUtf8($string);
      $appIsUtf8 = Application::isUtfMode();

      $utf8ToLegacy = !$appIsUtf8 && $stringIsUtf8;
      $legacyToUtf8 = $appIsUtf8 && !$stringIsUtf8;

      $legacyCharset = null;
      if ($utf8ToLegacy)
      {
          $context = Application::getInstance()->getContext();
          $culture = ($context != null) ? $context->getCulture() : null;
          if ($culture != null)
          {
              $legacyCharset = $culture->getCharset();
          }
      }

      if ($legacyCharset == null)
      {
          $legacyCharset = Configuration::getValue("default_charset");
      }
      if ($legacyCharset == null)
      {
          $legacyCharset = "Windows-1251";
      }

      if ($legacyToUtf8)
      {
          $source = $legacyCharset;
          $target = "UTF-8";
      }
      elseif ($utf8ToLegacy)
      {
          $source = "UTF-8";
          $target = $legacyCharset;
      }
      else
      {
          // String and application already agree
          $source = "";
          $target = "";
      }

      return ($source !== $target)
          ? self::convertEncoding($string, $source, $target)
          : $string;
  }
  /**
   * Heuristically tells whether a byte string looks like UTF-8.
   *
   * If the string contains %XX escapes, only the bytes described by those
   * escapes are examined. Every lead byte (11xxxxxx) followed by a
   * continuation byte (10xxxxxx) adds one point; a lead byte followed by
   * anything else, or a continuation byte right after an ASCII byte, removes
   * one point. The string counts as UTF-8 when the final score is positive.
   *
   * @param string $string
   * @return bool
   */
  public static function detectUtf8($string)
  {
      //http://mail.nl.linux.org/linux-utf8/1999-09/msg00110.html
      $escapesFound = preg_match_all("/(?:%)([0-9A-F]{2})/i", $string, $match);
      if ($escapesFound)
      {
          $hexDigits = strtoupper(implode('', $match[1]));
          $string = pack("H*", $hexDigits);
      }

      //valid UTF-8 octet sequences
      //0xxxxxxx
      //110xxxxx 10xxxxxx
      //1110xxxx 10xxxxxx 10xxxxxx
      //11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

      $score = 0;
      $previousTopBits = 0;
      foreach (unpack("C*", $string) as $octet)
      {
          $topBits = $octet & 0xC0;
          $isContinuation = ($topBits == 0x80);

          if ($previousTopBits == 0xC0)
          {
              // A lead byte must be followed by a continuation byte
              $score += $isContinuation ? 1 : -1;
          }
          elseif ($isContinuation && ($previousTopBits & 0x80) == 0x00)
          {
              // Continuation byte directly after an ASCII byte
              $score--;
          }

          $previousTopBits = $topBits;
      }

      return ($score > 0);
  }
  /**
   * Converts a string with the mbstring extension.
   *
   * Returns an empty string when mbstring is not loaded or when either
   * encoding is unknown to it, so that the caller can fall back to another
   * engine. Unknown encodings are reported by mbstring with an E_WARNING
   * before PHP 8 and with a \ValueError from PHP 8 on; both cases are handled.
   *
   * @param string $data String to convert.
   * @param string $charsetFrom The source encoding.
   * @param string $charsetTo The target encoding.
   * @return string|bool Converted string, '' when mbstring cannot be used,
   *   or whatever mb_convert_encoding() returns on failure.
   */
  protected function convertByMbstring($data, $charsetFrom, $charsetTo)
  {
      if (!extension_loaded("mbstring"))
      {
          return '';
      }

      try
      {
          // "@" silences the pre-PHP 8 warning; the catch below covers PHP 8+
          if (@mb_encoding_aliases($charsetFrom) === false || @mb_encoding_aliases($charsetTo) === false)
          {
              return '';
          }

          $payload = $data;
          $sourceCharset = $charsetFrom;

          // For UTF-16 the byte order has to be detected.
          // mbstring defaults to big endian, little endian must be named explicitly.
          if (strtoupper($charsetFrom) == "UTF-16")
          {
              $bom = substr($data, 0, 2);
              if ($bom === "\xFF\xFE")
              {
                  // Little endian BOM: strip it and tell mbstring the byte order
                  $payload = (string)substr($data, 2);
                  $sourceCharset = "UTF-16LE";
              }
              elseif ($bom === "\xFE\xFF")
              {
                  // Big endian BOM: just strip it
                  $payload = (string)substr($data, 2);
              }
              else
              {
                  // No BOM: assume little endian
                  $sourceCharset = "UTF-16LE";
              }
          }

          return mb_convert_encoding($payload, $charsetTo, $sourceCharset);
      }
      catch (\ValueError $e)
      {
          // Encoding not supported by mbstring
          return '';
      }
  }
  /**
   * Converts a string with iconv() or, when that function is missing, libiconv().
   *
   * Nothing is done (an empty string is returned) when the "disable_iconv"
   * configuration value is exactly true or when neither function exists.
   * UTF-16 input that does not start with a byte order mark gets a little
   * endian BOM prepended; input that already carries a BOM is passed as is.
   * A failed or empty conversion adds an Error to the instance error collection.
   *
   * @param string $data String to convert.
   * @param string $charsetFrom The source encoding.
   * @param string $charsetTo The target encoding.
   * @return string|bool Converted string, '' when iconv is unavailable or
   *   disabled, or false when the conversion function reports failure.
   */
  protected function convertByIconv($data, $charsetFrom, $charsetTo)
  {
      if (Configuration::getValue("disable_iconv") === true)
      {
          return '';
      }

      $input = $data;
      if (strtoupper($charsetFrom) == "UTF-16")
      {
          $bom = substr($data, 0, 2);
          if ($bom !== "\xFF\xFE" && $bom !== "\xFE\xFF")
          {
              // No BOM present: mark the data as little endian
              $input = "\xFF\xFE".$data;
          }
      }

      if (function_exists('iconv'))
      {
          $res = iconv($charsetFrom, $charsetTo."//IGNORE", $input);
          $failureMessage = "Iconv reported failure while converting string to requested character encoding.";
      }
      elseif (function_exists('libiconv'))
      {
          $res = libiconv($charsetFrom, $charsetTo, $input);
          $failureMessage = "Libiconv reported failure while converting string to requested character encoding.";
      }
      else
      {
          return '';
      }

      // A plain "!$res" check would also reject the valid result "0"
      if (!is_string($res) || $res === '')
      {
          $this->errors[] = new Error($failureMessage);
      }

      return $res;
  }
  /**
   * Loads the conversion tables named by the arguments and returns the table cache.
   *
   * Accepts any number of table file names, looked up under
   * PATH_TO_CONVERT_TABLES below the document root. Parsed tables are kept in
   * a static cache, so each file is read only once per request.
   *
   * Lines starting with "#" and blank lines are ignored. Other lines are split
   * on whitespace/commas: the first column becomes the value and the second
   * column the key of the table entry (both upper-cased, "0x" removed).
   * Lines without a usable second column are skipped.
   *
   * @return array|bool All cached tables indexed by file name, or false when a
   *   requested file is missing, is not a regular file or cannot be opened
   *   (an Error is added to the instance error collection in that case).
   */
  protected function buildConvertTable()
  {
      static $cvTables = array();

      foreach (func_get_args() as $fileName)
      {
          if (isset($cvTables[$fileName]))
          {
              continue;
          }

          $pathToTable = Loader::getDocumentRoot().self::PATH_TO_CONVERT_TABLES.$fileName;

          if (!file_exists($pathToTable))
          {
              $this->errors[] = new Error(str_replace("#FILE#", $pathToTable, "File #FILE# is not found."));
              return false;
          }

          if (!is_file($pathToTable))
          {
              $this->errors[] = new Error(str_replace("#FILE#", $pathToTable, "File #FILE# is not a file."));
              return false;
          }

          $hFile = fopen($pathToTable, "r");
          if (!$hFile)
          {
              $this->errors[] = new Error(str_replace("#FILE#", $pathToTable, "Can not open file #FILE# for reading."));
              return false;
          }

          $table = array();
          while (($rawLine = fgets($hFile, 1024)) !== false)
          {
              $line = trim($rawLine);
              if ($line === '' || substr($line, 0, 1) == "#")
              {
                  continue;
              }

              $columns = preg_split("/[\\s,]+/", $line, 3);
              if (!isset($columns[1]) || $columns[1] === '' || substr($columns[1], 0, 1) == "#")
              {
                  // No second column (or it is a comment): nothing to map
                  continue;
              }

              $key = strtoupper(str_replace("0x", "", $columns[1]));
              $value = strtoupper(str_replace("0x", "", $columns[0]));
              $table[$key] = $value;
          }
          fclose($hFile);

          $cvTables[$fileName] = $table;
      }

      return $cvTables;
  }
  /**
   * Encodes a Unicode code point, given as a hexadecimal string, as UTF-8.
   *
   * @param string $utfCharInHex Code point in hexadecimal notation, e.g. "041F".
   * @return string The UTF-8 byte sequence (1 to 4 bytes), or an empty string
   *   when the value is 0x200000 or greater.
   */
  protected function hexToUtf($utfCharInHex)
  {
      $codePoint = hexdec((string)$utfCharInHex);

      if ($codePoint < 0x80)
      {
          return chr($codePoint);
      }

      if ($codePoint < 0x800)
      {
          return chr(0xC0 | ($codePoint >> 6))
              .chr(0x80 | ($codePoint & 0x3F));
      }

      if ($codePoint < 0x10000)
      {
          return chr(0xE0 | ($codePoint >> 12))
              .chr(0x80 | (($codePoint >> 6) & 0x3F))
              .chr(0x80 | ($codePoint & 0x3F));
      }

      if ($codePoint < 0x200000)
      {
          // Every shift and mask is parenthesised: "+" binds tighter than
          // ">>" and "&", which broke this branch when written without them.
          return chr(0xF0 | ($codePoint >> 18))
              .chr(0x80 | (($codePoint >> 12) & 0x3F))
              .chr(0x80 | (($codePoint >> 6) & 0x3F))
              .chr(0x80 | ($codePoint & 0x3F));
      }

      return "";
  }
  /**
   * Converts a string using the conversion tables loaded by buildConvertTable().
   *
   * Four kinds of source encodings are handled:
   *  - "ucs-2":  two-byte units read high byte first;
   *  - "utf-16": two-byte units read low byte first;
   *  - "utf-8":  every UTF-8 sequence listed in the target table is replaced
   *              by its target byte in one pass;
   *  - anything else: byte-per-character encodings (plus the two-byte escape
   *              sequences of "gsm0338"), mapped through Unicode.
   *
   * Characters without a mapping are dropped (two-byte sources), left as they
   * are (utf-8 source) or reported in the error collection (other sources).
   * A trailing odd byte of two-byte input is ignored.
   *
   * @param string $sourceString
   * @param string $charsetFrom
   * @param string $charsetTo
   * @return bool|string Converted string, or false when a charset is empty or
   *   a required table cannot be loaded.
   */
  protected function convertByTables($sourceString, $charsetFrom, $charsetTo)
  {
      if ($charsetFrom == '')
      {
          $this->errors[] = new Error("Source charset is not set.");
          return false;
      }

      if ($charsetTo == '')
      {
          $this->errors[] = new Error("Destination charset is not set.");
          return false;
      }

      $charsetFrom = strtolower($charsetFrom);
      $charsetTo = strtolower($charsetTo);
      $toUtf8 = ($charsetTo == "utf-8");

      if ($charsetFrom == "ucs-2" || $charsetFrom == "utf-16")
      {
          $convertTable = $this->buildConvertTable($charsetTo);
          if (!$convertTable)
          {
              return false;
          }

          $targetTable = $convertTable[$charsetTo];
          $lowByteFirst = ($charsetFrom == "utf-16");

          $resultString = "";
          $length = BinaryString::getLength($sourceString);
          // "$i + 1 < $length" skips a dangling last byte instead of reading past the end
          for ($i = 0; $i + 1 < $length; $i += 2)
          {
              $first = ord($sourceString[$i]);
              $second = ord($sourceString[$i + 1]);

              // Each byte is zero-padded so that e.g. 04 05 yields "0405", not "45"
              $hexChar = $lowByteFirst
                  ? sprintf("%02X%02X", $second, $first)
                  : sprintf("%02X%02X", $first, $second);

              if (empty($targetTable[$hexChar]))
              {
                  continue;
              }

              $resultString .= $toUtf8
                  ? $this->hexToUtf($targetTable[$hexChar])
                  : chr(hexdec($targetTable[$hexChar]));
          }

          return $resultString;
      }

      if ($charsetFrom != "utf-8")
      {
          $convertTable = $toUtf8
              ? $this->buildConvertTable($charsetFrom)
              : $this->buildConvertTable($charsetFrom, $charsetTo);
          if (!$convertTable)
          {
              return false;
          }

          // Reverse lookup "source code => Unicode key", built once instead of
          // scanning the whole table for every input byte. The first matching
          // entry wins, as array_search() would return it.
          $sourceToUnicode = array();
          foreach ($convertTable[$charsetFrom] as $unicodeHexChar => $code)
          {
              if (!isset($sourceToUnicode[$code]))
              {
                  $sourceToUnicode[$code] = (string)$unicodeHexChar;
              }
          }

          $resultString = "";
          $length = BinaryString::getLength($sourceString);
          for ($i = 0; $i < $length; $i++)
          {
              $hexChar = sprintf("%02X", ord($sourceString[$i]));

              if ($charsetFrom == "gsm0338" && $hexChar === '1B' && $i + 1 < $length)
              {
                  // Escape byte: the character is described by the next byte as well
                  $i++;
                  $hexChar .= sprintf("%02X", ord($sourceString[$i]));
              }

              if (!isset($sourceToUnicode[$hexChar]))
              {
                  $this->errors[] = new Error(str_replace("#CHAR#", $sourceString[$i], "Cannot find matching char \"#CHAR#\" in source encoding table."));
                  continue;
              }

              // A table key may list several code points joined with "+"
              foreach (explode("+", $sourceToUnicode[$hexChar]) as $unicodePart)
              {
                  if ($toUtf8)
                  {
                      $resultString .= $this->hexToUtf($unicodePart);
                  }
                  elseif (array_key_exists($unicodePart, $convertTable[$charsetTo]))
                  {
                      $resultString .= chr(hexdec($convertTable[$charsetTo][$unicodePart]));
                  }
                  else
                  {
                      $this->errors[] = new Error(str_replace("#CHAR#", $sourceString[$i], "Cannot find matching char \"#CHAR#\" in destination encoding table."));
                  }
              }
          }

          return $resultString;
      }

      // UTF-8 source
      $convertTable = $this->buildConvertTable($charsetTo);
      if (!$convertTable)
      {
          return false;
      }

      $replacePairs = array();
      foreach ($convertTable[$charsetTo] as $unicodeHexChar => $hexChar)
      {
          $utfSequence = $this->hexToUtf((string)$unicodeHexChar);
          if ($utfSequence !== "")
          {
              $replacePairs[$utfSequence] = chr(hexdec($hexChar));
          }
      }

      if (empty($replacePairs))
      {
          return $sourceString;
      }

      // strtr() replaces in a single pass and never rescans its own output,
      // so bytes produced by one replacement cannot be mistaken for another
      // UTF-8 sequence (which chained str_replace() calls allowed).
      return strtr($sourceString, $replacePairs);
  }
  /**
   * Returns the errors gathered by this converter instance.
   *
   * @return array Result of toArray() called on the internal error collection.
   */
  public function getErrors()
  {
      $collected = $this->errors->toArray();

      return $collected;
  }
  471. }