PageRenderTime 53ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 0ms

/libraries/idna_convert/uctc.php

https://bitbucket.org/biojazzard/joomla-eboracast
PHP | 300 lines | 236 code | 14 blank | 50 comment | 84 complexity | 6591c9a5509bf525b29fc6ded3962d34 MD5 | raw file
Possible License(s): LGPL-2.1, GPL-2.0, MIT, BSD-3-Clause
  1. <?php
  2. /**
  3. * UCTC - The Unicode Transcoder
  4. *
  5. * Converts between various flavours of Unicode representations like UCS-4 or UTF-8
  6. * Supported schemes:
  7. * - UCS-4 Little Endian / Big Endian / Array (partially)
  8. * - UTF-16 Little Endian / Big Endian (not yet)
  9. * - UTF-8
  10. * - UTF-7
  11. * - UTF-7 IMAP (modified UTF-7)
  12. *
  13. * @package phlyMail Nahariya 4.0+ Default branch
  14. * @author Matthias Sommerfeld <mso@phlyLabs.de>
  15. * @copyright 2003-2009 phlyLabs Berlin, http://phlylabs.de
  16. * @version 0.0.6 2009-05-10
  17. */
  class uctc {
      /** Names of the supported encodings; each maps onto a pair of private *_ucs4array / ucs4array_* methods */
      private static $mechs = array('ucs4', /*'ucs4le', 'ucs4be', */'ucs4array', /*'utf16', 'utf16le', 'utf16be', */'utf8', 'utf7', 'utf7imap');
      /** Whether the UTF-8 decoder skips its out-of-range check; follows $safe_mode on every convert() call */
      private static $allow_overlong = false;
      /** Whether invalid input is replaced by $safe_char instead of raising an exception */
      private static $safe_mode;
      /** Replacement for invalid input in safe mode (by default the code point 0xFFFC) */
      private static $safe_char;

      /**
      * The actual conversion routine
      *
      * The data is first decoded into an array of code points (unless it already
      * is one) and then encoded into the target representation.
      *
      * @param mixed $data The data to convert, usually a string, array when converting from UCS-4 array
      * @param string $from Original encoding of the data
      * @param string $to Target encoding of the data
      * @param bool $safe_mode SafeMode tries to correct invalid codepoints
      * @param mixed $safe_char Replacement used in safe mode, normally an integer code point
      * @return mixed String or array on success, depending on target encoding
      * @throws Exception On an unknown encoding name or (outside safe mode) malformed input
      * @access public
      * @since 0.0.1
      */
      public static function convert($data, $from, $to, $safe_mode = false, $safe_char = 0xFFFC)
      {
          self::$safe_mode = ($safe_mode) ? true : false;
          self::$safe_char = ($safe_char) ? $safe_char : 0xFFFC;
          // Recomputed on every call so that a safe-mode call cannot switch off
          // the range checks of later strict calls (the flag is static state).
          self::$allow_overlong = self::$safe_mode;
          if (!in_array($from, self::$mechs, true)) throw new Exception('Invalid input format specified');
          if (!in_array($to, self::$mechs, true)) throw new Exception('Invalid output format specified');
          // Both names passed the whitelist above, so the method names built here
          // always refer to existing private methods - no eval() required.
          if ($from != 'ucs4array') {
              $decoder = $from.'_ucs4array';
              $data = self::$decoder($data);
          }
          if ($to != 'ucs4array') {
              $encoder = 'ucs4array_'.$to;
              $data = self::$encoder($data);
          }
          return $data;
      }

      /**
      * This converts an UTF-8 encoded string to its UCS-4 representation
      *
      * @param string $input The UTF-8 string to convert
      * @return array Array of 32bit values representing each codepoint
      * @throws Exception On malformed input when safe mode is off
      * @access private
      */
      private static function utf8_ucs4array($input)
      {
          $output = array();
          $out_len = 0;
          $inp_len = strlen($input);
          $mode = 'next'; // 'next': expect a start byte, 'add': expect continuation bytes, 'no': input was truncated
          $test = 'none';
          for ($k = 0; $k < $inp_len; ++$k) {
              $v = ord($input[$k]); // Extract byte from input string
              if ($v < 128) { // We found an ASCII char - put into string as is
                  $output[$out_len] = $v;
                  ++$out_len;
                  if ('add' == $mode) {
                      // ASCII inside an unfinished multi byte sequence
                      if (self::$safe_mode) {
                          $output[$out_len-2] = self::$safe_char;
                          $mode = 'next';
                      } else {
                          throw new Exception('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                      }
                  }
                  continue;
              }
              if ('next' == $mode) { // Try to find the next start byte; determine the width of the Unicode char
                  $start_byte = $v;
                  $mode = 'add';
                  $test = 'range';
                  if ($v >> 5 == 6) { // &110xxxxx 10xxxxx
                      $next_byte = 0; // Tells, how many times subsequent bitmasks must rotate 6bits to the left
                      $v = ($v - 192) << 6;
                  } elseif ($v >> 4 == 14) { // &1110xxxx 10xxxxxx 10xxxxxx
                      $next_byte = 1;
                      $v = ($v - 224) << 12;
                  } elseif ($v >> 3 == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                      $next_byte = 2;
                      $v = ($v - 240) << 18;
                  } elseif (self::$safe_mode) {
                      $mode = 'next';
                      $output[$out_len] = self::$safe_char;
                      ++$out_len;
                      continue;
                  } else {
                      throw new Exception('This might be UTF-8, but I don\'t understand it at byte '.$k);
                  }
                  if ($inp_len-$k-$next_byte < 2) {
                      // Not enough bytes left to complete this sequence. Advance the
                      // output position so a trailing ASCII byte cannot overwrite the
                      // replacement character.
                      $output[$out_len] = self::$safe_char;
                      ++$out_len;
                      $mode = 'no';
                      continue;
                  }
                  if ('add' == $mode) {
                      $output[$out_len] = (int) $v;
                      ++$out_len;
                      continue;
                  }
              }
              if ('add' == $mode) {
                  if (!self::$allow_overlong && $test == 'range') {
                      // Only the first continuation byte decides whether the sequence is in range
                      $test = 'none';
                      if (($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0) || ($v > 0x8F && $start_byte == 0xF4)) {
                          throw new Exception('Bogus UTF-8 character detected (out of legal range) at byte '.$k);
                      }
                  }
                  if ($v >> 6 == 2) { // Bit mask must be 10xxxxxx
                      $v = ($v-128) << ($next_byte*6);
                      $output[($out_len-1)] += $v;
                      --$next_byte;
                  } else {
                      if (self::$safe_mode) {
                          // The safe char is stored as is; passing the integer code
                          // point through ord() would yield the byte of its first digit.
                          $output[$out_len-1] = self::$safe_char;
                          $k--; // Look at this byte again, it may start a new sequence
                          $mode = 'next';
                          continue;
                      } else {
                          throw new Exception('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                      }
                  }
                  if ($next_byte < 0) {
                      $mode = 'next';
                  }
              }
          } // for
          return $output;
      }

      /**
      * Convert UCS-4 array into UTF-8 string
      *
      * @param array $input Array of code points
      * @return string The UTF-8 encoded string
      * @throws Exception On a code point outside 0 .. 2^21-1 when safe mode is off
      * @access private
      */
      private static function ucs4array_utf8($input)
      {
          $output = '';
          foreach ($input as $k => $v) {
              $bytes = self::utf8_encode_codepoint($v);
              if (false === $bytes) {
                  if (!self::$safe_mode) {
                      throw new Exception('Conversion from UCS-4 to UTF-8 failed: malformed input at byte '.$k);
                  }
                  if (is_int(self::$safe_char)) {
                      // An integer safe char is a code point and must be encoded itself
                      $bytes = self::utf8_encode_codepoint(self::$safe_char);
                  } elseif (is_string(self::$safe_char)) {
                      // A caller supplied string is inserted verbatim
                      $bytes = self::$safe_char;
                  }
                  if (false === $bytes) $bytes = "\xEF\xBF\xBC"; // U+FFFC, the default safe char
              }
              $output .= $bytes;
          }
          return $output;
      }

      /**
      * Encode a single code point as UTF-8
      *
      * @param int $v The code point
      * @return mixed The UTF-8 bytes, or false if the value is negative or needs more than 21 bits
      * @access private
      */
      private static function utf8_encode_codepoint($v)
      {
          if ($v < 0 || $v >= (1 << 21)) return false;
          if ($v < 128) { // 7bit are transferred literally
              return chr($v);
          }
          if ($v < (1 << 11)) { // 2 bytes
              return chr(192+($v >> 6)).chr(128+($v & 63));
          }
          if ($v < (1 << 16)) { // 3 bytes
              return chr(224+($v >> 12)).chr(128+(($v >> 6) & 63)).chr(128+($v & 63));
          }
          // 4 bytes
          return chr(240+($v >> 18)).chr(128+(($v >> 12) & 63)).chr(128+(($v >> 6) & 63)).chr(128+($v & 63));
      }

      /**
      * Convert a modified UTF-7 (IMAP mailbox name) string into an UCS-4 array
      *
      * @param string $input The modified UTF-7 string
      * @return array Array of code points
      * @access private
      */
      private static function utf7imap_ucs4array($input)
      {
          // The "," to "/" mapping is applied by the decoder to the base64 runs
          // only, so commas in the directly encoded text stay untouched.
          return self::utf7_ucs4array($input, '&');
      }

      /**
      * Convert an UTF-7 string into an UCS-4 array
      *
      * @param string $input The UTF-7 string
      * @param string $sc Shift character opening a base64 run; '&' selects the modified (IMAP) flavour
      * @return array Array of code points
      * @access private
      */
      private static function utf7_ucs4array($input, $sc = '+')
      {
          $imap = ('&' === $sc);
          $alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';
          if ($imap) $alphabet .= ','; // Modified UTF-7 writes "," where base64 has "/"
          $output = array();
          $inp_len = strlen($input);
          $mode = 'd'; // 'd': direct characters, 'b': inside a base64 run
          $b64 = '';
          for ($k = 0; $k < $inp_len; ++$k) {
              $c = $input[$k];
              if ("\0" === $c) continue; // Ignore zero bytes
              if ('b' == $mode) {
                  if (false !== strpos($alphabet, $c)) {
                      $b64 .= $c;
                      continue;
                  }
                  // Sequence got terminated
                  $mode = 'd';
                  if ('' === $b64) {
                      if ('-' == $c) { // Shift character followed by "-" is the shift character itself
                          $output[] = ord($sc);
                          continue;
                      }
                  } else {
                      foreach (self::utf7_decode_run($b64, $imap) as $cp) $output[] = $cp;
                      $b64 = '';
                      if ('-' == $c) continue; // The explicit terminator is absorbed
                  }
                  // Any other terminating character is ordinary text and handled below
              }
              if ($sc == $c) {
                  $mode = 'b';
                  continue;
              }
              $output[] = ord($c);
          }
          // A base64 run may also be terminated by the end of the input
          if ('b' == $mode && '' !== $b64) {
              foreach (self::utf7_decode_run($b64, $imap) as $cp) $output[] = $cp;
          }
          return $output;
      }

      /**
      * Decode the base64 payload of one UTF-7 shift sequence
      *
      * @param string $b64 The base64 characters between the shift character and the terminator
      * @param bool $imap Whether "," has to be read as "/" (modified UTF-7)
      * @return array The code points of the run, surrogate pairs already combined
      * @access private
      */
      private static function utf7_decode_run($b64, $imap)
      {
          if ($imap) $b64 = str_replace(',', '/', $b64);
          $raw = base64_decode($b64);
          if (false === $raw) $raw = '';
          $len = strlen($raw);
          $len -= $len % 2; // The payload is UTF-16BE; a dangling odd byte is dropped
          $points = array();
          $high = null; // High surrogate waiting for its low counterpart
          for ($i = 0; $i < $len; $i += 2) {
              $unit = (ord($raw[$i]) << 8) | ord($raw[$i+1]);
              if (null !== $high) {
                  if ($unit >= 0xDC00 && $unit <= 0xDFFF) {
                      $points[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                      $high = null;
                      continue;
                  }
                  $points[] = $high; // Unpaired surrogate is passed through unchanged
                  $high = null;
              }
              if ($unit >= 0xD800 && $unit <= 0xDBFF) {
                  $high = $unit;
                  continue;
              }
              $points[] = $unit;
          }
          if (null !== $high) $points[] = $high;
          return $points;
      }

      /**
      * Convert an UCS-4 array into a modified UTF-7 (IMAP mailbox name) string
      *
      * @param array $input Array of code points
      * @return string The modified UTF-7 string
      * @access private
      */
      private static function ucs4array_utf7imap($input)
      {
          // The "/" to "," mapping is applied by the encoder to the base64 runs
          // only, so slashes in the directly encoded text stay untouched.
          return self::ucs4array_utf7($input, '&');
      }

      /**
      * Convert an UCS-4 array into an UTF-7 string
      *
      * Code points 0x20 .. 0x7e except the shift character are written directly,
      * everything else is collected into base64 runs which are always closed
      * with "-".
      *
      * @param array $input Array of code points
      * @param string $sc Shift character opening a base64 run; '&' selects the modified (IMAP) flavour
      * @return string The UTF-7 string
      * @access private
      */
      private static function ucs4array_utf7($input, $sc = '+')
      {
          $imap = ('&' === $sc);
          $sc_ord = ord($sc);
          $output = '';
          $pending = ''; // UTF-16BE bytes waiting to be written as one base64 run
          foreach ($input as $v) {
              if (0x20 <= $v && $v <= 0x7e && $v != $sc_ord) {
                  if ('' !== $pending) {
                      $output .= self::utf7_encode_run($pending, $sc, $imap);
                      $pending = '';
                  }
                  $output .= chr($v);
              } else {
                  $pending .= self::utf16be_encode_codepoint($v);
              }
          }
          if ('' !== $pending) {
              $output .= self::utf7_encode_run($pending, $sc, $imap);
          }
          return $output;
      }

      /**
      * Write a run of UTF-16BE bytes as one UTF-7 shift sequence
      *
      * @param string $bytes The UTF-16BE bytes of the run
      * @param string $sc The shift character
      * @param bool $imap Whether "/" has to be written as "," (modified UTF-7)
      * @return string The shift sequence including shift character and terminating "-"
      * @access private
      */
      private static function utf7_encode_run($bytes, $sc, $imap)
      {
          if ($bytes == chr(0).$sc) return $sc.'-'; // A lone shift character has a short form
          $b64 = str_replace('=', '', base64_encode($bytes));
          if ($imap) $b64 = str_replace('/', ',', $b64);
          return $sc.$b64.'-';
      }

      /**
      * Encode a single code point as UTF-16BE
      *
      * @param int $v The code point
      * @return string Two bytes, or four bytes (a surrogate pair) for U+10000 .. U+10FFFF
      * @access private
      */
      private static function utf16be_encode_codepoint($v)
      {
          if ($v >= 0x10000 && $v <= 0x10FFFF) {
              $v -= 0x10000;
              $high = 0xD800 | ($v >> 10);
              $low = 0xDC00 | ($v & 0x3FF);
              return chr($high >> 8).chr($high & 255).chr($low >> 8).chr($low & 255);
          }
          return chr(($v >> 8) & 255).chr($v & 255);
      }

      /**
      * Convert UCS-4 array into UCS-4 string (Big Endian, most significant byte first)
      *
      * @param array $input Array of code points
      * @return string Four bytes per code point
      * @access private
      */
      private static function ucs4array_ucs4($input)
      {
          $output = '';
          foreach ($input as $v) {
              $output .= chr(($v >> 24) & 255).chr(($v >> 16) & 255).chr(($v >> 8) & 255).chr($v & 255);
          }
          return $output;
      }

      /**
      * Convert UCS-4 string (Big Endian, most significant byte first) into UCS-4 array
      *
      * @param string $input The UCS-4 string, four bytes per code point
      * @return array Array of code points
      * @throws Exception If the input length is not a multiple of four
      * @access private
      */
      private static function ucs4_ucs4array($input)
      {
          $output = array();
          $inp_len = strlen($input);
          // Input length must be dividable by 4
          if ($inp_len % 4) {
              throw new Exception('Input UCS4 string is broken');
          }
          for ($i = 0; $i < $inp_len; $i += 4) {
              $output[] = (ord($input[$i]) << 24) + (ord($input[$i+1]) << 16) + (ord($input[$i+2]) << 8) + ord($input[$i+3]);
          }
          return $output;
      }
  }
  286. ?>