PageRenderTime 54ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/external/SimplePie/idn/idna_convert.class.php

https://github.com/JojoCMS-Plugins/jojo_galleryrss
PHP | 991 lines | 647 code | 56 blank | 288 comment | 164 complexity | 791305b6414cd09383055d7db2fdfbaa MD5 | raw file
Possible License(s): LGPL-2.1
  1. <?php
  2. /* ------------------------------------------------------------------------- */
  3. /* idna_convert.class.php - Encode / Decode Internationalized Domain Names */
  4. /* (c) 2004-2005 phlyLabs, Berlin (http://phlylabs.de) */
  5. /* All rights reserved */
  6. /* v0.4.2 */
  7. /* ------------------------------------------------------------------------- */
  8. // {{{ license
  9. /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
  10. //
  11. // +----------------------------------------------------------------------+
  12. // | This library is free software; you can redistribute it and/or modify |
  13. // | it under the terms of the GNU Lesser General Public License as |
  14. // | published by the Free Software Foundation; either version 2.1 of the |
  15. // | License, or (at your option) any later version. |
  16. // | |
  17. // | This library is distributed in the hope that it will be useful, but |
  18. // | WITHOUT ANY WARRANTY; without even the implied warranty of |
  19. // | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
  20. // | Lesser General Public License for more details. |
  21. // | |
  22. // | You should have received a copy of the GNU Lesser General Public |
  23. // | License along with this library; if not, write to the Free Software |
  24. // | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 |
  25. // | USA. |
  26. // +----------------------------------------------------------------------+
  27. //
  28. // }}}
  29. /**
  30. * Encode/decode Internationalized Domain Names.
  31. *
  32. * The class allows to convert internationalized domain names
  33. * (see RFC 3490 for details) as they can be used with various registries worldwide
  34. * to be translated between their original (localized) form and their encoded form
  35. * as it will be used in the DNS (Domain Name System).
  36. *
  37. * The class provides two public methods, encode() and decode(), which do exactly
  38. * what you would expect them to do. You are allowed to use complete domain names,
  39. * simple strings and complete email addresses as well. That means, that you might
  40. * use any of the following notations:
  41. *
  42. * - www.nörgler.com
  43. * - xn--nrgler-wxa
  44. * - xn--brse-5qa.xn--knrz-1ra.info
  45. *
  46. * Unicode input might be given as either UTF-8 string, UCS-4 string or UCS-4
  47. * array. Unicode output is available in the same formats.
  48. * You can select your preferred format via {@link set_parameter()}.
  49. *
  50. * ACE input and output is always expected to be ASCII.
  51. *
  52. * @author Matthias Sommerfeld <mso@phlylabs.de>
  53. * @version 0.4.2
  54. *
  55. */
  56. class idna_convert
  57. {
  // {{{ npdata
  /**
   * Holds all relevant mapping tables, loaded from a separate file
   * (npdata.ser) when the object is constructed.
   * See RFC3454 for details
   *
   * @var array
   * @access private
   */
  var $_np_ = array();
  // }}}
  // Internal settings, do not mess with them
  // Prefix that _decode() requires and _encode() prepends / refuses to re-encode
  var $_punycode_prefix = 'xn--';
  var $_invalid_ucs = 0x80000000;
  // Upper bound used by _encode() when searching for the next code point
  var $_max_ucs = 0x10FFFF;
  // Parameters of the digit / bias arithmetic in _decode(), _encode() and _adapt()
  var $_base = 36;
  var $_tmin = 1;
  var $_tmax = 26;
  var $_skew = 38;
  var $_damp = 700;
  var $_initial_bias = 72;
  var $_initial_n = 0x80;
  // Constants used by _hangul_decompose() and _hangul_compose()
  var $_sbase = 0xAC00;
  var $_lbase = 0x1100;
  var $_vbase = 0x1161;
  var $_tbase = 0x11a7;
  var $_lcount = 19;
  var $_vcount = 21;
  var $_tcount = 28;
  var $_ncount = 588; // _vcount * _tcount
  var $_scount = 11172; // _lcount * _tcount * _vcount
  // Last error message recorded through _error(); false until one is set
  var $_error = false;
  // See set_parameter() for details of how to change the following settings
  // from within your script / application
  var $_api_encoding = 'utf8'; // Default input charset is UTF-8
  var $_allow_overlong = false; // Overlong UTF-8 encodings are forbidden
  var $_strict_mode = false; // Behave strict or not
  /**
   * Constructor.
   *
   * Loads the serialized NAMEPREP tables from npdata.ser (expected in the
   * same directory as this file) into $_np_ and, if an options array is
   * given, forwards it to set_parameter().
   *
   * If the table file cannot be read or unserialized, $_np_ keeps its
   * declared default and an error is recorded (see get_last_error()).
   *
   * @param mixed Array of Parameter => Value pairs for set_parameter(), or false
   * @return boolean Result of set_parameter() when options are given, else true
   *                 (only observable when called through the idna_convert() shim)
   * @access public
   */
  function __construct($options = false)
  {
    $this->slast = $this->_sbase + $this->_lcount * $this->_vcount * $this->_tcount;
    $file = dirname(__FILE__).'/npdata.ser';
    $raw = false;
    if (is_readable($file)) {
      if (function_exists('file_get_contents')) {
        $raw = file_get_contents($file);
      } else {
        $raw = join('', file($file));
      }
    }
    $tables = ($raw === false) ? false : unserialize($raw);
    if (is_array($tables)) {
      $this->_np_ = $tables;
    } else {
      $this->_error('Could not load NAMEPREP data from '.$file);
    }
    // If parameters are given, pass these to the respective method
    if (is_array($options)) {
      return $this->set_parameter($options);
    }
    return true;
  }

  /**
   * Class-named constructor kept for backward compatibility.
   *
   * PHP 8 no longer treats a method named after the class as the constructor,
   * so the real work lives in __construct(); this method only delegates.
   *
   * @param mixed Array of Parameter => Value pairs for set_parameter(), or false
   * @return boolean See __construct()
   * @access public
   */
  function idna_convert($options = false)
  {
    return $this->__construct($options);
  }
  /**
   * Changes one or several settings of the converter.
   *
   * Recognised options:
   *  - 'encoding': one of 'utf8', 'ucs4_string' or 'ucs4_array'
   *  - 'overlong': truthy to accept overlong UTF-8 sequences, falsy to reject them
   *  - 'strict':   truthy to enable strict mode, falsy for loose mode
   *
   * Processing stops at the first unknown option or unknown encoding value;
   * an error message is recorded in that case.
   *
   * @param mixed Option name (string) or array of Option => Value pairs
   * @param mixed Value to use (only when parameter 1 is a single option name)
   * @return boolean true on success, false otherwise
   * @access public
   */
  function set_parameter($option, $value = false)
  {
    $settings = is_array($option) ? $option : array($option => $value);
    foreach ($settings as $name => $setting) {
      switch ($name) {
        case 'encoding':
          if (!in_array($setting, array('utf8', 'ucs4_string', 'ucs4_array'))) {
            $this->_error('Set Parameter: Unknown parameter '.$setting.' for option '.$name);
            return false;
          }
          $this->_api_encoding = $setting;
          break;
        case 'overlong':
          $this->_allow_overlong = (bool) $setting;
          break;
        case 'strict':
          $this->_strict_mode = (bool) $setting;
          break;
        default:
          $this->_error('Set Parameter: Unknown option '.$name);
          return false;
      }
    }
    return true;
  }
  /**
   * Decode a given ACE domain name.
   *
   * The input may be a single label, an email address or a complete URL.
   * For email addresses and URLs every dot-separated part of the host is
   * passed to _decode(); parts that cannot be decoded are left unchanged.
   * Email addresses and URLs are rejected in strict mode.
   *
   * @param string Domain name (ACE string)
   * [@param string Desired output encoding, see {@link set_parameter}]
   * @return mixed Decoded domain name (UTF-8 string, UCS-4 string or UCS-4 array);
   *               false on error
   * @access public
   */
  function decode($input, $one_time_encoding = false)
  {
    // Optionally validate the one-time output encoding
    if ($one_time_encoding) {
      switch ($one_time_encoding) {
        case 'utf8':
        case 'ucs4_string':
        case 'ucs4_array':
          break;
        default:
          $this->_error('Unknown encoding '.$one_time_encoding);
          return false;
      }
    }
    // Make sure to drop any newline characters around
    $input = trim($input);
    // Negotiate input and try to determine, whether it is a plain string,
    // an email address or something like a complete URL
    if (strpos($input, '@')) { // Maybe it is an email address ('@' at offset 0 does not count)
      // No no in strict mode
      if ($this->_strict_mode) {
        $this->_error('Only simple domain name parts can be handled in strict mode');
        return false;
      }
      list($email_pref, $input) = explode('@', $input, 2);
      $arr = explode('.', $input);
      foreach ($arr as $k => $v) {
        $conv = $this->_decode($v);
        if ($conv) $arr[$k] = $conv;
      }
      $return = $email_pref . '@' . join('.', $arr);
    } elseif (preg_match('![:\./]!', $input)) { // Or a complete domain name (with or without paths / parameters)
      // No no in strict mode
      if ($this->_strict_mode) {
        $this->_error('Only simple domain name parts can be handled in strict mode');
        return false;
      }
      $parsed = parse_url($input);
      if (isset($parsed['host'])) {
        $arr = explode('.', $parsed['host']);
        foreach ($arr as $k => $v) {
          $conv = $this->_decode($v);
          if ($conv) $arr[$k] = $conv;
        }
        $parsed['host'] = join('.', $arr);
        // parse_url() omits components that are absent from the input, so every
        // optional part - including the path - has to be guarded before use
        $return =
          (empty($parsed['scheme']) ? '' : $parsed['scheme'].(strtolower($parsed['scheme']) == 'mailto' ? ':' : '://'))
          .(empty($parsed['user']) ? '' : $parsed['user'].(empty($parsed['pass']) ? '' : ':'.$parsed['pass']).'@')
          .$parsed['host']
          .(empty($parsed['port']) ? '' : ':'.$parsed['port'])
          .(isset($parsed['path']) ? $parsed['path'] : '')
          .(empty($parsed['query']) ? '' : '?'.$parsed['query'])
          .(empty($parsed['fragment']) ? '' : '#'.$parsed['fragment']);
      } else { // parse_url seems to have failed, try without it
        $arr = explode('.', $input);
        foreach ($arr as $k => $v) {
          $conv = $this->_decode($v);
          if ($conv) $arr[$k] = $conv;
        }
        $return = join('.', $arr);
      }
    } else { // Otherwise we consider it being a pure domain name string
      $return = $this->_decode($input);
    }
    // The output is UTF-8 by default, other output formats need conversion here
    // If one time encoding is given, use this, else the objects property
    switch (($one_time_encoding) ? $one_time_encoding : $this->_api_encoding) {
      case 'utf8':
        return $return;
      case 'ucs4_string':
        return $this->_ucs4_to_ucs4_string($this->_utf8_to_ucs4($return));
      case 'ucs4_array':
        return $this->_utf8_to_ucs4($return);
      default:
        $this->_error('Unsupported output format');
        return false;
    }
  }
  /**
   * Encode a given domain name into its ACE form.
   *
   * The input is first converted to an array of code points. It is then cut
   * at the separator characters '.', '/', ':', '?' and '@' (the ideographic
   * and fullwidth dot variants are turned into a plain dot first) and every
   * piece is passed to _encode(). Pieces that _encode() rejects are emitted
   * as UTF-8 instead. Separators are not allowed in strict mode.
   *
   * @param mixed Domain name (UTF-8 string, UCS-4 string or UCS-4 array)
   * [@param string Desired input encoding, see {@link set_parameter}]
   * @return mixed Encoded domain name (ACE string); '' for empty input; false on error
   * @access public
   */
  function encode($decoded, $one_time_encoding = false)
  {
    // A one-time encoding takes precedence over the object's setting
    $format = ($one_time_encoding) ? $one_time_encoding : $this->_api_encoding;
    switch ($format) {
      case 'utf8':
        $decoded = $this->_utf8_to_ucs4($decoded);
        break;
      case 'ucs4_string':
        $decoded = $this->_ucs4_string_to_ucs4($decoded);
        break;
      case 'ucs4_array':
        break;
      default:
        $this->_error('Unsupported input format');
        return false;
    }
    // Nothing in, nothing out
    if (empty($decoded)) {
      return '';
    }
    $dot_variants = array(0x3002, 0xFF0E, 0xFF61);
    $separators = array(0x2E, 0x2F, 0x3A, 0x3F, 0x40);
    $segment_start = 0; // Index of the first code point of the current piece
    $output = '';
    foreach ($decoded as $pos => $cp) {
      if (in_array($cp, $dot_variants)) {
        // Normalise to the plain dot, then treat it like any other separator
        $decoded[$pos] = 0x2E;
      } elseif (!in_array($cp, $separators)) {
        continue;
      }
      // Neither email addresses nor URLs allowed in strict mode
      if ($this->_strict_mode) {
        $this->_error('Neither email addresses nor URLs are allowed in strict mode.');
        return false;
      }
      // A separator at the very first position has nothing in front of it
      if ($pos) {
        $segment = array_slice($decoded, $segment_start, $pos - $segment_start);
        $ace = $this->_encode($segment);
        if ($ace) {
          $output .= $ace;
        } else {
          $output .= $this->_ucs4_to_utf8($segment);
        }
        $output .= chr($decoded[$pos]);
      }
      $segment_start = $pos + 1;
    }
    if (!$segment_start) {
      // No separator seen: the whole input is a single piece
      $output = $this->_encode($decoded);
      if ($output) {
        return $output;
      }
      return $this->_ucs4_to_utf8($decoded);
    }
    // Handle whatever follows the last separator
    $tail = array_slice($decoded, $segment_start, sizeof($decoded) - $segment_start);
    $ace = $this->_encode($tail);
    if ($ace) {
      $output .= $ace;
    } else {
      $output .= $this->_ucs4_to_utf8($tail);
    }
    return $output;
  }
  /**
   * Use this method to get the last error that occurred.
   *
   * Returns the current value of the $_error property, i.e. whatever was
   * most recently stored through _error().
   *
   * @param void
   * @return mixed The last error message; false if none has been recorded
   * @access public
   */
  function get_last_error()
  {
    return $this->_error;
  }
  /**
   * The actual decoding algorithm.
   *
   * Expects a single label starting with the Punycode prefix. Everything
   * between the prefix and the last hyphen is copied literally; the
   * remainder is read as a sequence of variable-length integers, each of
   * which yields one code point and its insert position.
   *
   * @param string Single ACE label
   * @return mixed UTF-8 string on success, false on error
   * @access private
   */
  function _decode($encoded)
  {
    $prefix_len = strlen($this->_punycode_prefix);
    // We do need to find the Punycode prefix
    if (!preg_match('!^'.preg_quote($this->_punycode_prefix, '!').'!', $encoded)) {
      $this->_error('This is not a punycode string');
      return false;
    }
    $encode_test = preg_replace('!^'.preg_quote($this->_punycode_prefix, '!').'!', '', $encoded);
    // If nothing left after removing the prefix, it is hopeless
    if (!$encode_test) {
      $this->_error('The given encoded string was empty');
      return false;
    }
    // Find last occurrence of the delimiter
    $delim_pos = strrpos($encoded, '-');
    // Literal code points sit between the prefix and the last delimiter
    $decoded = array();
    if ($delim_pos > $prefix_len) {
      for ($k = $prefix_len; $k < $delim_pos; ++$k) {
        $decoded[] = ord($encoded[$k]);
      }
    }
    $deco_len = count($decoded);
    $enco_len = strlen($encoded);
    // Wandering through the strings; init
    $is_first = true;
    $bias = $this->_initial_bias;
    $idx = 0;
    $char = $this->_initial_n;
    for ($enco_idx = ($delim_pos) ? ($delim_pos + 1) : 0; $enco_idx < $enco_len; ++$deco_len) {
      for ($old_idx = $idx, $w = 1, $k = $this->_base; 1 ; $k += $this->_base) {
        // A variable-length integer must end before the input does
        if ($enco_idx >= $enco_len) {
          $this->_error('The given encoded string is truncated');
          return false;
        }
        $digit = $this->_decode_digit($encoded[$enco_idx++]);
        $idx += $digit * $w;
        $t = ($k <= $bias) ? $this->_tmin :
            (($k >= $bias + $this->_tmax) ? $this->_tmax : ($k - $bias));
        if ($digit < $t) break;
        $w = (int) ($w * ($this->_base - $t));
      }
      $bias = $this->_adapt($idx - $old_idx, $deco_len + 1, $is_first);
      $is_first = false;
      $char += (int) ($idx / ($deco_len + 1));
      $idx %= ($deco_len + 1);
      if ($deco_len > 0) {
        // Make room for the decoded char
        for ($i = $deco_len; $i > $idx; $i--) {
          $decoded[$i] = $decoded[($i - 1)];
        }
      }
      $decoded[$idx++] = $char;
    }
    return $this->_ucs4_to_utf8($decoded);
  }
  /**
   * The actual encoding algorithm.
   *
   * Runs NAMEPREP on the given label, copies all code points below
   * $_initial_n literally and encodes the remaining ones as
   * variable-length integers appended after the Punycode prefix.
   *
   * @param array Single label as array of code points
   * @return mixed ACE string on success; false if the label already carries
   *               the Punycode prefix, contains nothing to encode, fails
   *               NAMEPREP or contains code points outside the encodable range
   * @access private
   */
  function _encode($decoded)
  {
    // We cannot encode a domain name containing the Punycode prefix
    $extract = strlen($this->_punycode_prefix);
    $check_pref = $this->_utf8_to_ucs4($this->_punycode_prefix);
    $check_deco = array_slice($decoded, 0, $extract);
    if ($check_pref == $check_deco) {
      $this->_error('This is already a punycode string');
      return false;
    }
    // We will not try to encode strings consisting of basic code points only
    $encodable = false;
    foreach ($decoded as $k => $v) {
      if ($v > 0x7a) {
        $encodable = true;
        break;
      }
    }
    if (!$encodable) {
      $this->_error('The given string does not contain encodable chars');
      return false;
    }
    // Do NAMEPREP
    $decoded = $this->_nameprep($decoded);
    if (!$decoded || !is_array($decoded)) return false; // NAMEPREP failed
    $deco_len = count($decoded);
    if (!$deco_len) return false; // Empty array
    $codecount = 0; // How many chars have been consumed
    $encoded = '';
    // Copy all basic code points to output. Every code point below the
    // initial n has to be consumed here: the loop further down only ever
    // looks at code points >= $_initial_n and would otherwise never finish.
    for ($i = 0; $i < $deco_len; ++$i) {
      $test = $decoded[$i];
      if (0 <= $test && $test < $this->_initial_n) {
        $encoded .= chr($test);
        $codecount++;
      }
    }
    if ($codecount == $deco_len) return $encoded; // All codepoints were basic ones
    // Start with the prefix; copy it to output
    $encoded = $this->_punycode_prefix.$encoded;
    // If we have basic code points in output, add an hyphen to the end
    if ($codecount) $encoded .= '-';
    // Now find and encode all non-basic code points
    $is_first = true;
    $cur_code = $this->_initial_n;
    $bias = $this->_initial_bias;
    $delta = 0;
    while ($codecount < $deco_len) {
      // Find the smallest code point >= the current code point and
      // remember the last occurrence of it in the input
      $found = false;
      for ($i = 0, $next_code = $this->_max_ucs; $i < $deco_len; $i++) {
        if ($decoded[$i] >= $cur_code && $decoded[$i] <= $next_code) {
          $next_code = $decoded[$i];
          $found = true;
        }
      }
      // Unconsumed code points remain but none lies within the encodable
      // range (negative or above $_max_ucs): bail out instead of spinning
      if (!$found) {
        $this->_error('The given string contains code points that cannot be encoded');
        return false;
      }
      $delta += ($next_code - $cur_code) * ($codecount + 1);
      $cur_code = $next_code;
      // Scan input again and encode all characters whose code point is $cur_code
      for ($i = 0; $i < $deco_len; $i++) {
        if ($decoded[$i] < $cur_code) {
          $delta++;
        } elseif ($decoded[$i] == $cur_code) {
          for ($q = $delta, $k = $this->_base; 1; $k += $this->_base) {
            $t = ($k <= $bias) ? $this->_tmin :
                (($k >= $bias + $this->_tmax) ? $this->_tmax : $k - $bias);
            if ($q < $t) break;
            $encoded .= $this->_encode_digit($t + (($q - $t) % ($this->_base - $t)));
            // Integer division keeps $q an integer for the next digit
            $q = (int) (($q - $t) / ($this->_base - $t));
          }
          $encoded .= $this->_encode_digit($q);
          $bias = $this->_adapt($delta, $codecount+1, $is_first);
          $codecount++;
          $delta = 0;
          $is_first = false;
        }
      }
      $delta++;
      $cur_code++;
    }
    return $encoded;
  }
  /**
   * Computes the new bias after a delta has been encoded or decoded.
   *
   * @param integer Delta that was just processed
   * @param integer Number of code points handled so far (including the current one)
   * @param boolean true for the very first delta, which is damped by $_damp instead of 2
   * @return integer The adapted bias
   * @access private
   */
  function _adapt($delta, $npoints, $is_first)
  {
    $divisor = $is_first ? $this->_damp : 2;
    $delta = (int) ($delta / $divisor);
    $delta += (int) ($delta / $npoints);
    $span = $this->_base - $this->_tmin;
    $threshold = ($span * $this->_tmax) / 2;
    $k = 0;
    while ($delta > $threshold) {
      $delta = (int) ($delta / $span);
      $k += $this->_base;
    }
    return (int) ($k + ($span + 1) * $delta / ($delta + $this->_skew));
  }
  /**
   * Turns a digit value into its character: values below 26 map to
   * 'a'..'z', the values from 26 upwards map to '0'..'9'.
   *
   * @param integer Digit value
   * @return string Single character representing the digit
   * @access private
   */
  function _encode_digit($d)
  {
    // 97 is ord('a'); 22 is ord('0') - 26
    $offset = ($d < 26) ? 97 : 22;
    return chr($d + $offset);
  }
  /**
   * Turns a character into its digit value: '0'..'9' give 26..35,
   * 'A'..'Z' and 'a'..'z' give 0..25.
   *
   * NOTE(review): the comparisons only check upper bounds, so bytes that are
   * not ASCII letters or digits fall into one of the first three branches
   * when they are below 'z' (and may yield negative values); only bytes
   * above 'z' yield $_base.
   *
   * @param string Single character
   * @return integer Digit value
   * @access private
   */
  function _decode_digit($cp)
  {
    $code = ord($cp);
    if ($code < 58) {
      return $code - 22;
    }
    if ($code < 91) {
      return $code - 65;
    }
    if ($code < 123) {
      return $code - 97;
    }
    return $this->_base;
  }
  /**
   * Internal error handling method.
   *
   * Stores the given message in the $_error property, replacing any
   * previously stored message; it can be read back via get_last_error().
   *
   * @param string Error message to remember (defaults to an empty string)
   * @access private
   */
  function _error($error = '')
  {
    $this->_error = $error;
  }
  /**
   * Do Nameprep according to RFC3491 and RFC3454.
   *
   * First pass: drops code points listed in 'map_nothing', rejects prohibited
   * code points, decomposes Hangul syllables and applies the replacement
   * mappings from the loaded tables. Second pass: tries to recombine
   * sequences via _combine().
   *
   * NOTE(review): the recombination pass is kept exactly as it was; only the
   * count() call on the value returned by _combine() is guarded, because that
   * value is usually a single integer code point and count() on an integer
   * throws a TypeError on PHP 8.
   *
   * @param array Unicode Characters
   * @return mixed Array of Unicode Characters, Nameprep'd; false on prohibited input
   * @access private
   */
  function _nameprep($input)
  {
    $output = array();
    //
    // Mapping
    // Walking through the input array, performing the required steps on each of
    // the input chars and putting the result into the output array
    // While mapping required chars we apply the canonical ordering
    foreach ($input as $v) {
      // Map to nothing == skip that code point
      if (in_array($v, $this->_np_['map_nothing'])) continue;
      // Try to find prohibited input
      if (in_array($v, $this->_np_['prohibit']) || in_array($v, $this->_np_['general_prohibited'])) {
        $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
        return false;
      }
      foreach ($this->_np_['prohibit_ranges'] as $range) {
        if ($range[0] <= $v && $v <= $range[1]) {
          $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
          return false;
        }
      }
      //
      // Hangul syllable decomposition
      if (0xAC00 <= $v && $v <= 0xD7AF) {
        foreach ($this->_hangul_decompose($v) as $out) {
          $output[] = $out;
        }
      // There's a decomposition mapping for that code point
      } elseif (isset($this->_np_['replacemaps'][$v])) {
        foreach ($this->_apply_cannonical_ordering($this->_np_['replacemaps'][$v]) as $out) {
          $output[] = $out;
        }
      } else {
        $output[] = $v;
      }
    }
    //
    // Combine code points
    //
    $last_class = 0;
    $last_starter = 0;
    $out_len = count($output);
    for ($i = 0; $i < $out_len; ++$i) {
      $class = $this->_get_combining_class($output[$i]);
      if ((!$last_class || $last_class != $class) && $class) {
        // Try to match
        $seq_len = $i - $last_starter;
        $out = $this->_combine(array_slice($output, $last_starter, $seq_len));
        // On match: Replace the last starter with the composed character and remove
        // the now redundant non-starter(s)
        if ($out) {
          $output[$last_starter] = $out;
          // A scalar result counts as one element (count() rejects integers on PHP 8)
          if ((is_array($out) ? count($out) : 1) != $seq_len) {
            for ($j = $i+1; $j < $out_len; ++$j) {
              $output[$j-1] = $output[$j];
            }
            unset($output[$out_len]);
          }
          // Rewind the for loop by one, since there can be more possible compositions
          $i--;
          $out_len--;
          $last_class = ($i == $last_starter) ? 0 : $this->_get_combining_class($output[$i-1]);
          continue;
        }
      }
      if (!$class) { // The current class is 0
        $last_starter = $i;
      }
      $last_class = $class;
    }
    return $output;
  }
  /**
   * Decomposes a Hangul syllable into its two or three components
   * (see http://www.unicode.org/unicode/reports/tr15/#Hangul).
   *
   * @param integer 32bit UCS4 code point
   * @return array Components of the syllable, or the unchanged code point as a
   *               one element array if it lies outside the syllable block
   * @access private
   */
  function _hangul_decompose($char)
  {
    $offset = $char - $this->_sbase;
    if (!(0 <= $offset && $offset < $this->_scount)) {
      return array($char);
    }
    $lead = (int) ($this->_lbase + $offset / $this->_ncount);
    $vowel = (int) ($this->_vbase + ($offset % $this->_ncount) / $this->_tcount);
    $trail = $this->_tbase + $offset % $this->_tcount;
    $parts = array($lead, $vowel);
    // A trail value equal to $_tbase means "no third component"
    if ($trail != $this->_tbase) {
      $parts[] = $trail;
    }
    return $parts;
  }
  /**
   * Composes Hangul syllables
   * (see http://www.unicode.org/unicode/reports/tr15/#Hangul).
   *
   * Walks through the input and merges a leading consonant followed by a
   * vowel into an LV syllable, and an LV syllable followed by a trailing
   * consonant into an LVT syllable. Everything else is copied unchanged.
   *
   * @param array Decomposed UCS4 sequence
   * @return array UCS4 sequence with syllables composed
   * @access private
   */
  function _hangul_compose($input)
  {
    $inp_len = count($input);
    if (!$inp_len) return array();
    $result = array();
    $last = $input[0];
    $result[] = $last; // copy first char from input to output
    for ($i = 1; $i < $inp_len; ++$i) {
      $char = $input[$i];
      // Find out, whether the two current characters are L and V
      $lindex = $last - $this->_lbase;
      if (0 <= $lindex && $lindex < $this->_lcount) {
        $vindex = $char - $this->_vbase;
        if (0 <= $vindex && $vindex < $this->_vcount) {
          // create syllable of form LV
          $last = ($this->_sbase + ($lindex * $this->_vcount + $vindex) * $this->_tcount);
          $out_off = count($result) - 1;
          $result[$out_off] = $last; // reset last
          continue; // discard char
        }
      }
      // Find out, whether the two current characters are LV and T
      $sindex = $last - $this->_sbase;
      if (0 <= $sindex && $sindex < $this->_scount && ($sindex % $this->_tcount) == 0) {
        $tindex = $char - $this->_tbase;
        // Index 0 stands for "no trailing consonant" (see _hangul_decompose())
        // and $_tcount is one past the last valid index, so both ends are excluded
        if (0 < $tindex && $tindex < $this->_tcount) {
          // create syllable of form LVT
          $last += $tindex;
          $out_off = count($result) - 1;
          $result[$out_off] = $last; // reset last
          continue; // discard char
        }
      }
      // if neither case was true, just add the character
      $last = $char;
      $result[] = $char;
    }
    return $result;
  }
  /**
   * Looks up the combining class of a code point in the 'norm_combcls' table.
   *
   * @param integer Wide char to check (32bit integer)
   * @return integer Combining class if the table has an entry, else 0
   * @access private
   */
  function _get_combining_class($char)
  {
    if (!isset($this->_np_['norm_combcls'][$char])) {
      return 0;
    }
    return $this->_np_['norm_combcls'][$char];
  }
  /**
   * Applies the canonical ordering to a decomposed UCS4 sequence.
   *
   * Repeatedly scans the sequence and moves an element with a non-zero
   * combining class to the left while its left neighbour has a higher
   * combining class; the scan is repeated until a full pass made no swap.
   *
   * NOTE(review): $input[0] is read unconditionally, so an empty array is
   * presumably never passed in - confirm against the mapping tables.
   *
   * @param array Decomposed UCS4 sequence
   * @return array Ordered UCS4 sequence
   * @access private
   */
  function _apply_cannonical_ordering($input)
  {
    $swap = true;
    $size = count($input);
    while ($swap) {
      $swap = false;
      // Combining class of the element left of the one under inspection
      $last = $this->_get_combining_class($input[0]);
      for ($i = 0; $i < $size - 1; ++$i) {
        $next = $this->_get_combining_class($input[$i+1]);
        if ($next != 0 && $last > $next) {
          // Move item leftward until it fits
          for ($j = $i + 1; $j > 0; --$j) {
            if ($this->_get_combining_class($input[$j - 1]) <= $next) break;
            $t = $input[$j];
            $input[$j] = $input[$j - 1];
            $input[$j - 1] = $t;
            // Remember that something moved, so another pass is made
            $swap = 1;
          }
          // Reentering the loop looking at the old character again
          $next = $last;
        }
        $last = $next;
      }
    }
    return $input;
  }
  /**
   * Tries to compose a sequence of code points.
   *
   * Sequences of any length other than one are first handed to
   * _hangul_compose(); if that changes the number of elements, its result is
   * returned. Otherwise the 'replacemaps' table is searched for an entry
   * whose target equals the sequence, and the key of that entry is returned.
   *
   * @param array UCS4 Decomposed sequence
   * @return mixed Array from _hangul_compose(), the key of the matching
   *               'replacemaps' entry, or false if nothing matched
   * @access private
   */
  function _combine($input)
  {
    $length = count($input);
    // Is it a Hangul syllable?
    if ($length != 1) {
      $composed = $this->_hangul_compose($input);
      if (count($composed) != $length) {
        return $composed; // This place is probably wrong
      }
    }
    foreach ($this->_np_['replacemaps'] as $source => $target) {
      // Cheap rejections first: different first element or different length
      if ($target[0] != $input[0] || count($target) != $length) {
        continue;
      }
      $matches = false;
      foreach ($input as $pos => $cp) {
        $matches = ($cp == $target[$pos]);
        if (!$matches) {
          break;
        }
      }
      if ($matches) {
        return $source;
      }
    }
    return false;
  }
  /**
   * This converts an UTF-8 encoded string to its UCS-4 representation.
   * By talking about UCS-4 "strings" we mean arrays of 32bit integers representing
   * each of the "chars". This is due to PHP not being able to handle strings with
   * bit depth different from 8. This applies to the reverse method _ucs4_to_utf8(), too.
   * The following UTF-8 encodings are supported:
   * bytes bits representation
   * 1 7 0xxxxxxx
   * 2 11 110xxxxx 10xxxxxx
   * 3 16 1110xxxx 10xxxxxx 10xxxxxx
   * 4 21 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   * 5 26 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   * 6 31 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   * Each x represents a bit that can be used to store character data.
   * The five and six byte sequences are part of Annex D of ISO/IEC 10646-1:2000
   *
   * @param string UTF-8 encoded input
   * @return mixed Array of code points; false on malformed input
   * @access private
   */
  function _utf8_to_ucs4($input)
  {
    $output = array();
    $out_len = 0;
    $inp_len = strlen($input);
    $mode = 'next'; // 'next': expecting a start byte; 'add': expecting continuation bytes
    $test = 'none'; // 'range': the next continuation byte still needs the overlong check
    for ($k = 0; $k < $inp_len; ++$k) {
      $v = ord($input[$k]); // Extract byte from input string
      if ($v < 128) { // We found an ASCII char - put into string as is
        $output[$out_len] = $v;
        ++$out_len;
        if ('add' == $mode) {
          $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
          return false;
        }
        continue;
      }
      if ('next' == $mode) { // Try to find the next start byte; determine the width of the Unicode char
        $start_byte = $v;
        $mode = 'add';
        $test = 'range';
        if (($v >> 5) == 6) { // &110xxxxx 10xxxxx
          $next_byte = 0; // Tells, how many times subsequent bitmasks must rotate 6bits to the left
          $v = ($v - 192) << 6;
        } elseif (($v >> 4) == 14) { // &1110xxxx 10xxxxxx 10xxxxxx
          $next_byte = 1;
          $v = ($v - 224) << 12;
        } elseif (($v >> 3) == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
          $next_byte = 2;
          $v = ($v - 240) << 18;
        } elseif (($v >> 2) == 62) { // &111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
          $next_byte = 3;
          $v = ($v - 248) << 24;
        } elseif (($v >> 1) == 126) { // &1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
          $next_byte = 4;
          $v = ($v - 252) << 30;
        } else {
          $this->_error('This might be UTF-8, but I don\'t understand it at byte '.$k);
          return false;
        }
        // Store the bits of the start byte; continuation bytes are added below
        $output[$out_len] = (int) $v;
        ++$out_len;
        continue;
      }
      if ('add' == $mode) {
        if (!$this->_allow_overlong && $test == 'range') {
          $test = 'none';
          if (($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0) || ($v > 0x8F && $start_byte == 0xF4)) {
            $this->_error('Bogus UTF-8 character detected (out of legal range) at byte '.$k);
            return false;
          }
        }
        if (($v >> 6) == 2) { // Bit mask must be 10xxxxxx
          $v = ($v - 128) << ($next_byte * 6);
          $output[($out_len - 1)] += $v;
          --$next_byte;
        } else {
          $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
          return false;
        }
        if ($next_byte < 0) {
          $mode = 'next';
        }
      }
    } // for
    return $output;
  }
  /**
   * Convert UCS-4 array into UTF-8 string.
   * See _utf8_to_ucs4() for details
   *
   * @param array Array of code points
   * @return mixed UTF-8 string; false if a value does not fit into six bytes
   * @access private
   */
  function _ucs4_to_utf8($input)
  {
    $output = '';
    // The key is needed for the error message below
    foreach ($input as $k => $v) {
      if ($v < 128) { // 7bit are transferred literally
        $output .= chr($v);
      } elseif ($v < (1 << 11)) { // 2 bytes
        $output .= chr(192 + ($v >> 6)) . chr(128 + ($v & 63));
      } elseif ($v < (1 << 16)) { // 3 bytes
        $output .= chr(224 + ($v >> 12)) . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
      } elseif ($v < (1 << 21)) { // 4 bytes
        $output .= chr(240 + ($v >> 18)) . chr(128 + (($v >> 12) & 63))
            . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
      } elseif ($v < (1 << 26)) { // 5 bytes
        $output .= chr(248 + ($v >> 24)) . chr(128 + (($v >> 18) & 63))
            . chr(128 + (($v >> 12) & 63)) . chr(128 + (($v >> 6) & 63))
            . chr(128 + ($v & 63));
      } elseif ($v < (1 << 31)) { // 6 bytes
        $output .= chr(252 + ($v >> 30)) . chr(128 + (($v >> 24) & 63))
            . chr(128 + (($v >> 18) & 63)) . chr(128 + (($v >> 12) & 63))
            . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
      } else {
        $this->_error('Conversion from UCS-4 to UTF-8 failed: malformed input at byte '.$k);
        return false;
      }
    }
    return $output;
  }
  /**
   * Convert UCS-4 array into UCS-4 string.
   *
   * Every array value becomes four bytes, most significant byte first.
   *
   * @param array Array of code points
   * @return string Four bytes per code point
   * @access private
   */
  function _ucs4_to_ucs4_string($input)
  {
    $output = '';
    foreach ($input as $cp) {
      // Emit the four bytes from the highest to the lowest
      foreach (array(24, 16, 8, 0) as $shift) {
        $output .= chr(($cp >> $shift) & 255);
      }
    }
    return $output;
  }
  /**
   * Convert UCS-4 string into UCS-4 array.
   *
   * Every group of four input bytes (most significant byte first) becomes
   * one array value.
   *
   * @param string UCS-4 string, length must be a multiple of 4
   * @return mixed Array of code points; false if the length is not a multiple of 4
   * @access private
   */
  function _ucs4_string_to_ucs4($input)
  {
    $output = array();
    $inp_len = strlen($input);
    // Input length must be dividable by 4
    if ($inp_len % 4) {
      $this->_error('Input UCS4 string is broken');
      return false;
    }
    // Empty input - return empty output
    if (!$inp_len) return $output;
    for ($i = 0, $out_len = -1; $i < $inp_len; ++$i) {
      // Increment output position every 4 input bytes
      if (!($i % 4)) {
        $out_len++;
        $output[$out_len] = 0;
      }
      $output[$out_len] += ord($input[$i]) << (8 * (3 - ($i % 4)));
    }
    return $output;
  }
  906. }
  /**
   * Adapter class for aligning the API of idna_convert with that of
   * Net_IDNA
   * @author Matthias Sommerfeld <mso@phlylabs.de>
   */
  class Net_IDNA_php4 extends idna_convert
  {
    /**
     * Sets a new option value by forwarding to the inherited set_parameter().
     * Available options and values:
     * [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8,
     * 'ucs4_string' and 'ucs4_array' respectively for UCS4)]
     * [overlong - Unicode does not allow unnecessarily long encodings of chars,
     * to allow this, set this parameter to true, else to false;
     * default is false.]
     * [strict - true: strict mode, good for registration purposes - Causes errors
     * on failures; false: loose mode, ideal for "wildlife" applications
     * by silently ignoring errors and returning the original input instead]
     *
     * @param mixed Parameter to set (string: single parameter; array of Parameter => Value pairs)
     * @param string Value to use (if parameter 1 is a string)
     * @return boolean true on success, false otherwise
     * @access public
     */
    function setParams($option, $param = false)
    {
      // This class extends idna_convert, so the setter is reached through $this
      return $this->set_parameter($option, $param);
    }
  }
  935. ?>