PageRenderTime 51ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/libraries/simplepie/idn/idna_convert.class.php

https://bitbucket.org/eternaware/joomus
PHP | 969 lines | 646 code | 45 blank | 278 comment | 162 complexity | 138eb836916be69b77c9bf2afd7f68d9 MD5 | raw file
Possible License(s): LGPL-2.1
  1. <?php
  2. // {{{ license
  3. /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
  4. //
  5. // +----------------------------------------------------------------------+
  6. // | This library is free software; you can redistribute it and/or modify |
  7. // | it under the terms of the GNU Lesser General Public License as |
  8. // | published by the Free Software Foundation; either version 2.1 of the |
  9. // | License, or (at your option) any later version. |
  10. // | |
  11. // | This library is distributed in the hope that it will be useful, but |
  12. // | WITHOUT ANY WARRANTY; without even the implied warranty of |
  13. // | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
  14. // | Lesser General Public License for more details. |
  15. // | |
  16. // | You should have received a copy of the GNU Lesser General Public |
  17. // | License along with this library; if not, write to the Free Software |
  18. // | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 |
  19. // | USA. |
  20. // +----------------------------------------------------------------------+
  21. //
  22. // }}}
  23. /**
  24. * Encode/decode Internationalized Domain Names.
  25. *
  26. * The class allows to convert internationalized domain names
  27. * (see RFC 3490 for details) as they can be used with various registries worldwide
  28. * to be translated between their original (localized) form and their encoded form
  29. * as it will be used in the DNS (Domain Name System).
  30. *
  31. * The class provides two public methods, encode() and decode(), which do exactly
  32. * what you would expect them to do. You are allowed to use complete domain names,
  33. * simple strings and complete email addresses as well. That means, that you might
  34. * use any of the following notations:
  35. *
  36. * - www.nörgler.com
  37. * - xn--nrgler-wxa
  38. * - xn--brse-5qa.xn--knrz-1ra.info
  39. *
  40. * Unicode input might be given as either UTF-8 string, UCS-4 string or UCS-4
  41. * array. Unicode output is available in the same formats.
  42. * You can select your preferred format via {@link set_parameter()}.
  43. *
  44. * ACE input and output is always expected to be ASCII.
  45. *
  46. * @author Matthias Sommerfeld <mso@phlylabs.de>
  47. * @copyright 2004-2007 phlyLabs Berlin, http://phlylabs.de
  48. * @version 0.5.1
  49. *
  50. */
  51. class idna_convert
  52. {
  53. /**
  54. * Holds all relevant mapping tables, loaded from a separate file on construct
  55. * See RFC3454 for details
  56. *
  57. * @var array
  58. * @access private
  59. */
  60. var $NP = array();
  61. // Internal settings, do not mess with them
  62. var $_punycode_prefix = 'xn--';
  63. var $_invalid_ucs = 0x80000000;
  64. var $_max_ucs = 0x10FFFF;
  65. var $_base = 36;
  66. var $_tmin = 1;
  67. var $_tmax = 26;
  68. var $_skew = 38;
  69. var $_damp = 700;
  70. var $_initial_bias = 72;
  71. var $_initial_n = 0x80;
  72. var $_sbase = 0xAC00;
  73. var $_lbase = 0x1100;
  74. var $_vbase = 0x1161;
  75. var $_tbase = 0x11A7;
  76. var $_lcount = 19;
  77. var $_vcount = 21;
  78. var $_tcount = 28;
  79. var $_ncount = 588; // _vcount * _tcount
  80. var $_scount = 11172; // _lcount * _tcount * _vcount
  81. var $_error = false;
  82. // See {@link set_parameter()} for details of how to change the following
  83. // settings from within your script / application
  84. var $_api_encoding = 'utf8'; // Default input charset is UTF-8
  85. var $_allow_overlong = false; // Overlong UTF-8 encodings are forbidden
  86. var $_strict_mode = false; // Behave strict or not
  /**
   * Constructor: loads the Nameprep tables from npdata.ser (located next to
   * this file) and applies any initial options.
   *
   * If the data file cannot be read or does not unserialize to an array, the
   * problem is recorded via _error() and $this->NP keeps its declared default.
   *
   * NOTE(review): $this->slast is assigned here but not declared among the
   * class properties visible in this file; it is kept because external code
   * may read it.
   *
   * @param mixed Array of option => value pairs passed on to set_parameter();
   *              any non-array value is ignored
   * @return boolean Result of set_parameter() if options were given, else true
   * @access public
   */
  function __construct($options = false)
  {
      $this->slast = $this->_sbase + $this->_lcount * $this->_vcount * $this->_tcount;
      $data_file = dirname(__FILE__).'/npdata.ser';
      if (function_exists('file_get_contents')) {
          $raw = file_get_contents($data_file);
      } else {
          $lines = file($data_file);
          $raw = is_array($lines) ? join('', $lines) : false;
      }
      // Only accept a successfully unserialized table set; anything else would
      // turn $this->NP into a non-array and break every later lookup
      $tables = ($raw === false) ? false : unserialize($raw);
      if (is_array($tables)) {
          $this->NP = $tables;
      } else {
          $this->_error('Could not load the Nameprep data file '.$data_file);
      }
      // If parameters are given, pass these to the respective method
      if (is_array($options)) {
          return $this->set_parameter($options);
      }
      return true;
  }
  /**
   * PHP 4 style constructor, kept for backward compatibility.
   *
   * PHP 8 no longer treats a method named like its class as the constructor,
   * so the real work lives in __construct(); this alias only delegates.
   *
   * @param mixed See __construct()
   * @return boolean See __construct()
   * @access public
   */
  function idna_convert($options = false)
  {
      return $this->__construct($options);
  }
  /**
   * Change one or more settings of the converter.
   *
   * Recognised options:
   *  - encoding: 'utf8', 'ucs4_string' or 'ucs4_array'; stored as the API
   *              encoding used by encode() and decode()
   *  - overlong: truthy to accept overlong UTF-8 sequences, falsy to reject them
   *  - strict:   truthy for strict mode, falsy for loose mode
   *
   * Processing stops at the first unknown option or encoding value; options
   * handled before that point stay applied.
   *
   * @param mixed Option name (string) or array of option => value pairs
   * @param mixed Value to use when the first argument is a single option name
   * @return boolean true on success, false on an unknown option or value
   * @access public
   */
  function set_parameter($option, $value = false)
  {
      $settings = is_array($option) ? $option : array($option => $value);
      foreach ($settings as $name => $setting) {
          if ($name == 'encoding') {
              if (!in_array($setting, array('utf8', 'ucs4_string', 'ucs4_array'))) {
                  $this->_error('Set Parameter: Unknown parameter '.$setting.' for option '.$name);
                  return false;
              }
              $this->_api_encoding = $setting;
          } elseif ($name == 'overlong') {
              $this->_allow_overlong = ($setting) ? true : false;
          } elseif ($name == 'strict') {
              $this->_strict_mode = ($setting) ? true : false;
          } else {
              $this->_error('Set Parameter: Unknown option '.$name);
              return false;
          }
      }
      return true;
  }
  /**
   * Decode a given ACE domain name, email address or URL.
   *
   * Input containing an '@' (not at position 0) is treated as an email
   * address, input containing ':', '.' or '/' as a full domain name or URL,
   * anything else as a single label. Labels that cannot be decoded are passed
   * through unchanged. In strict mode only single labels are accepted.
   *
   * @param string Domain name (ACE string)
   * @param string Optional one-time output encoding ('utf8', 'ucs4_string'
   *               or 'ucs4_array'); defaults to the configured API encoding
   * @return mixed Decoded name as UTF-8 string, UCS-4 string or UCS-4 array;
   *               false on an unknown encoding or a strict mode violation
   * @access public
   */
  function decode($input, $one_time_encoding = false)
  {
      // Optionally validate the one-time encoding
      if ($one_time_encoding) {
          switch ($one_time_encoding) {
              case 'utf8':
              case 'ucs4_string':
              case 'ucs4_array':
                  break;
              default:
                  $this->_error('Unknown encoding '.$one_time_encoding);
                  return false;
          }
      }
      // Make sure to drop any newline characters around
      $input = trim($input);
      if (strpos($input, '@')) { // Maybe it is an email address
          if ($this->_strict_mode) {
              $this->_error('Only simple domain name parts can be handled in strict mode');
              return false;
          }
          list ($email_pref, $input) = explode('@', $input, 2);
          // Only labels carrying the Punycode prefix are handed to _decode(),
          // both in the domain part and in the local part
          $arr = explode('.', $input);
          foreach ($arr as $k => $v) {
              if (preg_match('!^'.preg_quote($this->_punycode_prefix, '!').'!', $v)) {
                  $conv = $this->_decode($v);
                  if ($conv) $arr[$k] = $conv;
              }
          }
          $input = join('.', $arr);
          $arr = explode('.', $email_pref);
          foreach ($arr as $k => $v) {
              if (preg_match('!^'.preg_quote($this->_punycode_prefix, '!').'!', $v)) {
                  $conv = $this->_decode($v);
                  if ($conv) $arr[$k] = $conv;
              }
          }
          $email_pref = join('.', $arr);
          $return = $email_pref . '@' . $input;
      } elseif (preg_match('![:\./]!', $input)) { // Or a complete domain name (with or without paths / parameters)
          if ($this->_strict_mode) {
              $this->_error('Only simple domain name parts can be handled in strict mode');
              return false;
          }
          $parsed = parse_url($input);
          if (isset($parsed['host'])) {
              $arr = explode('.', $parsed['host']);
              foreach ($arr as $k => $v) {
                  $conv = $this->_decode($v);
                  if ($conv) $arr[$k] = $conv;
              }
              $parsed['host'] = join('.', $arr);
              // Collect the optional URL components. A component counts as
              // present when it is set and not the empty string, so that
              // values such as a query or fragment of "0" are preserved
              // (empty() would silently drop them).
              $part = array();
              foreach (array('scheme', 'user', 'pass', 'port', 'path', 'query', 'fragment') as $name) {
                  $part[$name] = (isset($parsed[$name]) && $parsed[$name] !== '') ? $parsed[$name] : false;
              }
              $return = '';
              if ($part['scheme'] !== false) {
                  $return .= $part['scheme'].(strtolower($part['scheme']) == 'mailto' ? ':' : '://');
              }
              if ($part['user'] !== false) {
                  $return .= $part['user'].($part['pass'] !== false ? ':'.$part['pass'] : '').'@';
              }
              $return .= $parsed['host'];
              if ($part['port'] !== false) $return .= ':'.$part['port'];
              if ($part['path'] !== false) $return .= $part['path'];
              if ($part['query'] !== false) $return .= '?'.$part['query'];
              if ($part['fragment'] !== false) $return .= '#'.$part['fragment'];
          } else { // parse_url yielded no host, try without it
              $arr = explode('.', $input);
              foreach ($arr as $k => $v) {
                  $conv = $this->_decode($v);
                  $arr[$k] = ($conv) ? $conv : $v;
              }
              $return = join('.', $arr);
          }
      } else { // Otherwise we consider it being a pure domain name string
          $return = $this->_decode($input);
          if (!$return) $return = $input;
      }
      // $return is UTF-8 at this point; convert if another format was requested.
      // If one time encoding is given, use this, else the object's property
      switch (($one_time_encoding) ? $one_time_encoding : $this->_api_encoding) {
          case 'utf8':
              return $return;
          case 'ucs4_string':
              return $this->_ucs4_to_ucs4_string($this->_utf8_to_ucs4($return));
          case 'ucs4_array':
              return $this->_utf8_to_ucs4($return);
          default:
              $this->_error('Unsupported output format');
              return false;
      }
  }
  /**
   * Encode a domain name, email address or URL into its ACE form.
   *
   * The input is first converted to a UCS-4 array, then split at the
   * separator characters '.', '/', ':', '?' and '@'. The ideographic and
   * fullwidth dots U+3002, U+FF0E and U+FF61 are treated as '.'. Each segment
   * is passed to _encode(); segments it rejects are emitted as plain UTF-8.
   * A separator found at position 0 is skipped and not copied to the output.
   * In strict mode any separator makes the call fail.
   *
   * @param mixed Domain name as UTF-8 string, UCS-4 string or UCS-4 array
   * @param string Optional one-time input encoding; defaults to the
   *               configured API encoding
   * @return mixed Encoded name (ACE string), '' for empty input, false on an
   *               unsupported encoding or a strict mode violation
   * @access public
   */
  function encode($decoded, $one_time_encoding = false)
  {
      // Force the input into a UCS-4 array
      $format = $one_time_encoding ? $one_time_encoding : $this->_api_encoding;
      if ($format == 'utf8') {
          $decoded = $this->_utf8_to_ucs4($decoded);
      } elseif ($format == 'ucs4_string') {
          $decoded = $this->_ucs4_string_to_ucs4($decoded);
      } elseif ($format == 'ucs4_array') {
          // Already in the required shape
      } else {
          $this->_error('Unsupported input format: '.($one_time_encoding ? $one_time_encoding : $this->_api_encoding));
          return false;
      }
      // No input, no output
      if (empty($decoded)) return '';
      $last_begin = 0; // Index at which the current segment starts
      $output = '';
      $dots = array(0x3002, 0xFF0E, 0xFF61);
      $separators = array(0x2E, 0x2F, 0x3A, 0x3F, 0x40);
      foreach ($decoded as $pos => $code) {
          if (in_array($code, $dots)) {
              // Normalise to the plain dot, then handle it as a separator
              $decoded[$pos] = 0x2E;
          } elseif (!in_array($code, $separators)) {
              continue;
          }
          // Neither email addresses nor URLs allowed in strict mode
          if ($this->_strict_mode) {
              $this->_error('Neither email addresses nor URLs are allowed in strict mode.');
              return false;
          }
          // A separator at the very start has no segment in front of it
          if ($pos) {
              $segment = array_slice($decoded, $last_begin, (($pos)-$last_begin));
              $ace = $this->_encode($segment);
              $output .= ($ace) ? $ace : $this->_ucs4_to_utf8($segment);
              $output .= chr($decoded[$pos]);
          }
          $last_begin = $pos + 1;
      }
      if (!$last_begin) {
          // No separator seen: the whole input is one label
          $ace = $this->_encode($decoded);
          return ($ace) ? $ace : $this->_ucs4_to_utf8($decoded);
      }
      // Catch the rest of the string after the last separator
      $tail = array_slice($decoded, $last_begin, ((sizeof($decoded))-$last_begin));
      $ace = $this->_encode($tail);
      $output .= ($ace) ? $ace : $this->_ucs4_to_utf8($tail);
      return $output;
  }
  /**
   * Report the most recent error recorded by this object.
   *
   * @return mixed The last error message, or false if none has been recorded
   * @access public
   */
  function get_last_error()
  {
      $last_error = $this->_error;
      return $last_error;
  }
  /**
   * The actual decoding algorithm: turns one Punycode label (including the
   * prefix) into its UTF-8 form.
   *
   * Fails, recording an error, when the prefix is missing, the label is empty
   * after the prefix, the payload ends in the middle of a variable-length
   * integer, or the payload contains a character that is not a Punycode digit.
   *
   * @param string Single ACE label, e.g. 'xn--nrgler-wxa'
   * @return mixed Decoded label as UTF-8 string, false on failure
   * @access private
   */
  function _decode($encoded)
  {
      $prefix_len = strlen($this->_punycode_prefix);
      // We do need to find the Punycode prefix
      if (strncmp($encoded, $this->_punycode_prefix, $prefix_len) !== 0) {
          $this->_error('This is not a punycode string');
          return false;
      }
      // If nothing left after removing the prefix, it is hopeless
      if ((string) substr($encoded, $prefix_len) === '') {
          $this->_error('The given encoded string was empty');
          return false;
      }
      // Everything between the prefix and the last delimiter is literal
      // (basic) code points
      $decoded = array();
      $delim_pos = strrpos($encoded, '-');
      if ($delim_pos > $prefix_len) {
          for ($k = $prefix_len; $k < $delim_pos; ++$k) {
              $decoded[] = ord($encoded[$k]);
          }
      }
      $deco_len = count($decoded);
      $enco_len = strlen($encoded);
      // Wandering through the strings; init
      $is_first = true;
      $bias = $this->_initial_bias;
      $idx = 0;
      $char = $this->_initial_n;
      for ($enco_idx = ($delim_pos) ? ($delim_pos + 1) : 0; $enco_idx < $enco_len; ++$deco_len) {
          // Decode one generalized variable-length integer into $idx
          for ($old_idx = $idx, $w = 1, $k = $this->_base; 1 ; $k += $this->_base) {
              // The integer must terminate before the input does
              if ($enco_idx >= $enco_len) {
                  $this->_error('The given encoded string is truncated');
                  return false;
              }
              $digit = $this->_decode_digit($encoded[$enco_idx++]);
              if ($digit < 0 || $digit >= $this->_base) {
                  $this->_error('The given encoded string contains an invalid character');
                  return false;
              }
              $idx += $digit * $w;
              $t = ($k <= $bias) ? $this->_tmin :
                      (($k >= $bias + $this->_tmax) ? $this->_tmax : ($k - $bias));
              if ($digit < $t) break;
              $w = (int) ($w * ($this->_base - $t));
          }
          $bias = $this->_adapt($idx - $old_idx, $deco_len + 1, $is_first);
          $is_first = false;
          // $idx encodes both the code point increment and the insert position
          $char += (int) ($idx / ($deco_len + 1));
          $idx %= ($deco_len + 1);
          // Insert the decoded char at position $idx, shifting the rest right
          array_splice($decoded, $idx, 0, array($char));
          $idx++;
      }
      return $this->_ucs4_to_utf8($decoded);
  }
  /**
   * The actual encoding algorithm: Nameprep followed by Punycode for a single
   * label given as UCS-4 array.
   *
   * @param array UCS-4 code points of one label
   * @return mixed ACE string; false (with an error recorded) if the label
   *               already starts with the Punycode prefix, contains nothing
   *               that needs encoding, or fails Nameprep
   * @access private
   */
  function _encode($decoded)
  {
      // We cannot encode a domain name containing the Punycode prefix
      $extract = strlen($this->_punycode_prefix);
      $check_pref = $this->_utf8_to_ucs4($this->_punycode_prefix);
      $check_deco = array_slice($decoded, 0, $extract);
      if ($check_pref == $check_deco) {
          $this->_error('This is already a punycode string');
          return false;
      }
      // We will not try to encode strings consisting of basic code points only
      $encodable = false;
      foreach ($decoded as $v) {
          if ($v > 0x7a) {
              $encodable = true;
              break;
          }
      }
      if (!$encodable) {
          $this->_error('The given string does not contain encodable chars');
          return false;
      }
      // Do NAMEPREP
      $decoded = $this->_nameprep($decoded);
      if (!$decoded || !is_array($decoded)) return false; // NAMEPREP failed
      $deco_len = count($decoded);
      if (!$deco_len) return false; // Empty array
      $codecount = 0; // How many chars have been consumed
      $encoded = '';
      // Copy all basic code points to output. Every code point below the
      // initial n (0x80) counts as basic: the main loop below only ever visits
      // code points >= 0x80, so an ASCII value skipped here would never be
      // consumed and the loop could not terminate.
      for ($i = 0; $i < $deco_len; ++$i) {
          if ($decoded[$i] < $this->_initial_n) {
              $encoded .= chr($decoded[$i]);
              $codecount++;
          }
      }
      if ($codecount == $deco_len) return $encoded; // All codepoints were basic ones
      // Start with the prefix; copy it to output
      $encoded = $this->_punycode_prefix.$encoded;
      // If we have basic code points in output, add an hyphen to the end
      if ($codecount) $encoded .= '-';
      // Now find and encode all non-basic code points
      $is_first = true;
      $cur_code = $this->_initial_n;
      $bias = $this->_initial_bias;
      $delta = 0;
      while ($codecount < $deco_len) {
          // Find the smallest code point >= the current code point
          $next_code = $this->_max_ucs;
          for ($i = 0; $i < $deco_len; $i++) {
              if ($decoded[$i] >= $cur_code && $decoded[$i] <= $next_code) {
                  $next_code = $decoded[$i];
              }
          }
          $delta += ($next_code - $cur_code) * ($codecount + 1);
          $cur_code = $next_code;
          // Scan input again and encode all characters whose code point is $cur_code
          for ($i = 0; $i < $deco_len; $i++) {
              if ($decoded[$i] < $cur_code) {
                  $delta++;
              } elseif ($decoded[$i] == $cur_code) {
                  // Emit $delta as a generalized variable-length integer
                  for ($q = $delta, $k = $this->_base; 1; $k += $this->_base) {
                      $t = ($k <= $bias) ? $this->_tmin :
                              (($k >= $bias + $this->_tmax) ? $this->_tmax : $k - $bias);
                      if ($q < $t) break;
                      $encoded .= $this->_encode_digit(intval($t + (($q - $t) % ($this->_base - $t))));
                      $q = (int) (($q - $t) / ($this->_base - $t));
                  }
                  $encoded .= $this->_encode_digit($q);
                  $bias = $this->_adapt($delta, $codecount+1, $is_first);
                  $codecount++;
                  $delta = 0;
                  $is_first = false;
              }
          }
          $delta++;
          $cur_code++;
      }
      return $encoded;
  }
  /**
   * Compute the next bias value from the delta just processed.
   *
   * @param integer Delta of the code point just encoded or decoded
   * @param integer Number of code points handled so far, including this one
   * @param boolean true for the very first delta (damped by _damp rather
   *                than halved)
   * @return integer The adapted bias
   * @access private
   */
  function _adapt($delta, $npoints, $is_first)
  {
      $span = $this->_base - $this->_tmin;
      $delta = intval($is_first ? ($delta / $this->_damp) : ($delta / 2));
      $delta += intval($delta / $npoints);
      $steps = 0;
      while ($delta > ($span * $this->_tmax) / 2) {
          $delta = intval($delta / $span);
          $steps += $this->_base;
      }
      return intval($steps + ($span + 1) * $delta / ($delta + $this->_skew));
  }
  /**
   * Map a Punycode digit value to its character: values below 26 become
   * 'a'..'z', values from 26 upward continue from '0'.
   *
   * @param integer Digit value
   * @return string The single character representing the digit
   * @access private
   */
  function _encode_digit($d)
  {
      // 97 is ord('a'); 22 shifts the value 26 onto ord('0') == 48
      $offset = ($d < 26) ? 97 : 22;
      return chr($d + $offset);
  }
  /**
   * Map a Punycode character to its digit value.
   *
   * '0'..'9' yield 26..35, letters (either case) yield 0..25. Every other
   * character yields $this->_base, which is outside the valid digit range.
   * Each range is checked on both ends; with one-sided comparisons characters
   * such as '-' or ':' would come out as bogus or negative digits.
   *
   * @param string Single character
   * @return integer Digit value, or $this->_base for an invalid character
   * @access private
   */
  function _decode_digit($cp)
  {
      $cp = ord($cp);
      if ($cp >= 48 && $cp <= 57) return $cp - 22;  // '0'..'9'
      if ($cp >= 65 && $cp <= 90) return $cp - 65;  // 'A'..'Z'
      if ($cp >= 97 && $cp <= 122) return $cp - 97; // 'a'..'z'
      return $this->_base;
  }
  /**
   * Record an error message so that get_last_error() can report it.
   * Any previously stored message is overwritten.
   *
   * @param string Error message; defaults to the empty string
   * @access private
   */
  function _error($error = '')
  {
      $message = $error;
      $this->_error = $message;
  }
  /**
   * Do Nameprep according to RFC3491 and RFC3454.
   *
   * Steps: drop "map to nothing" code points, reject prohibited ones, apply
   * the replacement maps (with Hangul decomposition), recompose Hangul, then
   * try to combine starter / non-starter sequences.
   *
   * @param array Unicode characters as UCS-4 code points
   * @return mixed Nameprep'd UCS-4 array, or false (with an error recorded)
   *               when prohibited input is found
   * @access private
   */
  function _nameprep($input)
  {
      $output = array();
      //
      // Mapping
      // Walking through the input array, performing the required steps on each of
      // the input chars and putting the result into the output array
      // While mapping required chars we apply the canonical ordering
      foreach ($input as $v) {
          // Map to nothing == skip that code point
          if (in_array($v, $this->NP['map_nothing'])) continue;
          // Try to find prohibited input
          if (in_array($v, $this->NP['prohibit']) || in_array($v, $this->NP['general_prohibited'])) {
              $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
              return false;
          }
          foreach ($this->NP['prohibit_ranges'] as $range) {
              if ($range[0] <= $v && $v <= $range[1]) {
                  $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
                  return false;
              }
          }
          //
          // Hangul syllable decomposition
          if (0xAC00 <= $v && $v <= 0xD7AF) {
              foreach ($this->_hangul_decompose($v) as $out) {
                  $output[] = (int) $out;
              }
          // There's a decomposition mapping for that code point
          } elseif (isset($this->NP['replacemaps'][$v])) {
              foreach ($this->_apply_cannonical_ordering($this->NP['replacemaps'][$v]) as $out) {
                  $output[] = (int) $out;
              }
          } else {
              $output[] = (int) $v;
          }
      }
      // Before applying any Combining, try to rearrange any Hangul syllables
      $output = $this->_hangul_compose($output);
      //
      // Combine code points
      //
      $last_class = 0;
      $last_starter = 0;
      $out_len = count($output);
      for ($i = 0; $i < $out_len; ++$i) {
          $class = $this->_get_combining_class($output[$i]);
          if ((!$last_class || $last_class > $class) && $class) {
              // Try to match
              $seq_len = $i - $last_starter;
              $out = $this->_combine(array_slice($output, $last_starter, $seq_len));
              // On match: Replace the last starter with the composed character and remove
              // the now redundant non-starter(s)
              if ($out) {
                  $output[$last_starter] = $out;
                  // $out is a single code point (a key of the replacement map).
                  // The original called count() on it, which yielded 1 on old
                  // PHP versions and throws a TypeError on PHP 8; compare
                  // against that length of 1 directly.
                  if (1 != $seq_len) {
                      for ($j = $i+1; $j < $out_len; ++$j) {
                          $output[$j-1] = $output[$j];
                      }
                      // NOTE(review): index $out_len is one past the last
                      // element of a densely indexed array, so this looks
                      // like it should target $out_len - 1; left unchanged
                      // pending confirmation
                      unset($output[$out_len]);
                  }
                  // Rewind the for loop by one, since there can be more possible compositions
                  $i--;
                  $out_len--;
                  $last_class = ($i == $last_starter) ? 0 : $this->_get_combining_class($output[$i-1]);
                  continue;
              }
          }
          // The current class is 0
          if (!$class) $last_starter = $i;
          $last_class = $class;
      }
      return $output;
  }
  /**
   * Decomposes a Hangul syllable into its conjoining jamo
   * (see http://www.unicode.org/unicode/reports/tr15/#Hangul).
   *
   * @param integer 32bit UCS4 code point
   * @return array Leading consonant, vowel and (if present) trailing
   *               consonant as integers; the original value as a one value
   *               array if it is not a precomposed Hangul syllable
   * @access private
   */
  function _hangul_decompose($char)
  {
      $sindex = (int) $char - $this->_sbase;
      if ($sindex < 0 || $sindex >= $this->_scount) {
          return array($char);
      }
      $result = array();
      // Truncate the quotients themselves: a bare (int) cast in front of the
      // sum binds to the first operand only and leaves a fractional float
      $result[] = $this->_lbase + (int) ($sindex / $this->_ncount);
      $result[] = $this->_vbase + (int) (($sindex % $this->_ncount) / $this->_tcount);
      $T = intval($this->_tbase + $sindex % $this->_tcount);
      // _tbase itself stands for "no trailing consonant"
      if ($T != $this->_tbase) $result[] = $T;
      return $result;
  }
  /**
   * Composes Hangul syllables from conjoining jamo
   * (see http://www.unicode.org/unicode/reports/tr15/#Hangul).
   *
   * Walks the sequence once, merging a leading consonant + vowel into an LV
   * syllable and an LV syllable + trailing consonant into an LVT syllable.
   *
   * @param array Decomposed UCS4 sequence
   * @return array UCS4 sequence with syllables composed
   * @access private
   */
  function _hangul_compose($input)
  {
      $inp_len = count($input);
      if (!$inp_len) return array();
      $result = array();
      $last = (int) $input[0];
      $result[] = $last; // copy first char from input to output
      for ($i = 1; $i < $inp_len; ++$i) {
          $char = (int) $input[$i];
          $sindex = $last - $this->_sbase;
          $lindex = $last - $this->_lbase;
          $vindex = $char - $this->_vbase;
          $tindex = $char - $this->_tbase;
          // Find out, whether two current characters are LV and T.
          // Valid trailing consonants are _tbase + 1 .. _tbase + _tcount - 1:
          // index 0 is _tbase itself (no trailing consonant) and index _tcount
          // lies beyond the block, so both bounds are exclusive.
          if (0 <= $sindex && $sindex < $this->_scount && ($sindex % $this->_tcount == 0)
                  && 0 < $tindex && $tindex < $this->_tcount) {
              // create syllable of form LVT
              $last += $tindex;
              $result[(count($result) - 1)] = $last; // reset last
              continue; // discard char
          }
          // Find out, whether two current characters form L and V
          if (0 <= $lindex && $lindex < $this->_lcount && 0 <= $vindex && $vindex < $this->_vcount) {
              // create syllable of form LV
              $last = (int) $this->_sbase + ($lindex * $this->_vcount + $vindex) * $this->_tcount;
              $result[(count($result) - 1)] = $last; // reset last
              continue; // discard char
          }
          // if neither case was true, just add the character
          $last = $char;
          $result[] = $char;
      }
      return $result;
  }
  /**
   * Look up the combining class of a code point in the loaded
   * 'norm_combcls' table.
   *
   * @param integer Wide char to check (32bit integer)
   * @return integer Combining class if the table has an entry, else 0
   * @access private
   */
  function _get_combining_class($char)
  {
      if (isset($this->NP['norm_combcls'][$char])) {
          return $this->NP['norm_combcls'][$char];
      }
      return 0;
  }
  /**
   * Applies the canonical ordering to a decomposed UCS4 sequence.
   *
   * Repeats passes over the sequence until a pass makes no swap. Within a
   * pass, an element with a non-zero combining class that is lower than its
   * predecessor's is moved leftward until the element before it has a class
   * that is not greater.
   *
   * @param array Decomposed UCS4 sequence
   * @return array Ordered UCS4 sequence
   * @access private
   */
  function _apply_cannonical_ordering($input)
  {
      $size = count($input);
      do {
          $swapped = false;
          $prev_class = $this->_get_combining_class(intval($input[0]));
          for ($i = 0; $i < $size-1; ++$i) {
              $cur_class = $this->_get_combining_class(intval($input[$i+1]));
              if ($cur_class != 0 && $prev_class > $cur_class) {
                  // Move item leftward until it fits
                  for ($j = $i + 1; $j > 0; --$j) {
                      if ($this->_get_combining_class(intval($input[$j-1])) <= $cur_class) break;
                      $moved = intval($input[$j]);
                      $input[$j] = intval($input[$j-1]);
                      $input[$j-1] = $moved;
                      $swapped = true;
                  }
                  // The element that was in front now occupies this slot, so
                  // its class is the one to carry into the next comparison
                  $cur_class = $prev_class;
              }
              $prev_class = $cur_class;
          }
      } while ($swapped);
      return $input;
  }
  /**
   * Try to compose a sequence of starter and non-starter(s) by searching the
   * replacement maps for an entry whose target equals the given sequence.
   *
   * @param array UCS4 decomposed sequence
   * @return mixed Key of the first replacement map entry whose target matches
   *               the sequence element by element, false if there is none
   * @access private
   */
  function _combine($input)
  {
      $length = count($input);
      foreach ($this->NP['replacemaps'] as $composed => $sequence) {
          // Cheap rejections first: same first element, same length
          if ($sequence[0] == $input[0] && count($sequence) == $length) {
              // An empty sequence never counts as a match
              $matches = ($length > 0);
              foreach ($input as $pos => $code) {
                  if ($code != $sequence[$pos]) {
                      $matches = false;
                      break;
                  }
              }
              if ($matches) return $composed;
          }
      }
      return false;
  }
  /**
   * This converts an UTF-8 encoded string to its UCS-4 representation.
   * By talking about UCS-4 "strings" we mean arrays of 32bit integers representing
   * each of the "chars". This is due to PHP not being able to handle strings with
   * bit depth different from 8. This applies to the reverse method _ucs4_to_utf8(), too.
   * The following UTF-8 encodings are supported:
   * bytes bits representation
   * 1 7 0xxxxxxx
   * 2 11 110xxxxx 10xxxxxx
   * 3 16 1110xxxx 10xxxxxx 10xxxxxx
   * 4 21 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   * 5 26 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   * 6 31 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   * Each x represents a bit that can be used to store character data.
   * The five and six byte sequences are part of Annex D of ISO/IEC 10646-1:2000
   *
   * @param string UTF-8 encoded input
   * @return mixed Array of code points, or false (with an error recorded) on
   *               malformed input, including a multi-byte sequence that is
   *               cut off at the end of the input
   * @access private
   */
  function _utf8_to_ucs4($input)
  {
      $output = array();
      $out_len = 0;
      $inp_len = strlen($input);
      $mode = 'next'; // 'next': expecting a start byte; 'add': expecting a continuation byte
      $test = 'none'; // 'range': the next continuation byte still needs the range check
      for ($k = 0; $k < $inp_len; ++$k) {
          $v = ord($input[$k]); // Extract byte from input string
          if ($v < 128) { // We found an ASCII char - put into string as is
              if ('add' == $mode) {
                  $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                  return false;
              }
              $output[$out_len] = $v;
              ++$out_len;
              continue;
          }
          if ('next' == $mode) { // Start byte: determine the width of the Unicode char
              $start_byte = $v;
              $mode = 'add';
              $test = 'range';
              // $next_byte tells how many times subsequent bitmasks must
              // rotate 6 bits to the left
              if ($v >> 5 == 6) { // &110xxxxx 10xxxxxx
                  $next_byte = 0;
                  $v = ($v - 192) << 6;
              } elseif ($v >> 4 == 14) { // &1110xxxx 10xxxxxx 10xxxxxx
                  $next_byte = 1;
                  $v = ($v - 224) << 12;
              } elseif ($v >> 3 == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                  $next_byte = 2;
                  $v = ($v - 240) << 18;
              } elseif ($v >> 2 == 62) { // &111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                  $next_byte = 3;
                  $v = ($v - 248) << 24;
              } elseif ($v >> 1 == 126) { // &1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                  $next_byte = 4;
                  $v = ($v - 252) << 30;
              } else {
                  $this->_error('This might be UTF-8, but I don\'t understand it at byte '.$k);
                  return false;
              }
              $output[$out_len] = (int) $v;
              ++$out_len;
              continue;
          }
          // Continuation byte of the char started at $output[$out_len - 1]
          if (!$this->_allow_overlong && $test == 'range') {
              $test = 'none';
              if (($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0) || ($v > 0x8F && $start_byte == 0xF4)) {
                  $this->_error('Bogus UTF-8 character detected (out of legal range) at byte '.$k);
                  return false;
              }
          }
          if ($v >> 6 == 2) { // Bit mask must be 10xxxxxx
              $v = ($v - 128) << ($next_byte * 6);
              $output[($out_len - 1)] += $v;
              --$next_byte;
          } else {
              $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
              return false;
          }
          if ($next_byte < 0) {
              $mode = 'next';
          }
      }
      // Input ended while continuation bytes were still outstanding
      if ('add' == $mode) {
          $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$inp_len);
          return false;
      }
      return $output;
  }
  /**
   * Convert an UCS-4 array into an UTF-8 string.
   * See _utf8_to_ucs4() for the byte layouts; sequences of up to six bytes
   * are produced.
   *
   * @param array Code points
   * @return mixed UTF-8 string, or false (with an error recorded) if a value
   *               does not fit into six bytes
   * @access private
   */
  function _ucs4_to_utf8($input)
  {
      // Per layout: number of payload bits, start byte marker, number of
      // continuation bytes
      $layouts = array(
          array(11, 192, 1),
          array(16, 224, 2),
          array(21, 240, 3),
          array(26, 248, 4),
          array(31, 252, 5),
      );
      $output = '';
      $position = 0;
      foreach ($input as $code) {
          ++$position;
          if ($code < 128) { // 7bit are transferred literally
              $output .= chr($code);
              continue;
          }
          $bytes = false;
          foreach ($layouts as $layout) {
              list($bits, $marker, $trail) = $layout;
              if ($code < (1 << $bits)) {
                  // Start byte, then six payload bits per continuation byte
                  $bytes = chr($marker + ($code >> (6 * $trail)));
                  for ($shift = 6 * ($trail - 1); $shift >= 0; $shift -= 6) {
                      $bytes .= chr(128 + (($code >> $shift) & 63));
                  }
                  break;
              }
          }
          if ($bytes === false) {
              $this->_error('Conversion from UCS-4 to UTF-8 failed: malformed input at byte '.$position);
              return false;
          }
          $output .= $bytes;
      }
      return $output;
  }
  /**
   * Convert an UCS-4 array into an UCS-4 string: four bytes per code point,
   * most significant byte first.
   *
   * @param array Code points
   * @return string Binary string of 4 * count($input) bytes
   * @access private
   */
  function _ucs4_to_ucs4_string($input)
  {
      $output = '';
      foreach ($input as $code) {
          // Emit the bytes from the highest to the lowest
          for ($shift = 24; $shift >= 0; $shift -= 8) {
              $output .= chr(($code >> $shift) & 255);
          }
      }
      return $output;
  }
  /**
   * Convert an UCS-4 string into an UCS-4 array: every group of four bytes
   * (most significant byte first) becomes one code point.
   *
   * @param string Binary string whose length is a multiple of 4
   * @return mixed Array of code points (empty for empty input), or false
   *               (with an error recorded) if the length is not a multiple of 4
   * @access private
   */
  function _ucs4_string_to_ucs4($input)
  {
      $output = array();
      $inp_len = strlen($input);
      // Input length must be dividable by 4
      if ($inp_len % 4) {
          $this->_error('Input UCS4 string is broken');
          return false;
      }
      // Square bracket offsets: the curly brace form is no longer valid PHP
      for ($offset = 0; $offset < $inp_len; $offset += 4) {
          $output[] = (ord($input[$offset]) << 24)
                    + (ord($input[$offset + 1]) << 16)
                    + (ord($input[$offset + 2]) << 8)
                    + ord($input[$offset + 3]);
      }
      return $output;
  }
  896. }
  /**
   * Adapter class for aligning the API of idna_convert with that of Net_IDNA
   * @author Matthias Sommerfeld <mso@phlylabs.de>
   */
  class Net_IDNA_php4 extends idna_convert
  {
      /**
       * Sets a new option value; Net_IDNA style alias for the inherited
       * set_parameter(). Available options and values:
       * [encoding - 'utf8', 'ucs4_string' or 'ucs4_array']
       * [overlong - true to allow overlong UTF-8 encodings, false (default)
       *             to reject them]
       * [strict - true: strict mode; false: loose mode]
       *
       * @param mixed Parameter to set (string: single parameter; array of Parameter => Value pairs)
       * @param string Value to use (if parameter 1 is a string)
       * @return boolean true on success, false otherwise
       * @access public
       */
      function setParams($option, $param = false)
      {
          // This class IS the converter (it extends idna_convert), so call
          // the inherited method directly. The former $this->IC member and
          // its set_parameters() method are not defined anywhere in this file.
          return $this->set_parameter($option, $param);
      }
  }
  924. ?>