/yii/framework/vendors/idna_convert/idna_convert.class.php
PHP | 1605 lines | 1266 code | 42 blank | 297 comment | 173 complexity | cb01aa804140e362ff6e72e466369917 MD5 | raw file
Possible License(s): LGPL-2.1, BSD-2-Clause, GPL-2.0, GPL-3.0, BSD-3-Clause, LGPL-3.0
Large files are truncated, but you can click here to view the full file
- <?php
- // {{{ license
- /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
- //
- // +----------------------------------------------------------------------+
- // | This library is free software; you can redistribute it and/or modify |
- // | it under the terms of the GNU Lesser General Public License as |
- // | published by the Free Software Foundation; either version 2.1 of the |
- // | License, or (at your option) any later version. |
- // | |
- // | This library is distributed in the hope that it will be useful, but |
- // | WITHOUT ANY WARRANTY; without even the implied warranty of |
- // | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
- // | Lesser General Public License for more details. |
- // | |
- // | You should have received a copy of the GNU Lesser General Public |
- // | License along with this library; if not, write to the Free Software |
- // | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 |
- // | USA. |
- // +----------------------------------------------------------------------+
- //
- // }}}
- /**
- * Encode/decode Internationalized Domain Names.
- *
- * The class allows to convert internationalized domain names
- * (see RFC 3490 for details) as they can be used with various registries worldwide
- * to be translated between their original (localized) form and their encoded form
- * as it will be used in the DNS (Domain Name System).
- *
- * The class provides two public methods, encode() and decode(), which do exactly
- * what you would expect them to do. You are allowed to use complete domain names,
- * simple strings and complete email addresses as well. That means, that you might
- * use any of the following notations:
- *
- * - www.nörgler.com
- * - xn--nrgler-wxa
- * - xn--brse-5qa.xn--knrz-1ra.info
- *
- * Unicode input might be given as either UTF-8 string, UCS-4 string or UCS-4 array.
- * Unicode output is available in the same formats.
- * You can select your preferred format via {@link set_parameter()}.
- *
- * ACE input and output is always expected to be ASCII.
- *
- * @author Matthias Sommerfeld <mso@phlylabs.de>
- * @copyright 2004-2011 phlyLabs Berlin, http://phlylabs.de
- * @version 0.8.0 2011-03-11
- */
- class idna_convert
- {
- // The Nameprep tables (self::$NP) are declared further down in this class
- // Internal settings, do not mess with them
- protected $_punycode_prefix = 'xn--'; // prefix that marks an ACE (encoded) label
- protected $_invalid_ucs = 0x80000000; // NOTE(review): not referenced in the visible part of the class
- protected $_max_ucs = 0x10FFFF; // start value when _encode() searches for the next smallest code point
- protected $_base = 36; // Punycode parameters read by _decode(), _encode() and _adapt()
- protected $_tmin = 1;
- protected $_tmax = 26;
- protected $_skew = 38;
- protected $_damp = 700;
- protected $_initial_bias = 72;
- protected $_initial_n = 0x80; // code point at which _decode() / _encode() start counting
- protected $_sbase = 0xAC00; // Hangul constants read by _hangul_decompose() / _hangul_compose()
- protected $_lbase = 0x1100;
- protected $_vbase = 0x1161;
- protected $_tbase = 0x11A7;
- protected $_lcount = 19;
- protected $_vcount = 21;
- protected $_tcount = 28;
- protected $_ncount = 588; // _vcount * _tcount
- protected $_scount = 11172; // _lcount * _tcount * _vcount
- protected $_error = false; // last error message set through _error(); false until one is recorded
- protected static $_mb_string_overload = null; // filled in by __construct(), read by byteLength()
- // See {@link set_parameter()} for details of how to change the following
- // settings from within your script / application
- protected $_api_encoding = 'utf8'; // Default input charset is UTF-8
- protected $_allow_overlong = false; // Overlong UTF-8 encodings are forbidden
- protected $_strict_mode = false; // Behave strict or not
- protected $_idn_version = 2003; // Can be either 2003 (old, default) or 2008
/**
 * Creates a converter, optionally configured in one go.
 *
 * @param array|false $options Option => value pairs handed to set_parameter();
 *                             any non-array value leaves the defaults in place
 * @since 0.5.2
 */
public function __construct($options = false)
{
    // Code point just past the Hangul syllable block described by the *_count settings.
    // NOTE(review): slast is not declared among the visible properties and is not read
    // anywhere in the visible code - confirm whether it is still needed.
    $syllable_count = $this->_lcount * $this->_vcount * $this->_tcount;
    $this->slast = $this->_sbase + $syllable_count;

    if (is_array($options)) {
        $this->set_parameter($options);
    }

    // The mbstring overload check is done once and shared by all instances
    if (null === self::$_mb_string_overload) {
        self::$_mb_string_overload = extension_loaded('mbstring')
            && ((ini_get('mbstring.func_overload') & 0x02) === 0x02);
    }
}
/**
 * Sets one option, or several options at once.
 *
 * Recognised options:
 *  - encoding: 'utf8', 'ucs4_string' or 'ucs4_array'; becomes the default API encoding
 *  - overlong: truthy to accept unnecessarily long UTF-8 encodings (default: false)
 *  - strict: true for strict mode, good for registration purposes - causes errors
 *    on failures; false for loose mode, which silently ignores errors and returns
 *    the original input instead
 *  - idn_version: 2003 or 2008; decomposition mappings are only applied for 2003
 *  - encode_german_sz (deprecated): a falsy value maps U+00DF to "ss", a truthy value
 *    removes that mapping. The mapping table is static, so this affects every instance.
 *
 * Processing stops at the first unknown option or invalid value; options handled
 * before that point stay applied and the reason is available via get_last_error().
 *
 * @param mixed $option Option name (string), or an array of option => value pairs
 * @param mixed $value  Value to use when $option is a single option name
 * @return boolean true when every option was accepted, false otherwise
 */
public function set_parameter($option, $value = false)
{
    $options = is_array($option) ? $option : array($option => $value);

    foreach ($options as $name => $setting) {
        switch ($name) {
            case 'encoding':
                switch ($setting) {
                    case 'utf8':
                    case 'ucs4_string':
                    case 'ucs4_array':
                        $this->_api_encoding = $setting;
                        break;
                    default:
                        $this->_error('Set Parameter: Unknown parameter '.$setting.' for option '.$name);
                        return false;
                }
                break;
            case 'overlong':
                $this->_allow_overlong = ($setting) ? true : false;
                break;
            case 'strict':
                $this->_strict_mode = ($setting) ? true : false;
                break;
            case 'idn_version':
                // Loose comparison on purpose: both 2003 and '2003' are accepted
                if (!in_array($setting, array('2003', '2008'))) {
                    $this->_error('Set Parameter: Unknown parameter '.$setting.' for option '.$name);
                    // Report the failure like every other invalid option does
                    return false;
                }
                $this->_idn_version = $setting;
                break;
            case 'encode_german_sz': // Deprecated
                if (!$setting) {
                    self::$NP['replacemaps'][0xDF] = array(0x73, 0x73);
                } else {
                    unset(self::$NP['replacemaps'][0xDF]);
                }
                break;
            default:
                $this->_error('Set Parameter: Unknown option '.$name);
                return false;
        }
    }
    return true;
}
/**
 * Decodes the ACE (Punycode) labels found in a domain name, email address or URL.
 *
 * Labels that cannot be decoded are passed through unchanged. In strict mode only a
 * single plain label is accepted; email addresses and multi-part names are rejected.
 *
 * @param string $input Domain name (ACE string)
 * @param string|false $one_time_encoding Output encoding for this call only,
 *                                        see {@link set_parameter()}
 * @return mixed Decoded name (UTF-8 string, UCS-4 string or UCS-4 array), false on error
 */
public function decode($input, $one_time_encoding = false)
{
    // Reject an unknown one-shot output format before doing any work
    if ($one_time_encoding
        && !in_array($one_time_encoding, array('utf8', 'ucs4_string', 'ucs4_array'))) {
        $this->_error('Unknown encoding '.$one_time_encoding);
        return false;
    }

    // Make sure to drop any newline characters around
    $input = trim($input);
    $prefix_pattern = '!^'.preg_quote($this->_punycode_prefix, '!').'!';

    if (strpos($input, '@')) {
        // Looks like an email address - not allowed in strict mode
        if ($this->_strict_mode) {
            $this->_error('Only simple domain name parts can be handled in strict mode');
            return false;
        }
        list($email_pref, $input) = explode('@', $input, 2);

        // Domain part first, then the local part; only prefixed labels are touched
        $labels = explode('.', $input);
        foreach ($labels as $pos => $label) {
            if (!preg_match($prefix_pattern, $label)) {
                continue;
            }
            $conv = $this->_decode($label);
            if ($conv) {
                $labels[$pos] = $conv;
            }
        }
        $input = implode('.', $labels);

        $labels = explode('.', $email_pref);
        foreach ($labels as $pos => $label) {
            if (!preg_match($prefix_pattern, $label)) {
                continue;
            }
            $conv = $this->_decode($label);
            if ($conv) {
                $labels[$pos] = $conv;
            }
        }
        $email_pref = implode('.', $labels);

        $return = $email_pref . '@' . $input;
    } elseif (preg_match('![:\./]!', $input)) {
        // A complete domain name, possibly with scheme, path or parameters
        if ($this->_strict_mode) {
            $this->_error('Only simple domain name parts can be handled in strict mode');
            return false;
        }
        $parsed = parse_url($input);
        if (isset($parsed['host'])) {
            $labels = explode('.', $parsed['host']);
            foreach ($labels as $pos => $label) {
                $conv = $this->_decode($label);
                if ($conv) {
                    $labels[$pos] = $conv;
                }
            }
            $parsed['host'] = implode('.', $labels);

            // Put the URL back together around the decoded host
            $return = '';
            if (!empty($parsed['scheme'])) {
                $return .= $parsed['scheme'];
                $return .= (strtolower($parsed['scheme']) == 'mailto') ? ':' : '://';
            }
            if (!empty($parsed['user'])) {
                $return .= $parsed['user'];
                if (!empty($parsed['pass'])) {
                    $return .= ':'.$parsed['pass'];
                }
                $return .= '@';
            }
            $return .= $parsed['host'];
            if (!empty($parsed['port'])) {
                $return .= ':'.$parsed['port'];
            }
            if (!empty($parsed['path'])) {
                $return .= $parsed['path'];
            }
            if (!empty($parsed['query'])) {
                $return .= '?'.$parsed['query'];
            }
            if (!empty($parsed['fragment'])) {
                $return .= '#'.$parsed['fragment'];
            }
        } else {
            // parse_url() found no host - treat the input as dot separated labels
            $labels = explode('.', $input);
            foreach ($labels as $pos => $label) {
                $conv = $this->_decode($label);
                $labels[$pos] = ($conv) ? $conv : $label;
            }
            $return = implode('.', $labels);
        }
    } else {
        // A single label
        $return = $this->_decode($input);
        if (!$return) {
            $return = $input;
        }
    }

    // The result is UTF-8 at this point; convert if another format was requested
    $format = ($one_time_encoding) ? $one_time_encoding : $this->_api_encoding;
    if ($format == 'utf8') {
        return $return;
    }
    if ($format == 'ucs4_string') {
        return $this->_ucs4_to_ucs4_string($this->_utf8_to_ucs4($return));
    }
    if ($format == 'ucs4_array') {
        return $this->_utf8_to_ucs4($return);
    }
    $this->_error('Unsupported output format');
    return false;
}
/**
 * Encodes a domain name, email address or URL-like string to its ACE form.
 *
 * The input is cut at '.', '/', ':', '?' and '@' (the ideographic, fullwidth and
 * halfwidth full stops count as '.'); every piece is encoded on its own and pieces
 * that cannot be encoded are emitted as UTF-8. In strict mode any of these separators
 * is an error.
 *
 * @param mixed $decoded Domain name (UTF-8 string, UCS-4 string or UCS-4 array)
 * @param string|false $one_time_encoding Input encoding for this call only,
 *                                        see {@link set_parameter()}
 * @return mixed Encoded name (ACE string), '' for empty input, false on error
 */
public function encode($decoded, $one_time_encoding = false)
{
    // Bring the input into UCS-4 array form, whatever format it came in
    $format = $one_time_encoding ? $one_time_encoding : $this->_api_encoding;
    if ($format == 'utf8') {
        $decoded = $this->_utf8_to_ucs4($decoded);
    } elseif ($format == 'ucs4_string') {
        $decoded = $this->_ucs4_string_to_ucs4($decoded);
    } elseif ($format != 'ucs4_array') {
        $this->_error('Unsupported input format: '.$format);
        return false;
    }

    // No input, no output
    if (empty($decoded)) {
        return '';
    }

    $dot_variants = array(0x3002, 0xFF0E, 0xFF61);
    $separators = array(0x2E, 0x2F, 0x3A, 0x3F, 0x40);
    $last_begin = 0; // index at which the piece currently being collected starts
    $output = '';

    foreach ($decoded as $k => $v) {
        if (in_array($v, $dot_variants)) {
            // Make sure to use just the plain dot
            $decoded[$k] = 0x2E;
        } elseif (!in_array($v, $separators)) {
            continue;
        }

        // Neither email addresses nor URLs allowed in strict mode
        if ($this->_strict_mode) {
            $this->_error('Neither email addresses nor URLs are allowed in strict mode.');
            return false;
        }

        // A separator in first position has nothing in front of it to encode
        if ($k) {
            $piece = array_slice($decoded, $last_begin, $k - $last_begin);
            $encoded = $this->_encode($piece);
            $output .= $encoded ? $encoded : $this->_ucs4_to_utf8($piece);
            $output .= chr($decoded[$k]);
        }
        $last_begin = $k + 1;
    }

    // No separator seen: the whole input is a single piece
    if (!$last_begin) {
        $output = $this->_encode($decoded);
        return $output ? $output : $this->_ucs4_to_utf8($decoded);
    }

    // Catch the rest of the string
    $piece = array_slice($decoded, $last_begin, count($decoded) - $last_begin);
    $encoded = $this->_encode($piece);
    return $output.($encoded ? $encoded : $this->_ucs4_to_utf8($piece));
}
/**
 * Encodes only the host of a URI to Punycode.
 *
 * encode() would also convert path and query components; this method leaves
 * everything but the host untouched.
 *
 * @param string $uri The URI as a UTF-8 (or ASCII) string
 * @return mixed The URI with its host encoded, false when no host could be found
 * @since 0.6.4
 */
public function encode_uri($uri)
{
    $parsed = parse_url($uri);
    if (!isset($parsed['host'])) {
        $this->_error('The given string does not look like a URI');
        return false;
    }

    // Encode label by label; labels that yield nothing are kept as they are
    $labels = explode('.', $parsed['host']);
    foreach ($labels as $pos => $label) {
        $conv = $this->encode($label, 'utf8');
        if ($conv) {
            $labels[$pos] = $conv;
        }
    }
    $parsed['host'] = implode('.', $labels);

    // Put the URI back together around the encoded host
    $return = '';
    if (!empty($parsed['scheme'])) {
        $return .= $parsed['scheme'];
        $return .= (strtolower($parsed['scheme']) == 'mailto') ? ':' : '://';
    }
    if (!empty($parsed['user'])) {
        $return .= $parsed['user'];
        if (!empty($parsed['pass'])) {
            $return .= ':'.$parsed['pass'];
        }
        $return .= '@';
    }
    $return .= $parsed['host'];
    if (!empty($parsed['port'])) {
        $return .= ':'.$parsed['port'];
    }
    if (!empty($parsed['path'])) {
        $return .= $parsed['path'];
    }
    if (!empty($parsed['query'])) {
        $return .= '?'.$parsed['query'];
    }
    if (!empty($parsed['fragment'])) {
        $return .= '#'.$parsed['fragment'];
    }
    return $return;
}
/**
 * Returns the most recently recorded error.
 *
 * @return mixed The last error message, or false when no error has been recorded
 */
public function get_last_error()
{
    $last_error = $this->_error;
    return $last_error;
}
/**
 * The actual decoding algorithm: turns one ACE label into UTF-8.
 *
 * Everything between the prefix and the last hyphen is copied as literal characters;
 * the remainder is read as a sequence of variable-length integers, each of which
 * yields one code point and the position to insert it at.
 *
 * @param string $encoded A single label, expected to start with the Punycode prefix
 * @return mixed The decoded label as UTF-8 string; false when the label lacks the
 *               prefix, is empty after the prefix, or its encoded part is malformed
 */
protected function _decode($encoded)
{
    $prefix_len = self::byteLength($this->_punycode_prefix);
    $prefix_pattern = '!^'.preg_quote($this->_punycode_prefix, '!').'!';

    // find the Punycode prefix
    if (!preg_match($prefix_pattern, $encoded)) {
        $this->_error('This is not a punycode string');
        return false;
    }
    // If nothing left after removing the prefix, it is hopeless
    $encode_test = preg_replace($prefix_pattern, '', $encoded);
    if (!$encode_test) {
        $this->_error('The given encoded string was empty');
        return false;
    }

    // Everything between the prefix and the last delimiter is taken over literally
    $decoded = array();
    $delim_pos = strrpos($encoded, '-');
    if ($delim_pos > $prefix_len) {
        for ($k = $prefix_len; $k < $delim_pos; ++$k) {
            $decoded[] = ord($encoded[$k]);
        }
    }
    $deco_len = count($decoded);
    $enco_len = self::byteLength($encoded);

    // Wandering through the strings; init
    $is_first = true;
    $bias = $this->_initial_bias;
    $idx = 0;
    $char = $this->_initial_n;
    for ($enco_idx = ($delim_pos) ? ($delim_pos + 1) : 0; $enco_idx < $enco_len; ++$deco_len) {
        for ($old_idx = $idx, $w = 1, $k = $this->_base; 1; $k += $this->_base) {
            // A number that is still open when the input ends means truncated input
            if ($enco_idx >= $enco_len) {
                $this->_error('The given punycode string is malformed');
                return false;
            }
            $symbol = $encoded[$enco_idx++];
            // Only [0-9A-Za-z] are digits; _decode_digit() does not reject anything else
            $code = ord($symbol);
            if (!((48 <= $code && $code <= 57) || (65 <= $code && $code <= 90)
                || (97 <= $code && $code <= 122))) {
                $this->_error('The given punycode string is malformed');
                return false;
            }
            $digit = $this->_decode_digit($symbol);
            $idx += $digit * $w;
            $t = ($k <= $bias) ? $this->_tmin :
                (($k >= $bias + $this->_tmax) ? $this->_tmax : ($k - $bias));
            if ($digit < $t) {
                break;
            }
            $w = (int) ($w * ($this->_base - $t));
        }
        $bias = $this->_adapt($idx - $old_idx, $deco_len + 1, $is_first);
        $is_first = false;
        // The decoded number carries both the code point increment and the insert position
        $char += (int) ($idx / ($deco_len + 1));
        $idx %= ($deco_len + 1);
        if ($deco_len > 0) {
            // Make room for the decoded char
            for ($i = $deco_len; $i > $idx; $i--) {
                $decoded[$i] = $decoded[($i - 1)];
            }
        }
        $decoded[$idx++] = $char;
    }
    return $this->_ucs4_to_utf8($decoded);
}
/**
 * The actual encoding algorithm: turns one label into its ACE form.
 *
 * The label is run through _nameprep() first. Characters accepted as basic are
 * copied to the output, every remaining code point is appended as a
 * variable-length integer.
 *
 * @param array $decoded The label as UCS-4 array
 * @return mixed The encoded label (without prefix when only basic characters remain
 *               after Nameprep); false when the label already carries the prefix,
 *               contains nothing to encode, fails Nameprep or contains an ASCII
 *               character that cannot be represented
 */
protected function _encode($decoded)
{
    // We cannot encode a domain name containing the Punycode prefix
    $extract = self::byteLength($this->_punycode_prefix);
    $check_pref = $this->_utf8_to_ucs4($this->_punycode_prefix);
    $check_deco = array_slice($decoded, 0, $extract);
    if ($check_pref == $check_deco) {
        $this->_error('This is already a punycode string');
        return false;
    }

    // We will not try to encode strings consisting of basic code points only
    $encodable = false;
    foreach ($decoded as $v) {
        if ($v > 0x7a) {
            $encodable = true;
            break;
        }
    }
    if (!$encodable) {
        $this->_error('The given string does not contain encodable chars');
        return false;
    }

    // Do NAMEPREP
    $decoded = $this->_nameprep($decoded);
    if (!$decoded || !is_array($decoded)) {
        return false; // NAMEPREP failed
    }
    $deco_len = count($decoded);
    if (!$deco_len) {
        return false; // Empty array
    }

    $codecount = 0; // How many chars have been consumed
    $encoded = '';
    // Copy all basic code points to output
    for ($i = 0; $i < $deco_len; ++$i) {
        $test = $decoded[$i];
        if ((0x2F < $test && $test < 0x40) || (0x40 < $test && $test < 0x5B)
            || (0x60 < $test && $test <= 0x7B) || (0x2D == $test)) {
            $encoded .= chr($test);
            $codecount++;
        } elseif ($test < $this->_initial_n) {
            // Such a code point is neither copied here nor picked up by the loop below
            // (which starts at _initial_n), so it would never be counted as consumed
            // and the loop would not terminate.
            $this->_error('The given string contains a character that cannot be encoded: U+'.sprintf('%08X', $test));
            return false;
        }
    }
    if ($codecount == $deco_len) {
        return $encoded; // All codepoints were basic ones
    }

    // Start with the prefix; copy it to output
    $encoded = $this->_punycode_prefix.$encoded;
    // If we have basic code points in output, add an hyphen to the end
    if ($codecount) {
        $encoded .= '-';
    }

    // Now find and encode all non-basic code points
    $is_first = true;
    $cur_code = $this->_initial_n;
    $bias = $this->_initial_bias;
    $delta = 0;
    while ($codecount < $deco_len) {
        // Find the smallest code point >= the current code point
        for ($i = 0, $next_code = $this->_max_ucs; $i < $deco_len; $i++) {
            if ($decoded[$i] >= $cur_code && $decoded[$i] <= $next_code) {
                $next_code = $decoded[$i];
            }
        }
        $delta += ($next_code - $cur_code) * ($codecount + 1);
        $cur_code = $next_code;

        // Scan input again and encode all characters whose code point is $cur_code
        for ($i = 0; $i < $deco_len; $i++) {
            if ($decoded[$i] < $cur_code) {
                $delta++;
            } elseif ($decoded[$i] == $cur_code) {
                // Write $delta as a variable-length integer
                for ($q = $delta, $k = $this->_base; 1; $k += $this->_base) {
                    $t = ($k <= $bias) ? $this->_tmin :
                        (($k >= $bias + $this->_tmax) ? $this->_tmax : $k - $bias);
                    if ($q < $t) {
                        break;
                    }
                    $encoded .= $this->_encode_digit(intval($t + (($q - $t) % ($this->_base - $t))));
                    $q = (int) (($q - $t) / ($this->_base - $t));
                }
                $encoded .= $this->_encode_digit($q);
                $bias = $this->_adapt($delta, $codecount + 1, $is_first);
                $codecount++;
                $delta = 0;
                $is_first = false;
            }
        }
        $delta++;
        $cur_code++;
    }
    return $encoded;
}
/**
 * Computes the bias to use for the next variable-length integer.
 *
 * @param int  $delta    The delta that was just encoded or decoded
 * @param int  $npoints  Number of code points handled so far, including the current one
 * @param bool $is_first Whether this is the first adaptation for the current label
 * @return int The new bias
 */
protected function _adapt($delta, $npoints, $is_first)
{
    // The very first delta is scaled down by _damp, later ones are halved
    $divisor = $is_first ? $this->_damp : 2;
    $delta = intval($delta / $divisor);
    $delta += intval($delta / $npoints);

    $span = $this->_base - $this->_tmin;
    $limit = ($span * $this->_tmax) / 2;
    $k = 0;
    while ($delta > $limit) {
        $delta = intval($delta / $span);
        $k += $this->_base;
    }
    return intval($k + ($span + 1) * $delta / ($delta + $this->_skew));
}
/**
 * Turns a digit value into its character.
 *
 * @param int $d Digit value; 0..25 yield 'a'..'z', 26..35 yield '0'..'9'
 * @return string The single character standing for $d
 */
protected function _encode_digit($d)
{
    // 'a' is chr(97) and '0' is chr(48), which is 26 + 22
    $offset = ($d < 26) ? 97 : 22;
    return chr($d + $offset);
}
/**
 * Turns a character into its digit value.
 *
 * '0'..'9' yield 26..35, 'A'..'Z' and 'a'..'z' yield 0..25.
 * Each range is only checked against its upper end, so bytes other than
 * [0-9A-Za-z] are not reliably rejected: callers have to validate the
 * character themselves. Bytes above 'z' yield _base.
 *
 * @param string $cp The character to decode; only its first byte is looked at
 * @return int The digit value
 */
protected function _decode_digit($cp)
{
    $code = ord($cp);
    if ($code - 48 < 10) {
        return $code - 22;
    }
    if ($code - 65 < 26) {
        return $code - 65;
    }
    if ($code - 97 < 26) {
        return $code - 97;
    }
    return $this->_base;
}
/**
 * Records an error message so that get_last_error() can report it.
 * Any previously recorded message is overwritten.
 *
 * @param string $error The message to remember
 */
protected function _error($error = '')
{
    $this->_error = $error;
}
/**
 * Do Nameprep according to RFC3491 and RFC3454
 *
 * Works in three steps: map every input code point (drop, reject, decompose or
 * copy), re-compose Hangul syllables, then try to compose starters with the
 * combining characters that follow them.
 *
 * @param array $input Unicode characters as UCS-4 array
 * @return mixed The Nameprep'd UCS-4 array, false when prohibited input was found
 */
protected function _nameprep($input)
{
    $output = array();
    //
    // Mapping
    // Walking through the input array, performing the required steps on each of
    // the input chars and putting the result into the output array
    // While mapping required chars we apply the canonical ordering
    foreach ($input as $v) {
        // Map to nothing == skip that code point
        if (in_array($v, self::$NP['map_nothing'])) {
            continue;
        }
        // Try to find prohibited input
        if (in_array($v, self::$NP['prohibit']) || in_array($v, self::$NP['general_prohibited'])) {
            $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
            return false;
        }
        foreach (self::$NP['prohibit_ranges'] as $range) {
            if ($range[0] <= $v && $v <= $range[1]) {
                $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
                return false;
            }
        }

        if (0xAC00 <= $v && $v <= 0xD7AF) {
            // Hangul syllable decomposition
            foreach ($this->_hangul_decompose($v) as $out) {
                $output[] = (int) $out;
            }
        } elseif (($this->_idn_version == '2003') && isset(self::$NP['replacemaps'][$v])) {
            // There's a decomposition mapping for that code point
            // Decompositions only in version 2003 (original) of IDNA
            foreach ($this->_apply_cannonical_ordering(self::$NP['replacemaps'][$v]) as $out) {
                $output[] = (int) $out;
            }
        } else {
            $output[] = (int) $v;
        }
    }

    // Before applying any Combining, try to rearrange any Hangul syllables
    $output = $this->_hangul_compose($output);

    //
    // Combine code points
    //
    $last_class = 0;
    $last_starter = 0;
    $out_len = count($output);
    for ($i = 0; $i < $out_len; ++$i) {
        $class = $this->_get_combining_class($output[$i]);
        if ((!$last_class || $last_class > $class) && $class) {
            // Try to match
            $seq_len = $i - $last_starter;
            $out = $this->_combine(array_slice($output, $last_starter, $seq_len));
            // On match: Replace the last starter with the composed character and remove
            // the now redundant non-starter(s)
            if ($out) {
                $output[$last_starter] = $out;
                // _combine() returns a single code point (an integer), so the sequence
                // only shrinks when more than one element was combined. count() must not
                // be called on that integer: it throws a TypeError on PHP 8.
                if (1 != $seq_len) {
                    for ($j = $i + 1; $j < $out_len; ++$j) {
                        $output[$j - 1] = $output[$j];
                    }
                    // NOTE(review): the last element lives at $out_len - 1, so this unset
                    // removes nothing and the shifted array keeps a stale copy of its
                    // last element - confirm the intended index.
                    unset($output[$out_len]);
                }
                // Rewind the for loop by one, since there can be more possible compositions
                $i--;
                // NOTE(review): the length is reduced even when nothing was removed above
                $out_len--;
                $last_class = ($i == $last_starter) ? 0 : $this->_get_combining_class($output[$i - 1]);
                continue;
            }
        }
        // The current class is 0
        if (!$class) {
            $last_starter = $i;
        }
        $last_class = $class;
    }
    return $output;
}
/**
 * Decomposes a Hangul syllable into its conjoining jamo
 * (see http://www.unicode.org/unicode/reports/tr15/#Hangul)
 *
 * @param integer $char 32bit UCS4 code point
 * @return array The leading consonant, the vowel and - if present - the trailing
 *               consonant as integers; array($char) when $char lies outside the
 *               range described by _sbase and _scount
 */
protected function _hangul_decompose($char)
{
    $sindex = (int) $char - $this->_sbase;
    if ($sindex < 0 || $sindex >= $this->_scount) {
        return array($char);
    }

    // The cast has to cover the whole quotient: "(int) $a + $b / $c" would only cast
    // $a and leave a float in the result.
    $result = array();
    $result[] = $this->_lbase + (int) ($sindex / $this->_ncount);
    $result[] = $this->_vbase + (int) (($sindex % $this->_ncount) / $this->_tcount);
    $T = $this->_tbase + $sindex % $this->_tcount;
    // An index of 0 means the syllable has no trailing consonant
    if ($T != $this->_tbase) {
        $result[] = $T;
    }
    return $result;
}
/**
 * Composes Hangul syllables out of conjoining jamo
 * (see http://www.unicode.org/unicode/reports/tr15/#Hangul)
 *
 * A leading consonant followed by a vowel becomes an LV syllable; an LV syllable
 * followed by a trailing consonant becomes an LVT syllable. Everything else is
 * copied unchanged.
 *
 * @param array $input Decomposed UCS4 sequence
 * @return array UCS4 sequence with syllables composed
 */
protected function _hangul_compose($input)
{
    $inp_len = count($input);
    if (!$inp_len) {
        return array();
    }

    $result = array();
    $last = (int) $input[0];
    $result[] = $last; // copy first char from input to output

    for ($i = 1; $i < $inp_len; ++$i) {
        $char = (int) $input[$i];
        $sindex = $last - $this->_sbase;
        $lindex = $last - $this->_lbase;
        $vindex = $char - $this->_vbase;
        $tindex = $char - $this->_tbase;

        // Find out, whether two current characters are LV and T.
        // Index 0 stands for "no trailing consonant" and index _tcount lies past the
        // last one, so both ends are excluded: including them would swallow the
        // character at _tbase or produce the LV syllable of the next vowel.
        if (0 <= $sindex && $sindex < $this->_scount && ($sindex % $this->_tcount == 0)
            && 0 < $tindex && $tindex < $this->_tcount) {
            // create syllable of form LVT
            $last += $tindex;
            $result[(count($result) - 1)] = $last; // reset last
            continue; // discard char
        }
        // Find out, whether two current characters form L and V
        if (0 <= $lindex && $lindex < $this->_lcount && 0 <= $vindex && $vindex < $this->_vcount) {
            // create syllable of form LV
            $last = (int) $this->_sbase + ($lindex * $this->_vcount + $vindex) * $this->_tcount;
            $result[(count($result) - 1)] = $last; // reset last
            continue; // discard char
        }
        // if neither case was true, just add the character
        $last = $char;
        $result[] = $char;
    }
    return $result;
}
/**
 * Looks up the combining class of a code point in the 'norm_combcls' table.
 *
 * @param integer $char Wide char to check (32bit integer)
 * @return integer The combining class listed for $char, 0 when it is not listed
 */
protected function _get_combining_class($char)
{
    if (isset(self::$NP['norm_combcls'][$char])) {
        return self::$NP['norm_combcls'][$char];
    }
    return 0;
}
/**
 * Reorders a decomposed sequence by combining class.
 *
 * A character with a non-zero combining class is moved leftwards past every
 * neighbour with a higher class; the passes repeat until one of them does not
 * swap anything.
 *
 * @param array $input Decomposed UCS4 sequence
 * @return array Ordered UCS4 sequence
 */
protected function _apply_cannonical_ordering($input)
{
    $size = count($input);
    do {
        $swapped = false;
        $prev_class = $this->_get_combining_class(intval($input[0]));
        for ($i = 0; $i < $size - 1; ++$i) {
            $cur_class = $this->_get_combining_class(intval($input[$i + 1]));
            if ($cur_class != 0 && $prev_class > $cur_class) {
                // Move item leftward until it fits
                for ($j = $i + 1; $j > 0; --$j) {
                    if ($this->_get_combining_class(intval($input[$j - 1])) <= $cur_class) {
                        break;
                    }
                    $moved = intval($input[$j]);
                    $input[$j] = intval($input[$j - 1]);
                    $input[$j - 1] = $moved;
                    $swapped = true;
                }
                // The character now sitting at $i + 1 is the one looked at before
                $cur_class = $prev_class;
            }
            $prev_class = $cur_class;
        }
    } while ($swapped);
    return $input;
}
/**
 * Looks up the code point whose mapping equals the given sequence.
 *
 * The 'replacemaps' table is searched in reverse: the key of the first entry
 * whose replacement sequence matches $input element by element is returned.
 *
 * @param array $input UCS4 sequence (starter and non-starters), indexed from 0
 * @return mixed The matching code point (integer), false when there is no match
 */
protected function _combine($input)
{
    $inp_len = count($input);
    // Nothing to look up. Without this guard every table entry would be compared
    // against the missing element 0, raising a notice for each of them.
    if (!$inp_len) {
        return false;
    }

    foreach (self::$NP['replacemaps'] as $np_src => $np_target) {
        // Cheap rejections first: different first element or different length
        if ($np_target[0] != $input[0] || count($np_target) != $inp_len) {
            continue;
        }
        $hit = true;
        foreach ($input as $k2 => $v2) {
            if ($v2 != $np_target[$k2]) {
                $hit = false;
                break;
            }
        }
        if ($hit) {
            return $np_src;
        }
    }
    return false;
}
/**
 * This converts an UTF-8 encoded string to its UCS-4 representation
 * By talking about UCS-4 "strings" we mean arrays of 32bit integers representing
 * each of the "chars". This is due to PHP not being able to handle strings with
 * bit depth different from 8. This applies to the reverse method _ucs4_to_utf8(), too.
 * The following UTF-8 encodings are supported:
 * bytes bits representation
 * 1 7 0xxxxxxx
 * 2 11 110xxxxx 10xxxxxx
 * 3 16 1110xxxx 10xxxxxx 10xxxxxx
 * 4 21 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 5 26 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 6 31 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * Each x represents a bit that can be used to store character data.
 * The five and six byte sequences are part of Annex D of ISO/IEC 10646-1:2000
 *
 * @param string $input UTF-8 encoded string
 * @return mixed Array of code points (integers); false on malformed input, which
 *               includes a multi-byte sequence that is cut off by the end of input
 */
protected function _utf8_to_ucs4($input)
{
    $output = array();
    $out_len = 0;
    $inp_len = self::byteLength($input);
    $mode = 'next'; // 'next': expecting a start byte, 'add': expecting a continuation byte
    $test = 'none'; // 'range': the next continuation byte still needs the range check
    for ($k = 0; $k < $inp_len; ++$k) {
        $v = ord($input[$k]); // Extract byte from input string

        if ($v < 128) { // We found an ASCII char
            if ('add' == $mode) {
                // An ASCII byte must not interrupt a multi-byte sequence
                $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                return false;
            }
            $output[$out_len] = $v;
            ++$out_len;
            continue;
        }

        if ('next' == $mode) { // Start byte: determine the width of the Unicode char
            $start_byte = $v;
            $mode = 'add';
            $test = 'range';
            // $next_byte tells how many times subsequent bitmasks must rotate 6 bits to the left
            if ($v >> 5 == 6) { // &110xxxxx 10xxxxxx
                $next_byte = 0;
                $v = ($v - 192) << 6;
            } elseif ($v >> 4 == 14) { // &1110xxxx 10xxxxxx 10xxxxxx
                $next_byte = 1;
                $v = ($v - 224) << 12;
            } elseif ($v >> 3 == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                $next_byte = 2;
                $v = ($v - 240) << 18;
            } elseif ($v >> 2 == 62) { // &111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                $next_byte = 3;
                $v = ($v - 248) << 24;
            } elseif ($v >> 1 == 126) { // &1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                $next_byte = 4;
                $v = ($v - 252) << 30;
            } else {
                $this->_error('This might be UTF-8, but I don\'t understand it at byte '.$k);
                return false;
            }
            // Open a new code point holding the bits of the start byte
            $output[$out_len] = (int) $v;
            ++$out_len;
            continue;
        }

        // Continuation byte of the code point opened last
        if (!$this->_allow_overlong && $test == 'range') {
            $test = 'none';
            if (($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0) || ($v > 0x8F && $start_byte == 0xF4)) {
                $this->_error('Bogus UTF-8 character detected (out of legal range) at byte '.$k);
                return false;
            }
        }
        if ($v >> 6 != 2) { // Bit mask must be 10xxxxxx
            $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
            return false;
        }
        $output[($out_len - 1)] += ($v - 128) << ($next_byte * 6);
        --$next_byte;
        if ($next_byte < 0) {
            $mode = 'next';
        }
    } // for

    // Input ended in the middle of a multi-byte sequence
    if ('add' == $mode) {
        $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
        return false;
    }
    return $output;
}
/**
 * Converts a UCS-4 array into a UTF-8 string.
 * See _utf8_to_ucs4() for details on the byte layout; sequences of up to
 * four bytes are produced.
 *
 * @param array $input Code points as integers
 * @return mixed The UTF-8 string; false when a code point needs more than 21 bits
 *               and no replacement character is in effect
 */
protected function _ucs4_to_utf8($input)
{
    $output = '';
    foreach ($input as $k => $v) {
        if ($v < 128) { // 7bit are transferred literally
            $output .= chr($v);
            continue;
        }
        if ($v < (1 << 11)) { // 2 bytes
            $output .= chr(192 + ($v >> 6)).chr(128 + ($v & 63));
            continue;
        }
        if ($v < (1 << 16)) { // 3 bytes
            $output .= chr(224 + ($v >> 12)).chr(128 + (($v >> 6) & 63)).chr(128 + ($v & 63));
            continue;
        }
        if ($v < (1 << 21)) { // 4 bytes
            $output .= chr(240 + ($v >> 18)).chr(128 + (($v >> 12) & 63)).chr(128 + (($v >> 6) & 63)).chr(128 + ($v & 63));
            continue;
        }
        // NOTE(review): $safe_mode and $safe_char are not declared in the visible part
        // of the class - confirm they exist, otherwise this line is a fatal error.
        if (self::$safe_mode) {
            $output .= self::$safe_char;
            continue;
        }
        $this->_error('Conversion from UCS-4 to UTF-8 failed: malformed input at byte '.$k);
        return false;
    }
    return $output;
}
/**
 * Converts a UCS-4 array into a UCS-4 string.
 *
 * Every code point becomes four bytes, most significant byte first.
 *
 * @param array $input Code points as integers
 * @return string Four bytes per code point
 */
protected function _ucs4_to_ucs4_string($input)
{
    $output = '';
    foreach ($input as $v) {
        // Walk from the highest byte down to the lowest, keeping 8 bits each time
        for ($shift = 24; $shift >= 0; $shift -= 8) {
            $output .= chr(($v >> $shift) & 255);
        }
    }
    return $output;
}
/**
 * Converts a UCS-4 string into a UCS-4 array.
 *
 * Every group of four bytes is read as one code point, most significant byte first.
 *
 * @param string $input UCS-4 string; its length has to be a multiple of 4
 * @return mixed Array of code points (integers), false when the length does not fit
 */
protected function _ucs4_string_to_ucs4($input)
{
    $output = array();
    $inp_len = self::byteLength($input);
    // Input length must be dividable by 4
    if ($inp_len % 4) {
        $this->_error('Input UCS4 string is broken');
        return false;
    }
    // Empty input - return empty output
    if (!$inp_len) {
        return $output;
    }
    for ($i = 0, $out_len = -1; $i < $inp_len; ++$i) {
        // Increment output position every 4 input bytes
        if (!($i % 4)) {
            $out_len++;
            $output[$out_len] = 0;
        }
        // Square brackets: the curly brace offset syntax is a parse error on PHP 8
        $output[$out_len] += ord($input[$i]) << (8 * (3 - ($i % 4)));
    }
    return $output;
}
/**
 * Returns the length of a string in bytes, also when mbstring function
 * overloading is turned on.
 *
 * @param string $string the string for which to get the length.
 * @return integer the length of the string in bytes.
 */
protected static function byteLength($string)
{
    // With overloading active the '8bit' encoding is requested explicitly from mb_strlen()
    return self::$_mb_string_overload
        ? mb_strlen($string, '8bit')
        : strlen((string) $string);
}
/**
 * Creates a new converter instance.
 *
 * Declared static so that the factory can be used without an existing object;
 * calling it on an instance keeps working.
 *
 * @param array $params Set of parameters, passed on to the constructor
 * @return idna_convert
 */
public static function getInstance($params = array())
{
    return new idna_convert($params);
}
/**
 * Returns a shared converter instance for the given parameters, creating it
 * only when no instance with the same parameters exists yet.
 *
 * Declared static so that it can be used without an existing object; calling it
 * on an instance keeps working.
 *
 * @param array $params Set of parameters, passed on to the constructor
 * @return idna_convert
 */
public static function singleton($params = array())
{
    // One instance per distinct parameter set, keyed by its serialized form
    static $instances = array();

    $signature = serialize($params);
    if (!isset($instances[$signature])) {
        // Created directly: a static method cannot go through a non-static factory
        $instances[$signature] = new idna_convert($params);
    }
    return $instances[$signature];
}
- /**
- * Holds all relevant mapping tables
- * See RFC3454 for details
- *
- * @private array
- * @since 0.5.2
- */
- protected static $NP = array
- ('map_nothing' => array(0xAD, 0x34F, 0x1806, 0x180B, 0x180C, 0x180D, 0x200B, 0x200C
- ,0x200D, 0x2060, 0xFE00, 0xFE01, 0xFE02, 0xFE03, 0xFE04, 0xFE05, 0xFE06, 0xFE07
- ,0xFE08, 0xFE09, 0xFE0A, 0xFE0B, 0xFE0C, 0xFE0D, 0xFE0E, 0xFE0F, 0xFEFF
- )
- ,'general_prohibited' => array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
- ,20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 ,33, 34, 35, 36, 37, 38, 39, 40, 41, 42
- ,43, 44, 47, 59, 60, 61, 62, 63, 64, 91, 92, 93, 94, 95, 96, 123, 124, 125, 126, 127, 0x3002
- )
- ,'prohibit' => array(0xA0, 0x340, 0x341, 0x6DD, 0x70F, 0x1680, 0x180E, 0x2000, 0x2001, 0x2002, 0x2003
- ,0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x200B, 0x200C, 0x200D, 0x200E, 0x200F
- ,0x2028, 0x2029, 0x202A, 0x202B, 0x202C, 0x202D, 0x202E, 0x202F, 0x205F, 0x206A, 0x206B, 0x206C
- ,0x206D, 0x206E, 0x206F, 0x3000, 0xFEFF, 0xFFF9, 0xFFFA, 0xFFFB, 0xFFFC, 0xFFFD, 0xFFFE, 0xFFFF
- ,0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE
- ,0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, 0xBFFFF
- ,0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xE0001, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF
- )
- ,'prohibit_ranges' => array(array(0x80, 0x9F), array(0x2060, 0x206F), array(0x1D173, 0x1D17A)
- ,array(0xE000, 0xF8FF) ,array(0xF0000, 0xFFFFD), array(0x100000, 0x10FFFD)
- ,array(0xFDD0, 0xFDEF), array(0xD800, 0xDFFF), array(0x2FF0, 0x2FFB), array(0xE0020, 0xE007F)
- )
- ,'replacemaps' => array(0x41 => array(0x61), 0x42 => array(0x62), 0x43 => array(0x63)
- ,0x44 => array(0x64), 0x45 => array(0x65), 0x46 => array(0x66), 0x47 => array(0x67)
- ,0x48 => array(0x68), 0x49 => array(0x69), 0x4A => array(0x6A), 0x4B => array(0x6B)
- ,0x4C => array(0x6C), 0x4D => array(0x6D), 0x4E => array(0x6E), 0x4F => array(0x6F)
- ,0x50 => array(0x70), 0x51 => array(0x71), 0x52 => array(0x72), 0x53 => array(0x73)
- ,0x54 => array(0x74), 0x55 => array(0x75), 0x56 => array(0x76), 0x57 => array(0x77)
- ,0x58 => array(0x78), 0x59 => array(0x79), 0x5A => array(0x7A), 0xB5 => array(0x3BC)
- ,0xC0 => array(0xE0), 0xC1 => array(0xE1), 0xC2 => array(0xE2), 0xC3 => array(0xE3)
- ,0xC4 => array(0xE4), 0xC5 => array(0xE5), 0xC6 => array(0xE6), 0xC7 => array(0xE7)
- ,0xC8 => array(0xE8), 0xC9 => array(0xE9), 0xCA => array(0xEA), 0xCB => array(0xEB)
- ,0xCC => array(0xEC), 0xCD => array(0xED), 0xCE => array(0xEE), 0xCF => array(0xEF)
- ,0xD0 => array(0xF0), 0xD1 => array(0xF1), 0xD2 => array(0xF2), 0xD3 => array(0xF3)
- ,0xD4 => array(0xF4), 0xD5 => array(0xF5), 0xD6 => array(0xF6), 0xD8 => array(0xF8)
- ,0xD9 => array(0xF9), 0xDA => array(0xFA), 0xDB => array(0xFB), 0xDC => array(0xFC)
- ,0xDD => array(0xFD), 0xDE => array(0xFE), 0xDF => array(0x73, 0x73)
- ,0x100 => array(0x101), 0x102 => array(0x103), 0x104 => array(0x105)
- ,0x106 => array(0x107), 0x108 => array(0x109), 0x10A => array(0x10B)
- ,0x10C => array(0x10D), 0x10E => array(0x10F), 0x110 => array(0x111)
- ,0x112 => array(0x113), 0x114 => array(0x115), 0x116 => array(0x117)
- ,0x118 => array(0x119), 0x11A => array(0x11B), 0x11C => array(0x11D)
- ,0x11E => array(0x11F), 0x120 => array(0x121), 0x122 => array(0x123)
- ,0x124 => array(0x125), 0x126 => array(0x127), 0x128 => array(0x129)
- ,0x12A => array(0x12B), 0x12C => array(0x12D), 0x12E => array(0x12F)
- ,0x130 => array(0x69, 0x307), 0x132 => array(0x133), 0x134 => array(0x135)
- ,0x136 => array(0x137), 0x139 => array(0x13A), 0x13B => array(0x13C)
- ,0x13D => array(0x13E), 0x13F => array(0x140), 0x141 => array(0x142)
- ,0x143 => array(0x144), 0x145 => array(0x146), 0x147 => array(0x148)
- ,0x149 => array(0x2BC, 0x6E), 0x14A => array(0x14B), 0x14C => array(0x14D)
- ,0x14E => array(0x14F), 0x150 => array(0x151), 0x152 => array(0x153)
- ,0x154 => array(0x155), 0x156 => array(0x157), 0x158 => array(0x159)
- ,0x15A => array(0x15B), 0x15C => array(0x15D), 0x15E => array(0x15F)
- ,0x160 => array(0x161), 0x162 => array(0x163), 0x164 => array(0x165)
- ,0x166 => array(0x167), 0x168 => array(0x169), 0x16A => array(0x16B)
- ,0x16C => array(0x16D), 0x16E => array(0x16F), 0x170 => array(0x171)
- ,0x172 => array(0x173), 0x174 => array(0x175), 0x176 => array(0x177)
- ,0x178 => array(0xFF), 0x179 => array(0x17A), 0x17B => array(0x17C)
- ,0x17D => array(0x17E), 0x17F => array(0x73), 0x181 => array(0x253)
- ,0x182 => array(0x183), 0x184 => array(0x185), 0x186 => array(0x254)
- ,0x187 => array(0x188), 0x189 => array(0x256), 0x18A => array(0x257)
- ,0x18B => array(0x18C), 0x18E => array(0x1DD), 0x18F => array(0x259)
- ,0x190 => array(0x25B), 0x191 => array(0x192), 0x193 => array(0x260)
- ,0x194 => array(0x263), 0x196 => array(0x269), 0x197 => array(0x268)
- ,0x198 => array(0x199), 0x19C => array(0x26F), 0x19D => array(0x272)
- ,0x19F => array(0x275), 0x1A0 => array(0x1A1), 0x1A2 => array(0x1A3)
- ,0x1A4 => array(0x1A5), 0x1A6 => array(0x280), 0x1A7 => array(0x1A8)
- ,0x1A9 => array(0x283), 0x1AC => array(0x1AD), 0x1AE => array(0x288)
- ,0x1AF => array(0x1B0), 0x1B1 => array(0x28A), 0x1B2 => array(0x28B)
- ,0x1B3 => array(0x1B4), 0x1B5 => array(0x1B6), 0x1B7 => array(0x292)
- ,0x1B8 => array(0x1B9), 0x1BC => array(0x1BD), 0x1C4 => array(0x1C6)
- ,0x1C5 => array(0x1C6), 0x1C7 => array(0x1C9), 0x1C8 => array(0x1C9)
- ,0x1CA => array(0x1CC), 0x1CB => array(0x1CC), 0x1CD => array(0x1CE)
- ,0x1CF => array(0x1D0), 0x1D1 => array(0x1D2), 0x1D3 => array(0x1D4)
- ,0x1D5 => array(0x1D6), 0x1D7 => array(0x1D8), 0x1D9 => array(0x1DA)
- ,0x1DB => array(0x1DC), 0x1DE => array(0x1DF), 0x1E0 => array(0x1E1)
- ,0x1E2 => array(0x1E3), 0x1E4 => array(0x1E5), 0x1E6 => array(0x1E7)
- ,0x1E8 => array(0x1E9), 0x1EA => array(0x1EB), 0x1EC => array(0x1ED)
- ,0x1EE => array(0x1EF), 0x1F0 => array(0x6A, 0x30C), 0x1F1 => array(0x1F3)
- ,0x1F2 => array(0x1F3), 0x1F4 => array(0x1F5), 0x1F6 => array(0x195)
- ,0x1F7 => array(0x1BF), 0x1F8 => array(0x1F9), 0x1FA => array(0x1FB)
- ,0x1FC => array(0x1FD), 0x1FE => array(0x1FF), 0x200 => array(0x201)
- ,0x202 => array(0x203), 0x204 => array(0x205), 0x206 => array(0x207)
- ,0x208 => array(0x209), 0x20A => array(0x20B), 0x20C => array(0x20D)
- ,0x20E => array(0x20F), 0x210 => array(0x211), 0x212 => array(0x213)
- ,0x214 => array(0x215), 0x216 => array(0x217), 0x218 => array(0x219)
- ,0x21A => array(0x21B), 0x21C => array(0x21D), 0x21E => array(0x21F)
- ,0x220 => array(0x19E), 0x222 => array(0x223), 0x224 => array(0x225)
- ,0x226 => array(0x227), 0x228 => array(0x229), 0x22A => array(0x22B)
- ,0x22C => array(0x22D), 0x22E => array(0x22F), 0x230 => array(0x231)
- ,0x232 => array(0x233), 0x345 => array(0x3B9), 0x37A => array(0x20, 0x3B9)
- ,0x386 => array(0x3AC), 0x388 => array(0x3AD), 0x389 => array(0x3AE)
- ,0x38A => array(0x3AF), 0x38C => array(0x3CC), 0x38E => array(0x3CD)
- ,0x38F => array(0x3CE), 0x390 => array(0x3B9, 0x308, 0x301)
- ,0x391 => array(0x3B1), 0x392 => array(0x3B2), 0x393 => array(0x3B3)
- ,0x394 => array(0x3B4), 0x395 => array(0x3B5), 0x396 => array(0x3B6)
- ,0x397 => array(0x3B7), 0x398 => array(0x3B8), 0x399 => array(0x3B9)
- ,0x39A => array(0x3BA), 0x39B => array(0x3BB), 0x39C => array(0x3BC)
- ,0x39D => array(0x3BD), 0x39E => array(0x3BE), 0x39F => array(0x3BF)
- ,0x3A0 => array(0x3C0), 0x3A1 => array(0x3C1), 0x3A3 => array(0x3C3)
- ,0x3A4 => array(0x3C4), 0x3A5 => array(0x3C5), 0x3A6 => array(0x3C6)
- ,0x3A7 => array(0x3C7), 0x3A8 => array(0x3C8), 0x3A9 => array(0x3C9)
- ,0x3AA => array(0x3CA), 0x3AB => array(0x3CB), 0x3B0 => array(0x3C5, 0x308, 0x301)
- ,0x3C2 => array(0x3C3), 0x3D0 => array(0x3B2), 0x3D1 => array(0x3B8)
- ,0x3D2 => array(0x3C5), 0x3D3 => array(0x3CD), 0x3D4 => array(0x3CB)
- ,0x3D5 => array(0…
Large files files are truncated, but you can click here to view the full file