PageRenderTime 125ms CodeModel.GetById 70ms app.highlight 18ms RepoModel.GetById 32ms app.codeStats 1ms

/lib/external/idna_convert/src/IdnaConvert.php

https://bitbucket.org/navigatecms/navigatecms
PHP | 405 lines | 254 code | 33 blank | 118 comment | 45 complexity | a9473cc50392d9419e40121529a3e21e MD5 | raw file
  1<?php
  2
  3// {{{ license
  4
  5/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
  6//
  7// +----------------------------------------------------------------------+
  8// | This library is free software; you can redistribute it and/or modify |
  9// | it under the terms of the GNU Lesser General Public License as       |
 10// | published by the Free Software Foundation; either version 2.1 of the |
 11// | License, or (at your option) any later version.                      |
 12// |                                                                      |
 13// | This library is distributed in the hope that it will be useful, but  |
 14// | WITHOUT ANY WARRANTY; without even the implied warranty of           |
 15// | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU    |
 16// | Lesser General Public License for more details.                      |
 17// |                                                                      |
 18// | You should have received a copy of the GNU Lesser General Public     |
 19// | License along with this library; if not, write to the Free Software  |
 20// | Foundation, Inc., 51 Franklin St, Boston, MA 02110, United States    |
 21// +----------------------------------------------------------------------+
 22//
 23// }}}
 24
 25/**
 26 * Encode/decode Internationalized Domain Names.
 27 *
 28 * The class allows to convert internationalized domain names
 29 * (see RFC 3490 for details) as they can be used with various registries worldwide
 30 * to be translated between their original (localized) form and their encoded form
 31 * as it will be used in the DNS (Domain Name System).
 32 *
 33 * The class provides two public methods, encode() and decode(), which do exactly
 34 * what you would expect them to do. You are allowed to use complete domain names,
 35 * simple strings and complete email addresses as well. That means, that you might
 36 * use any of the following notations:
 37 *
  38 * - www.nörgler.com
 39 * - xn--nrgler-wxa
 40 * - xn--brse-5qa.xn--knrz-1ra.info
 41 *
 42 * Unicode input might be given as either UTF-8 string, UCS-4 string or UCS-4 array.
 43 * Unicode output is available in the same formats.
  44 * You can select your preferred format via {@link setEncoding()}.
 45 *
 46 * ACE input and output is always expected to be ASCII.
 47 *
 48 * @author  Matthias Sommerfeld <mso@phlylabs.de>
 49 * @copyright 2004-2016 phlyLabs Berlin, http://phlylabs.de
 50 * @version 1.0.1-dev 2016-01-12
 51 */
 52
 53namespace Mso\IdnaConvert;
 54
 55class IdnaConvert {
 56
 57    const Version = '1.1.0';
 58    const SubVersion = 'main';
 59
 60    // Internal settings, do not touch!
 61    protected $encoding = 'utf8';          // Default input charset is UTF-8
 62    protected $strictMode = false;         // Behave strict or not
 63    protected $idnVersion = '2008';          // Can be either 2003 (old) or 2008 (default)
 64
 65    protected $NamePrepData = null;
 66    protected $UnicodeTranscoder = null;
 67
 68    /**
 69     * the constructor
 70     *
 71     * @param array|null $params Parameters to control the class' behaviour
 72     * @since 0.5.2
 73     */
 74    public function __construct($params = null)
 75    {
 76        $this->UnicodeTranscoder = new UnicodeTranscoder();
 77
 78        // Kept for backwarsds compatibility. Consider using the setter methods instead.
 79        if (!empty($params) && is_array($params)) {
 80            if (isset($params['encoding'])) {
 81                $this->setEncoding($params['encoding']);
 82            }
 83
 84            if (isset($params['idn_version'])) {
 85                $this->setIdnVersion($params['idn_version']);
 86            }
 87
 88            if (isset($params['strict_mode'])) {
 89                $this->setStrictMode($params['strict_mode']);
 90            }
 91        }
 92
 93        $this->setIdnVersion($this->idnVersion);
 94    }
 95
 96    public function getClassVersion()
 97    {
 98        return self::Version.'-'.self::SubVersion;
 99    }
100
101    /**
102     * @return string
103     */
104    public function getEncoding()
105    {
106        return $this->encoding;
107    }
108
109    /**
110     * @param string $encoding
111     */
112    public function setEncoding($encoding)
113    {
114        switch ($encoding) {
115            case 'utf8':
116            case 'ucs4_string':
117            case 'ucs4_array':
118                $this->encoding = $encoding;
119                break;
120            default:
121                throw new \InvalidArgumentException(sprintf('Invalid encoding %s', $encoding));
122        }
123    }
124
125    /**
126     * @return boolean
127     */
128    public function isStrictMode()
129    {
130        return $this->strictMode;
131    }
132
133    /**
134     * @param boolean $strictMode
135     */
136    public function setStrictMode($strictMode)
137    {
138        $this->strictMode = ($strictMode) ? true : false;
139    }
140
141    /**
142     * @return int
143     */
144    public function getIdnVersion()
145    {
146        return $this->idnVersion;
147    }
148
149    /**
150     * @param int $idnVersion
151     */
152    public function setIdnVersion($idnVersion)
153    {
154        if (in_array($idnVersion, array('2003', '2008'))) {
155            if (is_null($this->NamePrepData) || $idnVersion != $this->idnVersion) {
156                $this->NamePrepData = null; // Ought to destroy the object's reference
157                // Re-instantiate with different data set
158                $this->NamePrepData = ($idnVersion == 2003)
159                        ? new NamePrepData2003()
160                        : new NamePrepData();
161            }
162
163            $this->idnVersion = $idnVersion;
164
165        } else {
166            throw new \InvalidArgumentException(sprintf('Invalid IDN version %d', $idnVersion));
167        }
168    }
169
170    /**
171     * Decode a given ACE domain name
172     * @param string $input  Domain name (ACE string)
173     * [@param string $one_time_encoding  Desired output encoding]
174     * @return string  Decoded Domain name (UTF-8 or UCS-4)
175     */
176    public function decode($input, $one_time_encoding = null)
177    {
178        $punyCode = $this->punycodeFactory();
179
180        // Optionally set
181        if ($one_time_encoding) {
182            switch ($one_time_encoding) {
183                case 'utf8':
184                case 'ucs4_string':
185                case 'ucs4_array':
186                    break;
187                default:
188                    throw new \InvalidArgumentException(sprintf('Invalid encoding %s', $one_time_encoding));
189            }
190        }
191        // Make sure to drop any newline characters around
192        $input = trim($input);
193
194        // Negotiate input and try to determine, whether it is a plain string,
195        // an email address or something like a complete URL
196        if (strpos($input, '@')) { // Maybe it is an email address
197            // No no in strict mode
198            if ($this->strictMode) {
199                throw new \InvalidArgumentException('Only individual domain name parts can be handled in strict mode');
200            }
201            list ($email_pref, $input) = explode('@', $input, 2);
202            $arr = explode('.', $input);
203            foreach ($arr as $k => $v) {
204                $conv = $punyCode->decode($v);
205                if ($conv) {
206                    $arr[$k] = $conv;
207                }
208            }
209            $input = join('.', $arr);
210            $arr = explode('.', $email_pref);
211            foreach ($arr as $k => $v) {
212                $conv = $punyCode->decode($v);
213                if ($conv) {
214                    $arr[$k] = $conv;
215                }
216            }
217            $email_pref = join('.', $arr);
218            $return = $email_pref . '@' . $input;
219        } elseif (preg_match('![:\./]!', $input)) { // Or a complete domain name (with or without paths / parameters)
220            // No no in strict mode
221            if ($this->strictMode) {
222                throw new \InvalidArgumentException('Only individual domain name parts can be handled in strict mode');
223            }
224            $parsed = parse_url($input);
225            if (isset($parsed['host'])) {
226                $arr = explode('.', $parsed['host']);
227                foreach ($arr as $k => $v) {
228                    $conv = $punyCode->decode($v);
229                    if ($conv) {
230                        $arr[$k] = $conv;
231                    }
232                }
233                $parsed['host'] = join('.', $arr);
234                $return = (empty($parsed['scheme']) ? '' : $parsed['scheme'] . (strtolower($parsed['scheme']) == 'mailto' ? ':' : '://')).
235                        (empty($parsed['user']) ? '' : $parsed['user'] . (empty($parsed['pass']) ? '' : ':' . $parsed['pass']) . '@').
236                        $parsed['host'].
237                        (empty($parsed['port']) ? '' : ':' . $parsed['port']).
238                        (empty($parsed['path']) ? '' : $parsed['path']).
239                        (empty($parsed['query']) ? '' : '?' . $parsed['query']).
240                        (empty($parsed['fragment']) ? '' : '#' . $parsed['fragment']);
241            } else { // parse_url seems to have failed, try without it
242                $arr = explode('.', $input);
243                foreach ($arr as $k => $v) {
244                    $conv = $punyCode->decode($v);
245                    if ($conv) {
246                        $arr[$k] = $conv;
247                    }
248                }
249                $return = join('.', $arr);
250            }
251        } else { // Otherwise we consider it being a pure domain name string
252            $return = $punyCode->decode($input);
253            if (!$return) {
254                $return = $input;
255            }
256        }
257        // The output is UTF-8 by default, other output formats need conversion here
258        // If one time encoding is given, use this, else the objects property
259        $outputEncoding = ($one_time_encoding) ? $one_time_encoding : $this->encoding;
260        switch ($outputEncoding) {
261            case 'utf8':
262                return $return; // break;
263            case 'ucs4_string':
264                return $this->UnicodeTranscoder->convert($return, 'utf8', 'ucs4');  // break;
265            case 'ucs4_array':
266                return $this->UnicodeTranscoder->convert($return, 'utf8', 'ucs4array');  // break;
267            default:
268                throw new \InvalidArgumentException(sprintf('Unsupported output encoding %s', $outputEncoding));
269        }
270    }
271
272    /**
273     * Encode a given UTF-8 domain name
274     * @param string $decoded  Domain name (UTF-8 or UCS-4)
275     * [@param boolean  $one_time_encoding  Desired input encoding, see {@link set_parameter}]
276     * @return string   Encoded Domain name (ACE string)
277     */
278    public function encode($decoded, $one_time_encoding = false)
279    {
280        // Forcing conversion of input to UCS4 array
281        // If one time encoding is given, use this, else the objects property
282        $inputEncoding = $one_time_encoding ? $one_time_encoding : $this->encoding;
283        switch ($inputEncoding) {
284            case 'utf8':
285                $decoded = $this->UnicodeTranscoder->convert($decoded, 'utf8', 'ucs4array');
286                break;
287            case 'ucs4_string':
288                $decoded = $this->UnicodeTranscoder->convert($decoded, 'ucs4', 'ucs4array');
289                break;
290            case 'ucs4_array':
291                break;
292            default:
293                throw new \InvalidArgumentException(sprintf('Unsupported input encoding %s', $inputEncoding));
294        }
295
296        // No input, no output, what else did you expect?
297        if (empty($decoded)) {
298            return '';
299        }
300
301        $punyCode = $this->punycodeFactory();
302
303        // Anchors for iteration
304        $last_begin = 0;
305        // Output string
306        $output = '';
307        foreach ($decoded as $k => $v) {
308            // Make sure to use just the plain dot
309            switch ($v) {
310                case 0x3002:
311                case 0xFF0E:
312                case 0xFF61:
313                    $decoded[$k] = 0x2E;
314                    // Right, no break here, the above are converted to dots anyway
315                // Stumbling across an anchoring character
316                case 0x2E:
317                case 0x2F:
318                case 0x3A:
319                case 0x3F:
320                case 0x40:
321                    // Neither email addresses nor URLs allowed in strict mode
322                    if ($this->strictMode) {
323                        throw new \InvalidArgumentException('Neither email addresses nor URLs are allowed in strict mode.');
324                    } else {
325                        // Skip first char
326                        if ($k) {
327                            $encoded = $punyCode->encode(array_slice($decoded, $last_begin, (($k) - $last_begin)));
328                            if ($encoded) {
329                                $output .= $encoded;
330                            } else {
331                                $output .= $this->UnicodeTranscoder->convert(array_slice($decoded, $last_begin, (($k) - $last_begin)), 'ucs4array', 'utf8');
332                            }
333                            $output .= chr($decoded[$k]);
334                        }
335                        $last_begin = $k + 1;
336                    }
337            }
338        }
339        // Catch the rest of the string
340        if ($last_begin) {
341            $inp_len = sizeof($decoded);
342            $encoded = $punyCode->encode(array_slice($decoded, $last_begin, (($inp_len) - $last_begin)));
343            if ($encoded) {
344                $output .= $encoded;
345            } else {
346                $output .= $this->UnicodeTranscoder->convert(array_slice($decoded, $last_begin, (($inp_len) - $last_begin)), 'ucs4array', 'utf8');
347            }
348            return $output;
349        } else {
350            if (false !== ($output = $punyCode->encode($decoded))) {
351                return $output;
352            } else {
353                return $this->UnicodeTranscoder->convert($decoded, 'ucs4array', 'utf8');
354            }
355        }
356    }
357
358    /**
359     * Mitigates a weakness of encode(), which cannot properly handle URIs but instead encodes their
360     * path or query components, too.
361     * @param string  $uri  Expects the URI as a UTF-8 (or ASCII) string
362     * @return  string  The URI encoded to Punycode, everything but the host component is left alone
363     * @since 0.6.4
364     */
365    public function encodeUri($uri)
366    {
367        $parsed = parse_url($uri);
368        if (!isset($parsed['host'])) {
369            throw new \InvalidArgumentException('The given string does not look like a URI');
370        }
371        $arr = explode('.', $parsed['host']);
372        foreach ($arr as $k => $v) {
373            $conv = $this->encode($v, 'utf8');
374            if ($conv) {
375                $arr[$k] = $conv;
376            }
377        }
378        $parsed['host'] = join('.', $arr);
379        $return = (empty($parsed['scheme']) ? '' : $parsed['scheme'] . (strtolower($parsed['scheme']) == 'mailto' ? ':' : '://')).
380                (empty($parsed['user']) ? '' : $parsed['user'] . (empty($parsed['pass']) ? '' : ':' . $parsed['pass']) . '@').
381                $parsed['host'].
382                (empty($parsed['port']) ? '' : ':' . $parsed['port']).
383                (empty($parsed['path']) ? '' : $parsed['path']).
384                (empty($parsed['query']) ? '' : '?' . $parsed['query']).
385                (empty($parsed['fragment']) ? '' : '#' . $parsed['fragment']);
386        return $return;
387    }
388
389    /**
390     * The actual punycode class is rather costly, as well as passing the huge nameprep database around.
391     * This factory method allows to ease the burden when dealing with multiple IDN versions.
392     *
393     * @return \Mso\IdnaConvert\Punycode
394     */
395    protected function punycodeFactory()
396    {
397        static $instances = array();
398
399        if (!isset($instances[$this->idnVersion])) {
400            $instances[$this->idnVersion] = new Punycode($this->NamePrepData, $this->UnicodeTranscoder);
401        }
402        return $instances[$this->idnVersion];
403    }
404
405}