PageRenderTime 35ms CodeModel.GetById 9ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/external/idna_convert/src/IdnaConvert.php

https://bitbucket.org/navigatecms/navigatecms
PHP | 405 lines | 254 code | 33 blank | 118 comment | 45 complexity | a9473cc50392d9419e40121529a3e21e MD5 | raw file
Possible License(s): GPL-2.0, MIT, LGPL-2.1, BSD-3-Clause, AGPL-3.0, Apache-2.0
  1. <?php
  2. // {{{ license
  3. /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
  4. //
  5. // +----------------------------------------------------------------------+
  6. // | This library is free software; you can redistribute it and/or modify |
  7. // | it under the terms of the GNU Lesser General Public License as |
  8. // | published by the Free Software Foundation; either version 2.1 of the |
  9. // | License, or (at your option) any later version. |
  10. // | |
  11. // | This library is distributed in the hope that it will be useful, but |
  12. // | WITHOUT ANY WARRANTY; without even the implied warranty of |
  13. // | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
  14. // | Lesser General Public License for more details. |
  15. // | |
  16. // | You should have received a copy of the GNU Lesser General Public |
  17. // | License along with this library; if not, write to the Free Software |
  18. // | Foundation, Inc., 51 Franklin St, Boston, MA 02110, United States |
  19. // +----------------------------------------------------------------------+
  20. //
  21. // }}}
  22. /**
  23. * Encode/decode Internationalized Domain Names.
  24. *
  25. * The class allows to convert internationalized domain names
  26. * (see RFC 3490 for details) as they can be used with various registries worldwide
  27. * to be translated between their original (localized) form and their encoded form
  28. * as it will be used in the DNS (Domain Name System).
  29. *
  30. * The class provides two public methods, encode() and decode(), which do exactly
  31. * what you would expect them to do. You are allowed to use complete domain names,
  32. * simple strings and complete email addresses as well. That means, that you might
  33. * use any of the following notations:
  34. *
  35. * - www.nörgler.com
  36. * - xn--nrgler-wxa
  37. * - xn--brse-5qa.xn--knrz-1ra.info
  38. *
  39. * Unicode input might be given as either UTF-8 string, UCS-4 string or UCS-4 array.
  40. * Unicode output is available in the same formats.
  41. * You can select your preferred format via {@link setEncoding()}.
  42. *
  43. * ACE input and output is always expected to be ASCII.
  44. *
  45. * @author Matthias Sommerfeld <mso@phlylabs.de>
  46. * @copyright 2004-2016 phlyLabs Berlin, http://phlylabs.de
  47. * @version 1.0.1-dev 2016-01-12
  48. */
  49. namespace Mso\IdnaConvert;
  50. class IdnaConvert {
  51. const Version = '1.1.0';
  52. const SubVersion = 'main';
  53. // Internal settings, do not touch!
  54. protected $encoding = 'utf8'; // Default input charset is UTF-8
  55. protected $strictMode = false; // Behave strict or not
  56. protected $idnVersion = '2008'; // Can be either 2003 (old) or 2008 (default)
  57. protected $NamePrepData = null;
  58. protected $UnicodeTranscoder = null;
  59. /**
  60. * the constructor
  61. *
  62. * @param array|null $params Parameters to control the class' behaviour
  63. * @since 0.5.2
  64. */
  65. public function __construct($params = null)
  66. {
  67. $this->UnicodeTranscoder = new UnicodeTranscoder();
  68. // Kept for backwarsds compatibility. Consider using the setter methods instead.
  69. if (!empty($params) && is_array($params)) {
  70. if (isset($params['encoding'])) {
  71. $this->setEncoding($params['encoding']);
  72. }
  73. if (isset($params['idn_version'])) {
  74. $this->setIdnVersion($params['idn_version']);
  75. }
  76. if (isset($params['strict_mode'])) {
  77. $this->setStrictMode($params['strict_mode']);
  78. }
  79. }
  80. $this->setIdnVersion($this->idnVersion);
  81. }
  82. public function getClassVersion()
  83. {
  84. return self::Version.'-'.self::SubVersion;
  85. }
  86. /**
  87. * @return string
  88. */
  89. public function getEncoding()
  90. {
  91. return $this->encoding;
  92. }
  93. /**
  94. * @param string $encoding
  95. */
  96. public function setEncoding($encoding)
  97. {
  98. switch ($encoding) {
  99. case 'utf8':
  100. case 'ucs4_string':
  101. case 'ucs4_array':
  102. $this->encoding = $encoding;
  103. break;
  104. default:
  105. throw new \InvalidArgumentException(sprintf('Invalid encoding %s', $encoding));
  106. }
  107. }
  108. /**
  109. * @return boolean
  110. */
  111. public function isStrictMode()
  112. {
  113. return $this->strictMode;
  114. }
  115. /**
  116. * @param boolean $strictMode
  117. */
  118. public function setStrictMode($strictMode)
  119. {
  120. $this->strictMode = ($strictMode) ? true : false;
  121. }
  122. /**
  123. * @return int
  124. */
  125. public function getIdnVersion()
  126. {
  127. return $this->idnVersion;
  128. }
  129. /**
  130. * @param int $idnVersion
  131. */
  132. public function setIdnVersion($idnVersion)
  133. {
  134. if (in_array($idnVersion, array('2003', '2008'))) {
  135. if (is_null($this->NamePrepData) || $idnVersion != $this->idnVersion) {
  136. $this->NamePrepData = null; // Ought to destroy the object's reference
  137. // Re-instantiate with different data set
  138. $this->NamePrepData = ($idnVersion == 2003)
  139. ? new NamePrepData2003()
  140. : new NamePrepData();
  141. }
  142. $this->idnVersion = $idnVersion;
  143. } else {
  144. throw new \InvalidArgumentException(sprintf('Invalid IDN version %d', $idnVersion));
  145. }
  146. }
  147. /**
  148. * Decode a given ACE domain name
  149. * @param string $input Domain name (ACE string)
  150. * [@param string $one_time_encoding Desired output encoding]
  151. * @return string Decoded Domain name (UTF-8 or UCS-4)
  152. */
  153. public function decode($input, $one_time_encoding = null)
  154. {
  155. $punyCode = $this->punycodeFactory();
  156. // Optionally set
  157. if ($one_time_encoding) {
  158. switch ($one_time_encoding) {
  159. case 'utf8':
  160. case 'ucs4_string':
  161. case 'ucs4_array':
  162. break;
  163. default:
  164. throw new \InvalidArgumentException(sprintf('Invalid encoding %s', $one_time_encoding));
  165. }
  166. }
  167. // Make sure to drop any newline characters around
  168. $input = trim($input);
  169. // Negotiate input and try to determine, whether it is a plain string,
  170. // an email address or something like a complete URL
  171. if (strpos($input, '@')) { // Maybe it is an email address
  172. // No no in strict mode
  173. if ($this->strictMode) {
  174. throw new \InvalidArgumentException('Only individual domain name parts can be handled in strict mode');
  175. }
  176. list ($email_pref, $input) = explode('@', $input, 2);
  177. $arr = explode('.', $input);
  178. foreach ($arr as $k => $v) {
  179. $conv = $punyCode->decode($v);
  180. if ($conv) {
  181. $arr[$k] = $conv;
  182. }
  183. }
  184. $input = join('.', $arr);
  185. $arr = explode('.', $email_pref);
  186. foreach ($arr as $k => $v) {
  187. $conv = $punyCode->decode($v);
  188. if ($conv) {
  189. $arr[$k] = $conv;
  190. }
  191. }
  192. $email_pref = join('.', $arr);
  193. $return = $email_pref . '@' . $input;
  194. } elseif (preg_match('![:\./]!', $input)) { // Or a complete domain name (with or without paths / parameters)
  195. // No no in strict mode
  196. if ($this->strictMode) {
  197. throw new \InvalidArgumentException('Only individual domain name parts can be handled in strict mode');
  198. }
  199. $parsed = parse_url($input);
  200. if (isset($parsed['host'])) {
  201. $arr = explode('.', $parsed['host']);
  202. foreach ($arr as $k => $v) {
  203. $conv = $punyCode->decode($v);
  204. if ($conv) {
  205. $arr[$k] = $conv;
  206. }
  207. }
  208. $parsed['host'] = join('.', $arr);
  209. $return = (empty($parsed['scheme']) ? '' : $parsed['scheme'] . (strtolower($parsed['scheme']) == 'mailto' ? ':' : '://')).
  210. (empty($parsed['user']) ? '' : $parsed['user'] . (empty($parsed['pass']) ? '' : ':' . $parsed['pass']) . '@').
  211. $parsed['host'].
  212. (empty($parsed['port']) ? '' : ':' . $parsed['port']).
  213. (empty($parsed['path']) ? '' : $parsed['path']).
  214. (empty($parsed['query']) ? '' : '?' . $parsed['query']).
  215. (empty($parsed['fragment']) ? '' : '#' . $parsed['fragment']);
  216. } else { // parse_url seems to have failed, try without it
  217. $arr = explode('.', $input);
  218. foreach ($arr as $k => $v) {
  219. $conv = $punyCode->decode($v);
  220. if ($conv) {
  221. $arr[$k] = $conv;
  222. }
  223. }
  224. $return = join('.', $arr);
  225. }
  226. } else { // Otherwise we consider it being a pure domain name string
  227. $return = $punyCode->decode($input);
  228. if (!$return) {
  229. $return = $input;
  230. }
  231. }
  232. // The output is UTF-8 by default, other output formats need conversion here
  233. // If one time encoding is given, use this, else the objects property
  234. $outputEncoding = ($one_time_encoding) ? $one_time_encoding : $this->encoding;
  235. switch ($outputEncoding) {
  236. case 'utf8':
  237. return $return; // break;
  238. case 'ucs4_string':
  239. return $this->UnicodeTranscoder->convert($return, 'utf8', 'ucs4'); // break;
  240. case 'ucs4_array':
  241. return $this->UnicodeTranscoder->convert($return, 'utf8', 'ucs4array'); // break;
  242. default:
  243. throw new \InvalidArgumentException(sprintf('Unsupported output encoding %s', $outputEncoding));
  244. }
  245. }
  246. /**
  247. * Encode a given UTF-8 domain name
  248. * @param string $decoded Domain name (UTF-8 or UCS-4)
  249. * [@param boolean $one_time_encoding Desired input encoding, see {@link set_parameter}]
  250. * @return string Encoded Domain name (ACE string)
  251. */
  252. public function encode($decoded, $one_time_encoding = false)
  253. {
  254. // Forcing conversion of input to UCS4 array
  255. // If one time encoding is given, use this, else the objects property
  256. $inputEncoding = $one_time_encoding ? $one_time_encoding : $this->encoding;
  257. switch ($inputEncoding) {
  258. case 'utf8':
  259. $decoded = $this->UnicodeTranscoder->convert($decoded, 'utf8', 'ucs4array');
  260. break;
  261. case 'ucs4_string':
  262. $decoded = $this->UnicodeTranscoder->convert($decoded, 'ucs4', 'ucs4array');
  263. break;
  264. case 'ucs4_array':
  265. break;
  266. default:
  267. throw new \InvalidArgumentException(sprintf('Unsupported input encoding %s', $inputEncoding));
  268. }
  269. // No input, no output, what else did you expect?
  270. if (empty($decoded)) {
  271. return '';
  272. }
  273. $punyCode = $this->punycodeFactory();
  274. // Anchors for iteration
  275. $last_begin = 0;
  276. // Output string
  277. $output = '';
  278. foreach ($decoded as $k => $v) {
  279. // Make sure to use just the plain dot
  280. switch ($v) {
  281. case 0x3002:
  282. case 0xFF0E:
  283. case 0xFF61:
  284. $decoded[$k] = 0x2E;
  285. // Right, no break here, the above are converted to dots anyway
  286. // Stumbling across an anchoring character
  287. case 0x2E:
  288. case 0x2F:
  289. case 0x3A:
  290. case 0x3F:
  291. case 0x40:
  292. // Neither email addresses nor URLs allowed in strict mode
  293. if ($this->strictMode) {
  294. throw new \InvalidArgumentException('Neither email addresses nor URLs are allowed in strict mode.');
  295. } else {
  296. // Skip first char
  297. if ($k) {
  298. $encoded = $punyCode->encode(array_slice($decoded, $last_begin, (($k) - $last_begin)));
  299. if ($encoded) {
  300. $output .= $encoded;
  301. } else {
  302. $output .= $this->UnicodeTranscoder->convert(array_slice($decoded, $last_begin, (($k) - $last_begin)), 'ucs4array', 'utf8');
  303. }
  304. $output .= chr($decoded[$k]);
  305. }
  306. $last_begin = $k + 1;
  307. }
  308. }
  309. }
  310. // Catch the rest of the string
  311. if ($last_begin) {
  312. $inp_len = sizeof($decoded);
  313. $encoded = $punyCode->encode(array_slice($decoded, $last_begin, (($inp_len) - $last_begin)));
  314. if ($encoded) {
  315. $output .= $encoded;
  316. } else {
  317. $output .= $this->UnicodeTranscoder->convert(array_slice($decoded, $last_begin, (($inp_len) - $last_begin)), 'ucs4array', 'utf8');
  318. }
  319. return $output;
  320. } else {
  321. if (false !== ($output = $punyCode->encode($decoded))) {
  322. return $output;
  323. } else {
  324. return $this->UnicodeTranscoder->convert($decoded, 'ucs4array', 'utf8');
  325. }
  326. }
  327. }
  328. /**
  329. * Mitigates a weakness of encode(), which cannot properly handle URIs but instead encodes their
  330. * path or query components, too.
  331. * @param string $uri Expects the URI as a UTF-8 (or ASCII) string
  332. * @return string The URI encoded to Punycode, everything but the host component is left alone
  333. * @since 0.6.4
  334. */
  335. public function encodeUri($uri)
  336. {
  337. $parsed = parse_url($uri);
  338. if (!isset($parsed['host'])) {
  339. throw new \InvalidArgumentException('The given string does not look like a URI');
  340. }
  341. $arr = explode('.', $parsed['host']);
  342. foreach ($arr as $k => $v) {
  343. $conv = $this->encode($v, 'utf8');
  344. if ($conv) {
  345. $arr[$k] = $conv;
  346. }
  347. }
  348. $parsed['host'] = join('.', $arr);
  349. $return = (empty($parsed['scheme']) ? '' : $parsed['scheme'] . (strtolower($parsed['scheme']) == 'mailto' ? ':' : '://')).
  350. (empty($parsed['user']) ? '' : $parsed['user'] . (empty($parsed['pass']) ? '' : ':' . $parsed['pass']) . '@').
  351. $parsed['host'].
  352. (empty($parsed['port']) ? '' : ':' . $parsed['port']).
  353. (empty($parsed['path']) ? '' : $parsed['path']).
  354. (empty($parsed['query']) ? '' : '?' . $parsed['query']).
  355. (empty($parsed['fragment']) ? '' : '#' . $parsed['fragment']);
  356. return $return;
  357. }
  358. /**
  359. * The actual punycode class is rather costly, as well as passing the huge nameprep database around.
  360. * This factory method allows to ease the burden when dealing with multiple IDN versions.
  361. *
  362. * @return \Mso\IdnaConvert\Punycode
  363. */
  364. protected function punycodeFactory()
  365. {
  366. static $instances = array();
  367. if (!isset($instances[$this->idnVersion])) {
  368. $instances[$this->idnVersion] = new Punycode($this->NamePrepData, $this->UnicodeTranscoder);
  369. }
  370. return $instances[$this->idnVersion];
  371. }
  372. }