PageRenderTime 52ms CodeModel.GetById 26ms RepoModel.GetById 0ms app.codeStats 0ms

/tests/utf/normalizer_test.php

http://github.com/phpbb/phpbb3
PHP | 327 lines | 203 code | 49 blank | 75 comment | 28 complexity | 56d51d283b4d082eef661ba162cfd8f8 MD5 | raw file
Possible License(s): AGPL-1.0
  1. <?php
  2. /**
  3. *
  4. * This file is part of the phpBB Forum Software package.
  5. *
  6. * @copyright (c) phpBB Limited <https://www.phpbb.com>
  7. * @license GNU General Public License, version 2 (GPL-2.0)
  8. *
  9. * For full copyright and license information, please see
  10. * the docs/CREDITS.txt file.
  11. *
  12. */
  13. require_once dirname(__FILE__) . '/../../phpBB/includes/utf/utf_normalizer.php';
  14. /**
  15. * @group slow
  16. */
  17. class phpbb_utf_normalizer_test extends phpbb_test_case
  18. {
  /**
  * Directory the Unicode data files are stored in
  *
  * @var string
  */
  static private $data_dir;

  /**
  * Point the tests at the data directory and fetch the two Unicode data files
  */
  static public function setUpBeforeClass()
  {
      self::$data_dir = dirname(__FILE__) . '/../tmp';

      $sources = array(
          'http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt',
          'http://www.unicode.org/Public/UNIDATA/UnicodeData.txt',
      );

      foreach ($sources as $url)
      {
          self::download($url, self::$data_dir);
      }
  }
  /**
  * Run every normalization form against the data lines of NormalizationTest.txt
  *
  * Each data line is split on ';' and its first five fields are used as the
  * columns c1..c5, each a space-separated list of hexadecimal codepoints.
  * For every form, each source column is normalized and the result is
  * compared with the column that form is expected to produce.
  *
  * @return array Map keyed by the c1 value of every line whose c1 holds a single codepoint
  * @throws RuntimeException If the data file cannot be opened
  */
  public function test_normalizer()
  {
      // form => (expected column => source columns that must normalize to it)
      $test_suite = array(
          /**
          * NFC
          * c2 == NFC(c1) == NFC(c2) == NFC(c3)
          * c4 == NFC(c4) == NFC(c5)
          */
          'NFC' => array(
              'c2' => array('c1', 'c2', 'c3'),
              'c4' => array('c4', 'c5')
          ),

          /**
          * NFD
          * c3 == NFD(c1) == NFD(c2) == NFD(c3)
          * c5 == NFD(c4) == NFD(c5)
          */
          'NFD' => array(
              'c3' => array('c1', 'c2', 'c3'),
              'c5' => array('c4', 'c5')
          ),

          /**
          * NFKC
          * c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
          */
          'NFKC' => array(
              'c4' => array('c1', 'c2', 'c3', 'c4', 'c5')
          ),

          /**
          * NFKD
          * c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
          */
          'NFKD' => array(
              'c5' => array('c1', 'c2', 'c3', 'c4', 'c5')
          )
      );

      $path = self::$data_dir . '/NormalizationTest.txt';
      $fp = fopen($path, 'rb');

      if ($fp === false)
      {
          // An invalid handle never reports EOF, so reading on would not terminate
          throw new RuntimeException("Could not open $path for reading");
      }

      $tested_chars = array();

      // fgets() returns false at EOF, and a line it does return is never empty
      while (($line = fgets($fp)) !== false)
      {
          // Only lines starting with an uppercase hex digit carry data; this
          // also skips the lines starting with '@', a space or a newline
          if (strpos('0123456789ABCDEF', $line[0]) === false)
          {
              continue;
          }

          $fields = explode(';', $line);

          if (!isset($fields[4]))
          {
              // Fewer than five columns, nothing to compare
              continue;
          }

          $columns = array(
              'c1' => $fields[0],
              'c2' => $fields[1],
              'c3' => $fields[2],
              'c4' => $fields[3],
              'c5' => $fields[4]
          );

          if (strpos($columns['c1'], ' ') === false)
          {
              /**
              * We are currently testing a single character, we add it to the list of
              * characters we have processed so that we can exclude it when testing
              * for invariants
              */
              $tested_chars[$columns['c1']] = 1;
          }

          foreach ($test_suite as $form => $serie)
          {
              foreach ($serie as $expected => $tests)
              {
                  $hex_expected = $columns[$expected];
                  $utf_expected = $this->hexseq_to_utf($hex_expected);

                  foreach ($tests as $test)
                  {
                      // Normalize the source column, not the expected one;
                      // otherwise only the expected value itself is checked
                      $utf_result = $this->hexseq_to_utf($columns[$test]);
                      call_user_func_array(array('utf_normalizer', $form), array(&$utf_result));

                      $hex_result = $this->utf_to_hexseq($utf_result);
                      $this->assertEquals($utf_expected, $utf_result, "$expected == $form($test) ($hex_expected != $hex_result)");
                  }
              }
          }
      }

      fclose($fp);

      return $tested_chars;
  }
  /**
  * Check the characters of UnicodeData.txt that test_normalizer() did not cover
  *
  * Every such character is expected to come out of all four normalization
  * forms unchanged, except for lone surrogates, for which the replacement
  * character is expected.
  *
  * @depends test_normalizer
  *
  * @param array $tested_chars Map keyed by the hex codepoints already covered by test_normalizer()
  * @throws RuntimeException If the data file cannot be opened
  */
  public function test_invariants(array $tested_chars)
  {
      $path = self::$data_dir . '/UnicodeData.txt';
      $fp = fopen($path, 'rb');

      if ($fp === false)
      {
          // An invalid handle never reports EOF, so reading on would not terminate
          throw new RuntimeException("Could not open $path for reading");
      }

      // fgets() returns false at EOF, which ends the loop
      while (($line = fgets($fp, 1024)) !== false)
      {
          // The codepoint is everything before the first ';'; lines without
          // one, or starting with one, are skipped
          $pos = strpos($line, ';');

          if (!$pos)
          {
              continue;
          }

          $hex_tested = $hex_expected = substr($line, 0, $pos);

          if (isset($tested_chars[$hex_tested]))
          {
              continue;
          }

          $utf_expected = $this->hex_to_utf($hex_expected);

          if ($utf_expected >= UTF8_SURROGATE_FIRST
              && $utf_expected <= UTF8_SURROGATE_LAST)
          {
              /**
              * Surrogates are illegal on their own, we expect the normalizer
              * to return a replacement char
              */
              $utf_expected = UTF8_REPLACEMENT;
              $hex_expected = $this->utf_to_hexseq($utf_expected);
          }

          foreach (array('nfc', 'nfkc', 'nfd', 'nfkd') as $form)
          {
              $utf_result = $utf_expected;
              call_user_func_array(array('utf_normalizer', $form), array(&$utf_result));

              $hex_result = $this->utf_to_hexseq($utf_result);
              $this->assertEquals($utf_expected, $utf_result, "$hex_expected == $form($hex_tested) ($hex_expected != $hex_result)");
          }
      }

      fclose($fp);
  }
  /**
  * Render a UTF-8 string as a list of its codepoints in hexadecimal
  *
  * @param string $str UTF-8 string
  * @return string Codepoints in uppercase hex, zero-padded to at least four digits, separated with a space
  */
  protected function utf_to_hexseq($str)
  {
      $codepoints = array();
      $total = strlen($str);

      for ($offset = 0; $offset < $total; $offset += $width)
      {
          // The high nibble of the lead byte gives the length of the sequence
          $lead = ord($str[$offset]) & 0xF0;

          if ($lead === 0xF0)
          {
              $width = 4;
          }
          else if ($lead === 0xE0)
          {
              $width = 3;
          }
          else if ($lead === 0xC0 || $lead === 0xD0)
          {
              $width = 2;
          }
          else
          {
              $width = 1;
          }

          $digits = dechex($this->utf_to_cp(substr($str, $offset, $width)));

          while (strlen($digits) < 4)
          {
              $digits = '0' . $digits;
          }

          $codepoints[] = $digits;
      }

      return strtr(implode(' ', $codepoints), 'abcdef', 'ABCDEF');
  }
  /**
  * Decode a single UTF-8 encoded character into its codepoint
  *
  * @param string $utf_char UTF-8 char, 1 to 4 bytes long
  * @return integer Unicode codepoint
  * @throws RuntimeException If the char is not 1 to 4 bytes long
  */
  protected function utf_to_cp($utf_char)
  {
      $size = strlen($utf_char);

      if ($size < 1 || $size > 4)
      {
          throw new RuntimeException('UTF-8 chars can only be 1-4 bytes long');
      }

      if ($size === 1)
      {
          return ord($utf_char);
      }

      // Payload bits carried by the lead byte, by sequence length
      $lead_masks = array(2 => 0x1F, 3 => 0x0F, 4 => 0x07);
      $codepoint = ord($utf_char[0]) & $lead_masks[$size];

      // Each following byte contributes its low six bits
      for ($i = 1; $i < $size; ++$i)
      {
          $codepoint = ($codepoint << 6) | (ord($utf_char[$i]) & 0x3F);
      }

      return $codepoint;
  }
  /**
  * Build a UTF-8 string out of a list of codepoints in hexadecimal
  *
  * @param string $seq Codepoints in hex, separated with a space
  * @return string UTF-8 string
  */
  protected function hexseq_to_utf($seq)
  {
      $utf = '';

      foreach (explode(' ', $seq) as $hex)
      {
          $utf .= $this->hex_to_utf($hex);
      }

      return $utf;
  }
  /**
  * Encode one codepoint, given in hexadecimal, as a UTF-8 char
  *
  * @param string $hex Codepoint, in hexadecimal
  * @return string UTF-8 char
  */
  protected function hex_to_utf($hex)
  {
      $codepoint = hexdec($hex);

      return $this->cp_to_utf($codepoint);
  }
  /**
  * Encode a codepoint as a UTF-8 char
  *
  * @param integer $cp Unicode codepoint
  * @return string UTF-8 char, 1 to 4 bytes long
  */
  protected function cp_to_utf($cp)
  {
      if ($cp <= 0x7F)
      {
          return chr($cp);
      }

      // Last continuation byte, shared by every multibyte form
      $tail = chr(0x80 | ($cp & 0x3F));

      if ($cp <= 0x7FF)
      {
          return chr(0xC0 | ($cp >> 6)) . $tail;
      }

      $tail = chr(0x80 | (($cp >> 6) & 0x3F)) . $tail;

      if ($cp <= 0xFFFF)
      {
          return chr(0xE0 | ($cp >> 12)) . $tail;
      }

      return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . $tail;
  }
  /**
  * Download a file into a directory, in chunks, unless it is already there
  *
  * The file is saved under the basename of the URL. Failures are reported
  * on the output and are not fatal. A download that breaks off midway is
  * removed again, so that the next run does not take it for a complete file.
  *
  * @param string $url URL of the file to download
  * @param string $to Directory the file is saved into
  */
  static protected function download($url, $to)
  {
      $target = $to . '/' . basename($url);

      if (file_exists($target))
      {
          return;
      }

      if (!$fpr = fopen($url, 'rb'))
      {
          echo "Failed to download $url\n";
          return;
      }

      if (!$fpw = fopen($target, 'wb'))
      {
          // Do not leak the source stream that is already open
          fclose($fpr);

          echo "Failed to open $target for writing\n";
          return;
      }

      $chunk = 32768;
      $complete = true;

      while (!feof($fpr))
      {
          $data = fread($fpr, $chunk);

          if ($data === false || fwrite($fpw, $data) === false)
          {
              $complete = false;
              break;
          }
      }

      fclose($fpr);
      fclose($fpw);

      if (!$complete)
      {
          // A truncated file would pass the file_exists() check above on every later run
          unlink($target);

          echo "Failed to download $url\n";
      }
  }
  278. }