PageRenderTime 44ms CodeModel.GetById 13ms RepoModel.GetById 0ms app.codeStats 0ms

/tests/utf/normalizer_test.php

https://github.com/naderman/phpbb-orchestra
PHP | 320 lines | 201 code | 48 blank | 71 comment | 28 complexity | 925b15767c2bc3279bf04d0352ea0350 MD5 | raw file
  1. <?php
  2. /**
  3. *
  4. * @package testing
  5. * @copyright (c) 2011 phpBB Group
  6. * @license http://opensource.org/licenses/gpl-license.php GNU Public License
  7. *
  8. */
  9. require_once dirname(__FILE__) . '/../../phpBB/includes/utf/utf_normalizer.php';
  /**
  * Conformance tests for utf_normalizer.
  *
  * The tests are driven by two Unicode Character Database files,
  * NormalizationTest.txt and UnicodeData.txt, which are downloaded once into
  * the "data" directory next to this file. When a data file cannot be
  * obtained the corresponding test is skipped.
  *
  * @group slow
  */
  class phpbb_utf_normalizer_test extends phpbb_test_case
  {
      /**
      * Fetch the Unicode data files before any test of this class runs.
      * Files that are already present are not downloaded again.
      */
      static public function setUpBeforeClass()
      {
          self::download('http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt', dirname(__FILE__).'/data');
          self::download('http://www.unicode.org/Public/UNIDATA/UnicodeData.txt', dirname(__FILE__).'/data');
      }

      /**
      * Check every data line of NormalizationTest.txt against all four
      * normalization forms.
      *
      * Each data line holds five columns (c1..c5) of space separated
      * hexadecimal codepoints. For every form, the listed source columns are
      * normalized and compared with the expected column.
      *
      * @return array Map whose keys are the hex codepoints of the single
      *               characters found in column c1; consumed by test_invariants()
      */
      public function test_normalizer()
      {
          $test_suite = array(
              /**
              * NFC
              * c2 == NFC(c1) == NFC(c2) == NFC(c3)
              * c4 == NFC(c4) == NFC(c5)
              */
              'NFC' => array(
                  'c2' => array('c1', 'c2', 'c3'),
                  'c4' => array('c4', 'c5')
              ),

              /**
              * NFD
              * c3 == NFD(c1) == NFD(c2) == NFD(c3)
              * c5 == NFD(c4) == NFD(c5)
              */
              'NFD' => array(
                  'c3' => array('c1', 'c2', 'c3'),
                  'c5' => array('c4', 'c5')
              ),

              /**
              * NFKC
              * c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
              */
              'NFKC' => array(
                  'c4' => array('c1', 'c2', 'c3', 'c4', 'c5')
              ),

              /**
              * NFKD
              * c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
              */
              'NFKD' => array(
                  'c5' => array('c1', 'c2', 'c3', 'c4', 'c5')
              )
          );

          $column_names = array('c1', 'c2', 'c3', 'c4', 'c5');
          $tested_chars = array();

          $fp = $this->open_data_file('NormalizationTest.txt');

          // fgets() returns false at EOF, so $line is never indexed when empty
          while (($line = fgets($fp)) !== false)
          {
              // Data lines start with an uppercase hex digit; everything else
              // ("@Part" headers, "#" comments, blank lines) is skipped
              if (strpos('0123456789ABCDEF', $line[0]) === false)
              {
                  continue;
              }

              // Anything after the fifth ';' is a trailing comment
              $fields = explode(';', $line);

              if (count($fields) < 5)
              {
                  $this->fail('Malformed line in NormalizationTest.txt: ' . trim($line));
              }

              $columns = array_combine($column_names, array_slice($fields, 0, 5));

              if (strpos($columns['c1'], ' ') === false)
              {
                  /**
                  * We are currently testing a single character, we add it to the list of
                  * characters we have processed so that we can exclude it when testing
                  * for invariants
                  */
                  $tested_chars[$columns['c1']] = 1;
              }

              foreach ($test_suite as $form => $serie)
              {
                  foreach ($serie as $expected => $tests)
                  {
                      $hex_expected = $columns[$expected];
                      $utf_expected = $this->hexseq_to_utf($hex_expected);

                      foreach ($tests as $test)
                      {
                          // Normalize the column under test, not the expected value
                          $utf_result = $this->hexseq_to_utf($columns[$test]);

                          // The normalizer works on its argument by reference. The reference
                          // has to be wrapped in an array: call-time pass-by-reference
                          // (f(&$x)) is a fatal error since PHP 5.4
                          call_user_func_array(array('utf_normalizer', $form), array(&$utf_result));

                          $hex_result = $this->utf_to_hexseq($utf_result);
                          $this->assertEquals($utf_expected, $utf_result, "$expected == $form($test) ($hex_expected != $hex_result)");
                      }
                  }
              }
          }

          fclose($fp);

          return $tested_chars;
      }

      /**
      * Check that every character listed in UnicodeData.txt which was not
      * covered by test_normalizer() is left untouched by all four
      * normalization forms. Lone surrogates are the exception: they must be
      * turned into the replacement character.
      *
      * @depends test_normalizer
      *
      * @param array $tested_chars Map keyed by the hex codepoints already
      *                            covered by test_normalizer()
      */
      public function test_invariants(array $tested_chars)
      {
          $fp = $this->open_data_file('UnicodeData.txt');

          while (($line = fgets($fp, 1024)) !== false)
          {
              // Skip lines without a separator or with an empty first field
              $pos = strpos($line, ';');

              if (!$pos)
              {
                  continue;
              }

              $hex_tested = $hex_expected = substr($line, 0, $pos);

              if (isset($tested_chars[$hex_tested]))
              {
                  continue;
              }

              $utf_tested = $utf_expected = $this->hex_to_utf($hex_tested);

              if ($utf_tested >= UTF8_SURROGATE_FIRST
                  && $utf_tested <= UTF8_SURROGATE_LAST)
              {
                  /**
                  * Surrogates are illegal on their own, we expect the normalizer
                  * to return a replacement char
                  */
                  $utf_expected = UTF8_REPLACEMENT;
                  $hex_expected = $this->utf_to_hexseq($utf_expected);
              }

              foreach (array('nfc', 'nfkc', 'nfd', 'nfkd') as $form)
              {
                  // Feed the tested character itself so that surrogates really
                  // go through the normalizer
                  $utf_result = $utf_tested;
                  call_user_func_array(array('utf_normalizer', $form), array(&$utf_result));

                  $hex_result = $this->utf_to_hexseq($utf_result);
                  $this->assertEquals($utf_expected, $utf_result, "$hex_expected == $form($hex_tested) ($hex_expected != $hex_result)");
              }
          }

          fclose($fp);
      }

      /**
      * Open one of the downloaded Unicode data files for reading.
      *
      * The current test is marked as skipped when the file is missing or
      * cannot be opened, e.g. because the download in setUpBeforeClass()
      * failed.
      *
      * @param string $name File name inside the "data" directory
      * @return resource Stream opened in binary read mode
      */
      protected function open_data_file($name)
      {
          $path = dirname(__FILE__) . '/data/' . $name;
          $fp = is_readable($path) ? fopen($path, 'rb') : false;

          if (!$fp)
          {
              // markTestSkipped() throws, so execution stops here
              $this->markTestSkipped("Data file $path is not available");
          }

          return $fp;
      }

      /**
      * Convert a UTF-8 string to a sequence of codepoints in hexadecimal
      *
      * @param string $str UTF-8 string
      * @return string Codepoints in uppercase hex, padded to at least four
      *                digits and separated with a space
      */
      protected function utf_to_hexseq($str)
      {
          $pos = 0;
          $len = strlen($str);
          $ret = array();

          while ($pos < $len)
          {
              $c = $str[$pos];

              // The high nibble of the lead byte gives the length of the sequence
              switch ($c & "\xF0")
              {
                  case "\xC0":
                  case "\xD0":
                      $utf_char = substr($str, $pos, 2);
                      $pos += 2;
                  break;

                  case "\xE0":
                      $utf_char = substr($str, $pos, 3);
                      $pos += 3;
                  break;

                  case "\xF0":
                      $utf_char = substr($str, $pos, 4);
                      $pos += 4;
                  break;

                  default:
                      $utf_char = $c;
                      ++$pos;
              }

              $ret[] = sprintf('%04X', $this->utf_to_cp($utf_char));
          }

          return implode(' ', $ret);
      }

      /**
      * Convert a UTF-8 char to its codepoint
      *
      * @param string $utf_char UTF-8 char
      * @return integer Unicode codepoint
      * @throws RuntimeException When the char is not 1 to 4 bytes long
      */
      protected function utf_to_cp($utf_char)
      {
          switch (strlen($utf_char))
          {
              case 1:
                  return ord($utf_char);

              case 2:
                  return ((ord($utf_char[0]) & 0x1F) << 6) | (ord($utf_char[1]) & 0x3F);

              case 3:
                  return ((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F);

              case 4:
                  return ((ord($utf_char[0]) & 0x07) << 18) | ((ord($utf_char[1]) & 0x3F) << 12) | ((ord($utf_char[2]) & 0x3F) << 6) | (ord($utf_char[3]) & 0x3F);

              default:
                  throw new RuntimeException('UTF-8 chars can only be 1-4 bytes long');
          }
      }

      /**
      * Return a UTF string formed from a sequence of codepoints in hexadecimal
      *
      * @param string $seq Sequence of codepoints, separated with a space
      * @return string UTF-8 string
      */
      protected function hexseq_to_utf($seq)
      {
          return implode('', array_map(array($this, 'hex_to_utf'), explode(' ', $seq)));
      }

      /**
      * Convert a codepoint in hexadecimal to a UTF-8 char
      *
      * @param string $hex Codepoint, in hexadecimal
      * @return string UTF-8 char
      */
      protected function hex_to_utf($hex)
      {
          return $this->cp_to_utf(hexdec($hex));
      }

      /**
      * Convert a codepoint to a UTF-8 char
      *
      * @param integer $cp Unicode codepoint
      * @return string UTF-8 string
      */
      protected function cp_to_utf($cp)
      {
          if ($cp > 0xFFFF)
          {
              return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
          }
          else if ($cp > 0x7FF)
          {
              return chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
          }
          else if ($cp > 0x7F)
          {
              return chr(0xC0 | ($cp >> 6)) . chr(0x80 | ($cp & 0x3F));
          }
          else
          {
              return chr($cp);
          }
      }

      /**
      * Download a file in chunks into a directory, unless it is already there.
      *
      * Failures are reported on standard output and are not fatal. A download
      * that breaks off half way is removed again so that a truncated file is
      * not mistaken for a complete one on the next run.
      *
      * @param string $url URL of the file to fetch
      * @param string $to  Directory the file is stored in, under the base name of $url
      */
      static protected function download($url, $to)
      {
          $target = $to . '/' . basename($url);

          if (file_exists($target))
          {
              return;
          }

          if (!$fpr = fopen($url, 'rb'))
          {
              echo "Failed to download $url\n";
              return;
          }

          if (!$fpw = fopen($target, 'wb'))
          {
              // Do not leak the already opened source stream
              fclose($fpr);

              echo "Failed to open $target for writing\n";
              return;
          }

          $chunk = 32768;
          $complete = true;

          while (!feof($fpr))
          {
              $data = fread($fpr, $chunk);

              // Stop on a read or write error instead of looping on a broken stream
              if ($data === false || fwrite($fpw, $data) === false)
              {
                  $complete = false;
                  break;
              }
          }

          fclose($fpr);
          fclose($fpw);

          if (!$complete)
          {
              unlink($target);
              echo "Failed to download $url\n";
          }
      }
  }