PageRenderTime 39ms CodeModel.GetById 11ms RepoModel.GetById 0ms app.codeStats 0ms

/phpBB/develop/utf_normalizer_test.php

http://github.com/phpbb/phpbb3
PHP | 394 lines | 240 code | 63 blank | 91 comment | 40 complexity | d8cedc191e917917c971150c3504a109 MD5 | raw file
Possible License(s): AGPL-1.0
  1. <?php
  2. /**
  3. *
  4. * This file is part of the phpBB Forum Software package.
  5. *
  6. * @copyright (c) phpBB Limited <https://www.phpbb.com>
  7. * @license GNU General Public License, version 2 (GPL-2.0)
  8. *
  9. * For full copyright and license information, please see
  10. * the docs/CREDITS.txt file.
  11. *
  12. */
  // This is a command line tool: refuse to run under any other SAPI
  if (PHP_SAPI !== 'cli')
  {
      exit("This program must be run from the command line.\n");
  }

  //
  // Security message:
  //
  // This script is potentially dangerous, which is why it ships disabled.
  // Remove or comment out the die() call right below this notice to enable it.
  // Do NOT FORGET to remove this script, or to disable it again, once you are done.
  //
  die("Please read the first lines of this script for instructions on how to enable it");

  // No time limit, and every error gets reported
  set_time_limit(0);
  error_reporting(E_ALL);

  define('IN_PHPBB', true);

  $phpbb_root_path = '../';

  // Extension of this very file, without the leading dot
  $phpEx = substr(strrchr(__FILE__, '.'), 1);

  /**
  * Data files needed by the tests; each one is handed to download()
  */
  $data_urls = array(
      'http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt',
      'http://www.unicode.org/Public/UNIDATA/UnicodeData.txt',
  );

  foreach ($data_urls as $data_url)
  {
      download($data_url);
  }

  /**
  * The tests we run, laid out as
  *
  *    normalization form => array(expected column => array(tested columns))
  *
  * where c1..c5 are the first five columns of a NormalizationTest.txt line.
  */
  $test_suite = array(
      // c2 == NFC(c1) == NFC(c2) == NFC(c3)
      // c4 == NFC(c4) == NFC(c5)
      'NFC' => array(
          'c2' => array('c1', 'c2', 'c3'),
          'c4' => array('c4', 'c5')
      ),

      // c3 == NFD(c1) == NFD(c2) == NFD(c3)
      // c5 == NFD(c4) == NFD(c5)
      'NFD' => array(
          'c3' => array('c1', 'c2', 'c3'),
          'c5' => array('c4', 'c5')
      ),

      // c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
      'NFKC' => array(
          'c4' => array('c1', 'c2', 'c3', 'c4', 'c5')
      ),

      // c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
      'NFKD' => array(
          'c5' => array('c1', 'c2', 'c3', 'c4', 'c5')
      )
  );

  // Load the normalizer under test
  require_once $phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx;
  /**
  * Run every line of NormalizationTest.txt through the test suite
  *
  * $i counts the data lines of the current section (progress output only),
  * $n counts every line read so that failures can be located in the file.
  */
  $i = $n = 0;
  $failed = false;
  $tested_chars = array();

  $fp = fopen($phpbb_root_path . 'develop/NormalizationTest.txt', 'rb');

  if (!$fp)
  {
      // Looping on an invalid handle would never terminate cleanly
      die("Can't open develop/NormalizationTest.txt for reading\n");
  }

  // fgets() returns false once the end of the file is reached; looping on its
  // result (rather than on feof()) guarantees that $line is always a string
  while (($line = fgets($fp)) !== false)
  {
      ++$n;

      if ($line[0] == '@')
      {
          // Section header: close the progress output of the previous section,
          // restart the per-section counter and print the section title
          if ($i)
          {
              echo "done\n";
          }

          $i = 0;
          echo "\n", substr($line, 1), "\n\n";
          continue;
      }

      // Data lines start with an uppercase hexadecimal digit, skip anything else
      if (strpos('0123456789ABCDEF', $line[0]) === false)
      {
          continue;
      }

      if (++$i % 100 == 0)
      {
          echo $i, ' ';
      }

      list($c1, $c2, $c3, $c4, $c5) = explode(';', $line);

      if (!strpos($c1, ' '))
      {
          /**
          * We are currently testing a single character, we add it to the list of
          * characters we have processed so that we can exclude it when testing
          * for invariants
          */
          $tested_chars[$c1] = 1;
      }

      foreach ($test_suite as $form => $serie)
      {
          foreach ($serie as $expected => $tests)
          {
              // $expected and $test hold column names ('c1'..'c5'), resolved
              // through variable variables to the values read from the line
              $hex_expected = ${$expected};
              $utf_expected = hexseq_to_utf($hex_expected);

              foreach ($tests as $test)
              {
                  // Normalize the TESTED column, not the expected one
                  $utf_result = hexseq_to_utf(${$test});

                  // The normalizer works on its argument in place, so it has to
                  // be called directly: call_user_func() does not pass its
                  // parameters by reference and would leave $utf_result untouched
                  utf_normalizer::$form($utf_result);

                  if (strcmp($utf_expected, $utf_result))
                  {
                      $failed = true;
                      $hex_result = utf_to_hexseq($utf_result);

                      echo "\nFAILED $expected == $form($test) ($hex_expected != $hex_result)";
                  }
              }
          }

          // Report every failure of the current form before bailing out
          if ($failed)
          {
              die("\n\nFailed at line $n\n");
          }
      }
  }

  fclose($fp);
  /**
  * Test for invariants
  *
  * Every codepoint listed in UnicodeData.txt that was not already covered as a
  * single character by the NormalizationTest.txt pass must come out of all four
  * normalization forms unchanged.
  */
  echo "\n\nTesting for invariants...\n\n";

  $fp = fopen($phpbb_root_path . 'develop/UnicodeData.txt', 'rt');

  if (!$fp)
  {
      // Looping on an invalid handle would never terminate cleanly
      die("Can't open develop/UnicodeData.txt for reading\n");
  }

  $n = 0;
  $failed = false;

  // fgets() returns false at the end of the file, which ends the loop
  while (($line = fgets($fp, 1024)) !== false)
  {
      if (++$n % 100 == 0)
      {
          echo $n, ' ';
      }

      // The codepoint is the first ';'-separated field, skip lines without one
      if (!$pos = strpos($line, ';'))
      {
          continue;
      }

      $hex_tested = $hex_expected = substr($line, 0, $pos);

      if (isset($tested_chars[$hex_tested]))
      {
          continue;
      }

      $utf_expected = hex_to_utf($hex_expected);

      // NOTE(review): UTF8_SURROGATE_FIRST, UTF8_SURROGATE_LAST and
      // UTF8_REPLACEMENT are not defined in this file, presumably they come
      // from the included normalizer - confirm
      if ($utf_expected >= UTF8_SURROGATE_FIRST
          && $utf_expected <= UTF8_SURROGATE_LAST)
      {
          /**
          * Surrogates are illegal on their own, we expect the normalizer
          * to return a replacement char
          */
          $utf_expected = UTF8_REPLACEMENT;
          $hex_expected = utf_to_hexseq($utf_expected);
      }

      foreach (array('nfc', 'nfkc', 'nfd', 'nfkd') as $form)
      {
          // The normalizer modifies its argument in place
          $utf_result = $utf_expected;
          utf_normalizer::$form($utf_result);

          if (strcmp($utf_expected, $utf_result))
          {
              $failed = true;

              // The hex dump is only needed for the failure report
              $hex_result = utf_to_hexseq($utf_result);

              echo "\nFAILED $hex_expected == $form($hex_tested) ($hex_expected != $hex_result)";
          }
      }

      // Report the failures of all four forms before bailing out
      if ($failed)
      {
          die("\n\nFailed at line $n\n");
      }
  }

  fclose($fp);

  die("\n\nALL TESTS PASSED SUCCESSFULLY\n");
  /**
  * Download a file to the develop/ dir
  *
  * Nothing is done if a file of the same name is already present in develop/.
  * Data is first written to a temporary "<name>.part" file which is only renamed
  * to its final name once the transfer has completed, so that an interrupted or
  * failed download can never be mistaken for a complete file by a later run.
  *
  * The script is terminated with a message if the file can not be retrieved
  * or stored.
  *
  * @param string $url URL of the file to download
  * @return null
  */
  function download($url)
  {
      global $phpbb_root_path;

      $filename = basename($url);
      $target = $phpbb_root_path . 'develop/' . $filename;
      $partial = $target . '.part';

      if (file_exists($target))
      {
          return;
      }

      echo 'Downloading from ', $url, ' ';

      if (!$fpr = fopen($url, 'rb'))
      {
          die("Can't download from $url\nPlease download it yourself and put it in the develop/ dir, kthxbai");
      }

      if (!$fpw = fopen($partial, 'wb'))
      {
          fclose($fpr);
          die("Can't open develop/" . $filename . " for output... please check your permissions or something");
      }

      $i = 0;
      $chunk = 32768;
      $done = '';
      $failure = '';

      while (!feof($fpr))
      {
          $data = fread($fpr, $chunk);

          if ($data === false)
          {
              $failure = "Error while reading from $url";
              break;
          }

          $written = fwrite($fpw, $data);

          if ($written === false || $written < strlen($data))
          {
              $failure = "Error while writing to develop/" . $filename;
              break;
          }

          $i += $written;

          // Erase the previous progress indicator with backspaces, then
          // print the updated amount
          echo str_repeat("\x08", strlen($done));

          $done = ($i >> 10) . ' KiB';
          echo $done;
      }

      fclose($fpr);
      fclose($fpw);

      if ($failure === '' && !rename($partial, $target))
      {
          $failure = "Can't move the downloaded data to develop/" . $filename;
      }

      if ($failure !== '')
      {
          // Do not leave incomplete data behind
          if (file_exists($partial))
          {
              unlink($partial);
          }

          die("\n" . $failure . "\n");
      }

      echo "\n";
  }
  /**
  * Convert a UTF-8 string to a sequence of codepoints in hexadecimal
  *
  * Each codepoint is written in uppercase hexadecimal, left-padded with zeros
  * to at least 4 digits, and codepoints are separated by a single space.
  *
  * @param string $str UTF-8 string
  * @return string Space-separated list of codepoints in hexadecimal
  */
  function utf_to_hexseq($str)
  {
      $codepoints = array();
      $length = strlen($str);

      for ($offset = 0; $offset < $length; $offset += $width)
      {
          // The high nibble of the leading byte gives the length of the sequence
          $high_nibble = ord($str[$offset]) & 0xF0;

          if ($high_nibble === 0xC0 || $high_nibble === 0xD0)
          {
              $width = 2;
          }
          else if ($high_nibble === 0xE0)
          {
              $width = 3;
          }
          else if ($high_nibble === 0xF0)
          {
              $width = 4;
          }
          else
          {
              $width = 1;
          }

          $codepoint = utf_to_cp(substr($str, $offset, $width));
          $codepoints[] = str_pad(dechex($codepoint), 4, '0', STR_PAD_LEFT);
      }

      return strtoupper(implode(' ', $codepoints));
  }
  /**
  * Convert a UTF-8 char to its codepoint
  *
  * The script is terminated if the given string is not 1 to 4 bytes long.
  *
  * @param string $utf_char UTF-8 char
  * @return integer Unicode codepoint
  */
  function utf_to_cp($utf_char)
  {
      // Mask selecting the payload bits of the leading byte, by sequence length
      $lead_masks = array(1 => 0xFF, 2 => 0x1F, 3 => 0x0F, 4 => 0x07);

      $length = strlen($utf_char);

      if (!isset($lead_masks[$length]))
      {
          die('UTF-8 chars can only be 1-4 bytes long');
      }

      $codepoint = ord($utf_char[0]) & $lead_masks[$length];

      // Every following byte contributes its 6 low bits
      for ($i = 1; $i < $length; ++$i)
      {
          $codepoint = ($codepoint << 6) | (ord($utf_char[$i]) & 0x3F);
      }

      return $codepoint;
  }
  /**
  * Build a UTF-8 string from a sequence of codepoints in hexadecimal
  *
  * @param string $seq Sequence of codepoints in hexadecimal, separated with a space
  * @return string UTF-8 string
  */
  function hexseq_to_utf($seq)
  {
      $utf = '';

      foreach (explode(' ', $seq) as $hex)
      {
          $utf .= hex_to_utf($hex);
      }

      return $utf;
  }
  /**
  * Convert a single codepoint, given in hexadecimal, to a UTF-8 char
  *
  * @param string $hex Codepoint, in hexadecimal
  * @return string UTF-8 char
  */
  function hex_to_utf($hex)
  {
      $codepoint = hexdec($hex);

      return cp_to_utf($codepoint);
  }
  /**
  * Convert a codepoint to a UTF-8 char
  *
  * Codepoints up to 0x7F give 1 byte, up to 0x7FF 2 bytes, up to 0xFFFF
  * 3 bytes and anything above 4 bytes.
  *
  * @param integer $cp Unicode codepoint
  * @return string UTF-8 string
  */
  function cp_to_utf($cp)
  {
      if ($cp <= 0x7F)
      {
          return chr($cp);
      }

      // Trailing byte shared by every multibyte form: the 6 lowest bits
      $last_byte = chr(0x80 | ($cp & 0x3F));

      if ($cp <= 0x7FF)
      {
          return chr(0xC0 | ($cp >> 6)) . $last_byte;
      }

      $second_to_last_byte = chr(0x80 | (($cp >> 6) & 0x3F));

      if ($cp <= 0xFFFF)
      {
          return chr(0xE0 | ($cp >> 12)) . $second_to_last_byte . $last_byte;
      }

      return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . $second_to_last_byte . $last_byte;
  }