PageRenderTime 51ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 0ms

/includes/normal/UtfNormalTest2.php

https://bitbucket.org/kgrashad/thawrapedia
PHP | 239 lines | 160 code | 28 blank | 51 comment | 20 complexity | 0cf80510674e232115a532fd27e34ad6 MD5 | raw file
Possible License(s): GPL-2.0, Apache-2.0, LGPL-3.0
  1. #!/usr/bin/php
  2. <?php
  3. /**
  4. * Other tests for the unicode normalization module
  5. *
  6. * @file
  7. * @ingroup UtfNormal
  8. */
  9. if( php_sapi_name() != 'cli' ) {
  10. die( "Run me from the command line please.\n" );
  11. }
  12. // From http://unicode.org/Public/UNIDATA/NormalizationTest.txt
  13. $file = "NormalizationTest.txt";
  14. // Anything after this character is a comment
  15. define ( 'COMMENT', '#' );
  16. // Semicolons are used to separate the columns
  17. define ( 'SEPARATOR', ';' );
  18. $f = fopen($file, "r");
  19. /**
  20. * The following section will be used for testing different normalization methods.
  21. * - Pure PHP
  22. ~ no assertion errors
  23. ~ 6.25 minutes
  24. * - php_utfnormal.so or intl extension: both are wrappers around
  25. libicu so we list the version of libicu when making the
  26. comparison
  27. * - libicu Ubuntu 3.8.1-3ubuntu1.1 php 5.2.6-3ubuntu4.5
  28. ~ 2200 assertion errors
  29. ~ 5 seconds
  30. ~ output: http://paste2.org/p/921566
  31. * - libicu Ubuntu 4.2.1-3 php 5.3.2-1ubuntu4.2
  32. ~ 1384 assertion errors
  33. ~ 15 seconds
  34. ~ output: http://paste2.org/p/921435
  35. * - libicu Debian 4.4.1-5 php 5.3.2-1ubuntu4.2
  36. ~ no assertion errors
  37. ~ 13 seconds
  38. * - Tests comparing pure PHP output with libicu output were added
  39. later and slow down the runtime.
  40. */
  41. require_once("./UtfNormal.php");
  42. function normalize_form_c($c) { return UtfNormal::toNFC($c); }
  43. function normalize_form_d($c) { return UtfNormal::toNFD($c); }
  44. function normalize_form_kc($c) { return UtfNormal::toNFKC($c); }
  45. function normalize_form_kd($c) { return UtfNormal::toNFKD($c); }
  46. /**
  47. * This set of functions is only useful if youve added a param to the
  48. * following functions to force pure PHP usage. I decided not to
  49. * commit that code since might produce a slowdown in the UTF
  50. * normalization code just for the sake of these tests. -- hexmode
  51. */
  52. function normalize_form_c_php($c) { return UtfNormal::toNFC($c, "php"); }
  53. function normalize_form_d_php($c) { return UtfNormal::toNFD($c, "php"); }
  54. function normalize_form_kc_php($c) { return UtfNormal::toNFKC($c, "php"); }
  55. function normalize_form_kd_php($c) { return UtfNormal::toNFKD($c, "php"); }
  56. assert_options(ASSERT_ACTIVE, 1);
  57. assert_options(ASSERT_WARNING, 0);
  58. assert_options(ASSERT_QUIET_EVAL, 1);
  59. assert_options(ASSERT_CALLBACK, 'my_assert');
  60. function my_assert( $file, $line, $code ) {
  61. global $col, $lineNo;
  62. echo "Assertion that '$code' failed on line $lineNo ($col[5])\n";
  63. }
  64. $count = 0;
  65. $lineNo = 0;
  66. if( $f !== false ) {
  67. while( ( $col = getRow( $f ) ) !== false ) {
  68. $lineNo++;
  69. if(count($col) == 6) {
  70. $count++;
  71. if( $count % 100 === 0 ) echo "Count: $count\n";
  72. } else {
  73. continue;
  74. }
  75. # verify that the pure PHP version is correct
  76. $NFCc1 = normalize_form_c($col[0]);
  77. $NFCc1p = normalize_form_c_php($col[0]);
  78. assert('$NFCc1 === $NFCc1p');
  79. $NFCc2 = normalize_form_c($col[1]);
  80. $NFCc2p = normalize_form_c_php($col[1]);
  81. assert('$NFCc2 === $NFCc2p');
  82. $NFCc3 = normalize_form_c($col[2]);
  83. $NFCc3p = normalize_form_c_php($col[2]);
  84. assert('$NFCc3 === $NFCc3p');
  85. $NFCc4 = normalize_form_c($col[3]);
  86. $NFCc4p = normalize_form_c_php($col[3]);
  87. assert('$NFCc4 === $NFCc4p');
  88. $NFCc5 = normalize_form_c($col[4]);
  89. $NFCc5p = normalize_form_c_php($col[4]);
  90. assert('$NFCc5 === $NFCc5p');
  91. $NFDc1 = normalize_form_d($col[0]);
  92. $NFDc1p = normalize_form_d_php($col[0]);
  93. assert('$NFDc1 === $NFDc1p');
  94. $NFDc2 = normalize_form_d($col[1]);
  95. $NFDc2p = normalize_form_d_php($col[1]);
  96. assert('$NFDc2 === $NFDc2p');
  97. $NFDc3 = normalize_form_d($col[2]);
  98. $NFDc3p = normalize_form_d_php($col[2]);
  99. assert('$NFDc3 === $NFDc3p');
  100. $NFDc4 = normalize_form_d($col[3]);
  101. $NFDc4p = normalize_form_d_php($col[3]);
  102. assert('$NFDc4 === $NFDc4p');
  103. $NFDc5 = normalize_form_d($col[4]);
  104. $NFDc5p = normalize_form_d_php($col[4]);
  105. assert('$NFDc5 === $NFDc5p');
  106. $NFKDc1 = normalize_form_kd($col[0]);
  107. $NFKDc1p = normalize_form_kd_php($col[0]);
  108. assert('$NFKDc1 === $NFKDc1p');
  109. $NFKDc2 = normalize_form_kd($col[1]);
  110. $NFKDc2p = normalize_form_kd_php($col[1]);
  111. assert('$NFKDc2 === $NFKDc2p');
  112. $NFKDc3 = normalize_form_kd($col[2]);
  113. $NFKDc3p = normalize_form_kd_php($col[2]);
  114. assert('$NFKDc3 === $NFKDc3p');
  115. $NFKDc4 = normalize_form_kd($col[3]);
  116. $NFKDc4p = normalize_form_kd_php($col[3]);
  117. assert('$NFKDc4 === $NFKDc4p');
  118. $NFKDc5 = normalize_form_kd($col[4]);
  119. $NFKDc5p = normalize_form_kd_php($col[4]);
  120. assert('$NFKDc5 === $NFKDc5p');
  121. $NFKCc1 = normalize_form_kc($col[0]);
  122. $NFKCc1p = normalize_form_kc_php($col[0]);
  123. assert('$NFKCc1 === $NFKCc1p');
  124. $NFKCc2 = normalize_form_kc($col[1]);
  125. $NFKCc2p = normalize_form_kc_php($col[1]);
  126. assert('$NFKCc2 === $NFKCc2p');
  127. $NFKCc3 = normalize_form_kc($col[2]);
  128. $NFKCc3p = normalize_form_kc_php($col[2]);
  129. assert('$NFKCc3 === $NFKCc3p');
  130. $NFKCc4 = normalize_form_kc($col[3]);
  131. $NFKCc4p = normalize_form_kc_php($col[3]);
  132. assert('$NFKCc4 === $NFKCc4p');
  133. $NFKCc5 = normalize_form_kc($col[4]);
  134. $NFKCc5p = normalize_form_kc_php($col[4]);
  135. assert('$NFKCc5 === $NFKCc5p');
  136. # c2 == NFC(c1) == NFC(c2) == NFC(c3)
  137. assert('$col[1] === $NFCc1');
  138. assert('$col[1] === $NFCc2');
  139. assert('$col[1] === $NFCc3');
  140. # c4 == NFC(c4) == NFC(c5)
  141. assert('$col[3] === $NFCc4');
  142. assert('$col[3] === $NFCc5');
  143. # c3 == NFD(c1) == NFD(c2) == NFD(c3)
  144. assert('$col[2] === $NFDc1');
  145. assert('$col[2] === $NFDc2');
  146. assert('$col[2] === $NFDc3');
  147. # c5 == NFD(c4) == NFD(c5)
  148. assert('$col[4] === $NFDc4');
  149. assert('$col[4] === $NFDc5');
  150. # c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
  151. assert('$col[3] === $NFKCc1');
  152. assert('$col[3] === $NFKCc2');
  153. assert('$col[3] === $NFKCc3');
  154. assert('$col[3] === $NFKCc4');
  155. assert('$col[3] === $NFKCc5');
  156. # c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
  157. assert('$col[4] === $NFKDc1');
  158. assert('$col[4] === $NFKDc2');
  159. assert('$col[4] === $NFKDc3');
  160. assert('$col[4] === $NFKDc4');
  161. assert('$col[4] === $NFKDc5');
  162. }
  163. }
  164. echo "done.\n";
  165. // Compare against http://en.wikipedia.org/wiki/UTF-8#Description
  166. function unichr($c) {
  167. if ($c <= 0x7F) {
  168. return chr($c);
  169. } else if ($c <= 0x7FF) {
  170. return chr(0xC0 | $c >> 6) . chr(0x80 | $c & 0x3F);
  171. } else if ($c <= 0xFFFF) {
  172. return chr(0xE0 | $c >> 12) . chr(0x80 | $c >> 6 & 0x3F)
  173. . chr(0x80 | $c & 0x3F);
  174. } else if ($c <= 0x10FFFF) {
  175. return chr(0xF0 | $c >> 18) . chr(0x80 | $c >> 12 & 0x3F)
  176. . chr(0x80 | $c >> 6 & 0x3F)
  177. . chr(0x80 | $c & 0x3F);
  178. } else {
  179. return false;
  180. }
  181. }
  182. function unistr($c) {
  183. return implode("", array_map("unichr", array_map("hexdec", explode(" ", $c))));
  184. }
  185. function getRow( $f ) {
  186. $row = fgets( $f );
  187. if( $row === false ) return false;
  188. $row = rtrim($row);
  189. $pos = strpos( $row, COMMENT );
  190. $pos2 = strpos( $row, ")" );
  191. if( $pos === 0 ) return array($row);
  192. $c = "";
  193. if( $pos ) {
  194. if($pos2) $c = substr( $row, $pos2 + 2 );
  195. else $c = substr( $row, $pos );
  196. $row = substr( $row, 0, $pos );
  197. }
  198. $ret = array();
  199. foreach( explode( SEPARATOR, $row ) as $ent ) {
  200. if( trim( $ent ) !== "" ) {
  201. $ret[] = unistr($ent);
  202. }
  203. }
  204. $ret[] = $c;
  205. return $ret;
  206. }