PageRenderTime 50ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/hphp/test/slow/ext_icu/ext_icu.php

http://github.com/facebook/hiphop-php
PHP | 289 lines | 248 code | 26 blank | 15 comment | 22 complexity | 8c86cdec3f0e50edb21e75c7feb01ae5 MD5 | raw file
Possible License(s): LGPL-2.1, BSD-2-Clause, BSD-3-Clause, MPL-2.0-no-copyleft-exception, MIT, LGPL-2.0, Apache-2.0
  1. <?hh
  /**
   * Strict-equality check used by every test in this file.
   *
   * Always dumps the boolean result of `$x === $y`. When the two values are
   * not identical it additionally prints the expected value ($y), the actual
   * value ($x) and a backtrace so the failing call site can be located.
   *
   * @param mixed $x  Actual value.
   * @param mixed $y  Expected value.
   */
  function VS($x, $y) {
    $identical = $x === $y;
    var_dump($identical);
    if ($identical) {
      return;
    }
    // Mismatch: report expected first, then actual, then where we came from.
    echo "Failed: $y\n";
    echo "Got: $x\n";
    var_dump(debug_backtrace());
  }
  /**
   * Asserts that $x is exactly `true` by delegating to VS().
   *
   * @param mixed $x  Value that must be strictly identical to true.
   */
  function VERIFY($x) {
    $expected = true;
    VS($x, $expected);
  }
  8. //////////////////////////////////////////////////////////////////////
  /**
   * Exercises icu_match() and icu_match_with_matches().
   *
   * Covers, in order: plain pattern matching (including a pattern with an
   * unbalanced parenthesis), capture groups, UREGEX_OFFSET_CAPTURE offsets on
   * subjects containing multi-byte characters, 4-byte UTF-8 sequences, the
   * same pattern used with and without UREGEX_CASE_INSENSITIVE, and the
   * Bidi_Class Unicode property.
   *
   * Results are compared loosely against false (`!= false` / `== false`) and
   * reported through VERIFY(); captured matches are compared against the
   * textual print_r() rendering through VS(). The expectation literals
   * therefore have to reproduce print_r()'s layout exactly: 4 spaces before
   * each top-level element, 8 before the parentheses of a nested array and
   * 12 before its elements.
   */
  function test_icu_match() {
    // Test subject strings.
    $subject = "\xd7\x96\xf0\x90\xa4\x85". " PHP is a scripting language. " .
      "\xef\xba\xb0\xef\xbb\xb3";

    // Six repetitions of one 4-byte UTF-8 sequence.
    $subject_32 =
      "\xf0\x90\xa4\x85\xf0\x90\xa4\x85\xf0\x90\xa4\x85\xf0\x90\xa4\x85" .
      "\xf0\x90\xa4\x85\xf0\x90\xa4\x85";

    $subject_en = "this is an english string";

    // "this is a hebrew string"
    $subject_he =
      "\xd7\x96\xd7\x94\x20" .
      "\xd7\x94\xd7\x95\xd7\x90\x20\xd7\x9e\xd7\x97\xd7\xa8\xd7\x95\xd7" .
      "\x96\xd7\xaa\x20\xd7\xa2\xd7\x91\xd7\xa8\xd7\x99\xd7\xaa";

    // "this is an arabic string"
    $subject_ar =
      "\xef\xbb\xa9\xef".
      "\xba\xab\xef\xba\x8d\x20\xef\xbb\xa9\xef\xbb\xad\x20\xef\xba\x8e".
      "\xef\xbb\xa0\xef\xbb\xa8\xef\xba\xbb\x20\xef\xba\x8d\xef\xbb\xba".
      "\xef\xbb\xa8\xef\xba\xa0\xef\xbb\xa0\xef\xbb\xb3\xef\xba\xb0\xef".
      "\xbb\xb3";

    // Mixed-direction subject: English text around the final word of
    // $subject_he (the same ten bytes).
    $subject_mixed =
      "this is a ".
      "\xd7\xa2\xd7\x91\xd7\xa8\xd7\x99\xd7\xaa"
      ." string";

    // Test basic regex parsing functionality.
    VERIFY(icu_match("scripting", $subject) != false);
    VERIFY(icu_match("php", $subject) == false);
    VERIFY(icu_match("(\\bPHP\\b)", $subject) != false);
    // Unbalanced closing parenthesis: the call is expected to compare
    // equal to false.
    VERIFY(icu_match("(\\bPHP\\b))", $subject) == false);

    $matches = null;

    // Test returning matches functionality.
    VERIFY(icu_match_with_matches("(PHP) is", $subject, inout $matches) != false);
    VS(print_r($matches, true),
       "Array\n".
       "(\n".
       "    [0] => PHP is\n".
       "    [1] => PHP\n".
       ")\n");

    VERIFY(
      icu_match_with_matches(
        "is (a)",
        $subject,
        inout $matches,
        UREGEX_OFFSET_CAPTURE,
      ) !=
        false,
    );
    VS(print_r($matches, true),
       "Array\n".
       "(\n".
       "    [0] => Array\n".
       "        (\n".
       "            [0] => is a\n".
       "            [1] => 7\n".
       "        )\n".
       "\n".
       "    [1] => Array\n".
       "        (\n".
       "            [0] => a\n".
       "            [1] => 10\n".
       "        )\n".
       "\n".
       ")\n");

    VERIFY(
      icu_match_with_matches(
        "\\. \xef\xba\xb0",
        $subject,
        inout $matches,
        UREGEX_OFFSET_CAPTURE,
      ) !=
        false,
    );
    VS(print_r($matches, true),
       "Array\n".
       "(\n".
       "    [0] => Array\n".
       "        (\n".
       "            [0] => . \xef\xba\xb0\n".
       "            [1] => 30\n".
       "        )\n".
       "\n".
       ")\n");

    // Two fragments of $subject_ar, separated there by a single space.
    $junk1="\xef\xbb\xa9\xef\xbb\xad";
    $junk2="\xef\xba\x8e\xef\xbb\xa0\xef\xbb\xa8\xef\xba\xbb";
    VERIFY(
      icu_match_with_matches(
        "$junk1 ($junk2)",
        $subject_ar,
        inout $matches,
        UREGEX_OFFSET_CAPTURE,
      ) !=
        false,
    );
    VS(print_r($matches, true),
       "Array\n".
       "(\n".
       "    [0] => Array\n".
       "        (\n".
       "            [0] => $junk1 $junk2\n".
       "            [1] => 4\n".
       "        )\n".
       "\n".
       "    [1] => Array\n".
       "        (\n".
       "            [0] => $junk2\n".
       "            [1] => 7\n".
       "        )\n".
       "\n".
       ")\n");

    // Test match for 32-bit code points.
    VERIFY(icu_match_with_matches(".*", $subject_32, inout $matches) != false);
    // Same bytes as $subject_32: ".*" is expected to capture the whole subject.
    $expected="\xf0\x90\xa4\x85\xf0\x90\xa4\x85\xf0\x90\xa4".
      "\x85\xf0\x90\xa4\x85\xf0\x90\xa4\x85\xf0\x90\xa4\x85";
    VS(print_r($matches, true),
       "Array\n".
       "(\n".
       "    [0] => $expected\n".
       ")\n");

    // Test regex caching functionality.
    // The same pattern is used first with and then without the
    // case-insensitive flag; the second call must not match.
    VERIFY(
      icu_match(
        "(php)",
        $subject,
        UREGEX_CASE_INSENSITIVE,
      ) !=
        false,
    );
    VERIFY(icu_match("(php)", $subject) == false);

    // Test ICU specific (ie bidi) functionality.
    $pattern_ltr = "\\p{Bidi_Class=Left_To_Right}";
    $pattern_rtl = "\\p{Bidi_Class=Right_To_Left}";
    $pattern_arl = "\\p{Bidi_Class=Arabic_Letter}";

    VERIFY(icu_match($pattern_ltr, $subject_en) != false);
    VERIFY(icu_match($pattern_rtl, $subject_en) == false);

    VERIFY(icu_match($pattern_ltr, $subject_he) == false);
    VERIFY(icu_match($pattern_rtl, $subject_he) != false);
    VERIFY(icu_match($pattern_arl, $subject_he) == false);

    VERIFY(icu_match($pattern_ltr, $subject_ar) == false);
    VERIFY(icu_match($pattern_rtl, $subject_ar) == false);
    VERIFY(icu_match($pattern_arl, $subject_ar) != false);

    VERIFY(icu_match($pattern_ltr, $subject_mixed) != false);
    VERIFY(icu_match($pattern_rtl, $subject_mixed) != false);
  }
  153. // Test string lifted from tests/intl/utf8.h
  /**
   * Checks icu_transliterate() against fixed Russian, German and Chinese
   * samples.
   *
   * Every result is compared loosely (==) with its expectation and the
   * outcome is reported through VERIFY().
   */
  function test_icu_transliterate() {
    $russian =
      "\xd1\x84\xd0\xb5\xd0\xb9\xd1\x81\xd0\xb1\xd1\x83\xc5\x93\xd0\xba";
    // Note: different than php test ('y' -> 'j')
    VERIFY(icu_transliterate($russian, false) == "fejsbu\xc5\x93k");

    $german =
      "Ich m\xc3\xb6" .
      "chte \xc3\xbc" .
      "berzeugend oder \xc3\xa4hnliche sein";

    // Second argument true: accents are expected to be stripped.
    VERIFY(
      icu_transliterate($german, true) ==
        "Ich mochte uberzeugend oder ahnliche sein",
    );

    // Second argument false: the text is expected to come back unchanged.
    VERIFY(icu_transliterate($german, false) == $german);

    // Check a non-Latin language.
    $chinese =
      "\xe5\x9b\x9b\xe5\x8d\x81\xe5\x9b\x9b\xe7\x9f\xb3\xe7\x8d\x85\xe5\xad\x90";
    VERIFY(icu_transliterate($chinese, true) == "si shi si shi shi zi");
  }
  /**
   * Checks icu_tokenize() on English, German and Hebrew input.
   *
   * Each token list is rendered with print_r($tokens, true) and compared
   * for identity with a literal through VS(). The literals therefore have to
   * reproduce print_r()'s layout exactly, i.e. four spaces in front of every
   * "[index] => value" line.
   *
   * What the expectations below show about the tokenizer: the list is
   * wrapped in "_B_" / "_E_" markers, words are lower-cased, punctuation
   * becomes its own token, and the number, URL, "<3", date and e-mail
   * address in the long sample are replaced by the placeholders XXXX,
   * TOKEN_URL, TOKEN_HEART, TOKEN_DATE and TOKEN_EMAIL.
   */
  function test_icu_tokenize() {
    $input_eng = "Hello World";
    $output_eng = icu_tokenize($input_eng);
    VS(print_r($output_eng, true),
       "Array\n".
       "(\n".
       "    [0] => _B_\n".
       "    [1] => hello\n".
       "    [2] => world\n".
       "    [3] => _E_\n".
       ")\n"
    );

    $input_long = "Hello! You are visitor #1234 to ".
      "http://www.facebook.com! ".
      "<3 How are you today (6/14/2011),".
      " hello@world.com?";
    $output_long = icu_tokenize($input_long);
    VS(print_r($output_long, true),
       "Array\n".
       "(\n".
       "    [0] => _B_\n".
       "    [1] => hello\n".
       "    [2] => !\n".
       "    [3] => you\n".
       "    [4] => are\n".
       "    [5] => visitor\n".
       "    [6] => #\n".
       "    [7] => XXXX\n".
       "    [8] => to\n".
       "    [9] => TOKEN_URL\n".
       "    [10] => !\n".
       "    [11] => TOKEN_HEART\n".
       "    [12] => how\n".
       "    [13] => are\n".
       "    [14] => you\n".
       "    [15] => today\n".
       "    [16] => (\n".
       "    [17] => TOKEN_DATE\n".
       "    [18] => )\n".
       "    [19] => ,\n".
       "    [20] => TOKEN_EMAIL\n".
       "    [21] => ?\n".
       "    [22] => _E_\n".
       ")\n"
    );

    // NOTE(review): the expected tokens split every accented letter in two
    // (e.g. "mã" followed by "¶"), which looks like the input bytes being
    // interpreted in a different encoding than the literal shown here.
    // Confirm the byte content of these literals before relying on them.
    $input_de = "Ich möchte überzeugend oder ähnliche sein";
    $output_de = icu_tokenize($input_de);
    VS(print_r($output_de, true),
       "Array\n".
       "(\n".
       "    [0] => _B_\n".
       "    [1] => ich\n".
       "    [2] => mã\n".
       "    [3] => ¶\n".
       "    [4] => chte\n".
       "    [5] => ã\n".
       "    [6] => ¼\n".
       "    [7] => berzeugend\n".
       "    [8] => oder\n".
       "    [9] => ã\n".
       "    [10] => ¤\n".
       "    [11] => hnliche\n".
       "    [12] => sein\n".
       "    [13] => _E_\n".
       ")\n");

    $input_hebrew = "היום יום רביעי, וזה ממש טוב.";
    $output_hebrew = icu_tokenize($input_hebrew);
    VS(print_r($output_hebrew, true),
       "Array\n".
       "(\n".
       "    [0] => _B_\n".
       "    [1] => היום\n".
       "    [2] => יום\n".
       "    [3] => רביעי\n".
       "    [4] => ,\n".
       "    [5] => וזה\n".
       "    [6] => ממש\n".
       "    [7] => טוב\n".
       "    [8] => .\n".
       "    [9] => _E_\n".
       ")\n");
  }
  /**
   * Script entry point: runs the three ICU test groups one after another,
   * regex matching first, then transliteration, then tokenization.
   */
  <<__EntryPoint>>
  function main_ext_icu() {
    test_icu_match();

    test_icu_transliterate();

    test_icu_tokenize();
  }