PageRenderTime 48ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/core/modules/search/tests/src/Functional/SearchTokenizerTest.php

https://github.com/drupal/drupal
PHP | 157 lines | 97 code | 20 blank | 40 comment | 4 complexity | 75bb542c44b0d9543a3d90a74529574c MD5 | raw file
Possible License(s): LGPL-2.1, GPL-2.0
  1. <?php
  2. namespace Drupal\Tests\search\Functional;
  3. use Drupal\Tests\BrowserTestBase;
  4. /**
  5. * Tests that CJK tokenizer works as intended.
  6. *
  7. * @group search
  8. */
  9. class SearchTokenizerTest extends BrowserTestBase {
  10. /**
  11. * {@inheritdoc}
  12. */
  13. protected static $modules = ['search'];
  14. /**
  15. * Verifies that strings of CJK characters are tokenized.
  16. *
  17. * The search_simplify() function does special things with numbers, symbols,
  18. * and punctuation. So we only test that CJK characters that are not in these
  19. * character classes are tokenized properly. See PREG_CLASS_CKJ for more
  20. * information.
  21. */
  22. public function testTokenizer() {
  23. // Set the minimum word size to 1 (to split all CJK characters) and make
  24. // sure CJK tokenizing is turned on.
  25. $this->config('search.settings')
  26. ->set('index.minimum_word_size', 1)
  27. ->set('index.overlap_cjk', TRUE)
  28. ->save();
  29. $this->refreshVariables();
  30. // Create a string of CJK characters from various character ranges in
  31. // the Unicode tables.
  32. // Beginnings of the character ranges.
  33. $starts = [
  34. 'CJK unified' => 0x4e00,
  35. 'CJK Ext A' => 0x3400,
  36. 'CJK Compat' => 0xf900,
  37. 'Hangul Jamo' => 0x1100,
  38. 'Hangul Ext A' => 0xa960,
  39. 'Hangul Ext B' => 0xd7b0,
  40. 'Hangul Compat' => 0x3131,
  41. 'Half non-punct 1' => 0xff21,
  42. 'Half non-punct 2' => 0xff41,
  43. 'Half non-punct 3' => 0xff66,
  44. 'Hangul Syllables' => 0xac00,
  45. 'Hiragana' => 0x3040,
  46. 'Katakana' => 0x30a1,
  47. 'Katakana Ext' => 0x31f0,
  48. 'CJK Reserve 1' => 0x20000,
  49. 'CJK Reserve 2' => 0x30000,
  50. 'Bomofo' => 0x3100,
  51. 'Bomofo Ext' => 0x31a0,
  52. 'Lisu' => 0xa4d0,
  53. 'Yi' => 0xa000,
  54. ];
  55. // Ends of the character ranges.
  56. $ends = [
  57. 'CJK unified' => 0x9fcf,
  58. 'CJK Ext A' => 0x4dbf,
  59. 'CJK Compat' => 0xfaff,
  60. 'Hangul Jamo' => 0x11ff,
  61. 'Hangul Ext A' => 0xa97f,
  62. 'Hangul Ext B' => 0xd7ff,
  63. 'Hangul Compat' => 0x318e,
  64. 'Half non-punct 1' => 0xff3a,
  65. 'Half non-punct 2' => 0xff5a,
  66. 'Half non-punct 3' => 0xffdc,
  67. 'Hangul Syllables' => 0xd7af,
  68. 'Hiragana' => 0x309f,
  69. 'Katakana' => 0x30ff,
  70. 'Katakana Ext' => 0x31ff,
  71. 'CJK Reserve 1' => 0x2fffd,
  72. 'CJK Reserve 2' => 0x3fffd,
  73. 'Bomofo' => 0x312f,
  74. 'Bomofo Ext' => 0x31b7,
  75. 'Lisu' => 0xa4fd,
  76. 'Yi' => 0xa48f,
  77. ];
  78. // Generate characters consisting of starts, midpoints, and ends.
  79. $chars = [];
  80. $charcodes = [];
  81. foreach ($starts as $key => $value) {
  82. $charcodes[] = $starts[$key];
  83. $chars[] = $this->code2utf($starts[$key]);
  84. $mid = round(0.5 * ($starts[$key] + $ends[$key]));
  85. $charcodes[] = $mid;
  86. $chars[] = $this->code2utf($mid);
  87. $charcodes[] = $ends[$key];
  88. $chars[] = $this->code2utf($ends[$key]);
  89. }
  90. // Merge into a string and tokenize.
  91. $string = implode('', $chars);
  92. $out = trim(search_simplify($string));
  93. $expected = mb_strtolower(implode(' ', $chars));
  94. // Verify that the output matches what we expect.
  95. $this->assertEqual($out, $expected, 'CJK tokenizer worked on all supplied CJK characters');
  96. }
  97. /**
  98. * Verifies that strings of non-CJK characters are not tokenized.
  99. *
  100. * This is just a sanity check - it verifies that strings of letters are
  101. * not tokenized.
  102. */
  103. public function testNoTokenizer() {
  104. // Set the minimum word size to 1 (to split all CJK characters) and make
  105. // sure CJK tokenizing is turned on.
  106. $this->config('search.settings')
  107. ->set('index.minimum_word_size', 1)
  108. ->set('index.overlap_cjk', TRUE)
  109. ->save();
  110. $this->refreshVariables();
  111. $letters = 'abcdefghijklmnopqrstuvwxyz';
  112. $out = trim(search_simplify($letters));
  113. $this->assertEqual($letters, $out, 'Letters are not CJK tokenized');
  114. }
  115. /**
  116. * Like PHP chr() function, but for unicode characters.
  117. *
  118. * Function chr() only works for ASCII characters up to character 255. This
  119. * function converts a number to the corresponding unicode character. Adapted
  120. * from functions supplied in comments on several functions on php.net.
  121. */
  122. public function code2utf($num) {
  123. if ($num < 128) {
  124. return chr($num);
  125. }
  126. if ($num < 2048) {
  127. return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
  128. }
  129. if ($num < 65536) {
  130. return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
  131. }
  132. if ($num < 2097152) {
  133. return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
  134. }
  135. return '';
  136. }
  137. }