PageRenderTime 54ms CodeModel.GetById 29ms RepoModel.GetById 0ms app.codeStats 0ms

/core/modules/search/src/Tests/SearchTokenizerTest.php

https://github.com/besutra/freeraum
PHP | 159 lines | 102 code | 18 blank | 39 comment | 4 complexity | c179c9c8241753046944afb107f80f0c MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.1, BSD-3-Clause, MIT
  1. <?php
  2. /**
  3. * @file
  4. * Definition of Drupal\search\Tests\SearchTokenizerTest.
  5. */
  6. namespace Drupal\search\Tests;
  7. /**
  8. * Test the CJK tokenizer.
  9. */
  class SearchTokenizerTest extends SearchTestBase {

    /**
     * Returns the name, description and group of this test case.
     *
     * @return array
     *   An associative array with the keys 'name', 'description' and 'group'.
     */
    public static function getInfo() {
      return array(
        'name' => 'CJK tokenizer',
        'description' => 'Check that CJK tokenizer works as intended.',
        'group' => 'Search',
      );
    }

    /**
     * Verifies that strings of CJK characters are tokenized.
     *
     * The search_simplify() function does special things with numbers, symbols,
     * and punctuation. So we only test that CJK characters that are not in these
     * character classes are tokenized properly. See PREG_CLASS_CJK for more
     * information.
     */
    public function testTokenizer() {
      // Set the minimum word size to 1 (to split all CJK characters) and make
      // sure CJK tokenizing is turned on.
      \Drupal::config('search.settings')
        ->set('index.minimum_word_size', 1)
        ->set('index.overlap_cjk', TRUE)
        ->save();
      $this->refreshVariables();

      // Create a string of CJK characters from various character ranges in
      // the Unicode tables.
      // Beginnings of the character ranges.
      $starts = array(
        'CJK unified' => 0x4e00,
        'CJK Ext A' => 0x3400,
        'CJK Compat' => 0xf900,
        'Hangul Jamo' => 0x1100,
        'Hangul Ext A' => 0xa960,
        'Hangul Ext B' => 0xd7b0,
        'Hangul Compat' => 0x3131,
        'Half non-punct 1' => 0xff21,
        'Half non-punct 2' => 0xff41,
        'Half non-punct 3' => 0xff66,
        'Hangul Syllables' => 0xac00,
        'Hiragana' => 0x3040,
        'Katakana' => 0x30a1,
        'Katakana Ext' => 0x31f0,
        'CJK Reserve 1' => 0x20000,
        'CJK Reserve 2' => 0x30000,
        'Bomofo' => 0x3100,
        'Bomofo Ext' => 0x31a0,
        'Lisu' => 0xa4d0,
        'Yi' => 0xa000,
      );

      // Ends of the character ranges. The keys must match those of $starts,
      // because the loop below looks up each end by the key of its start.
      $ends = array(
        'CJK unified' => 0x9fcf,
        'CJK Ext A' => 0x4dbf,
        'CJK Compat' => 0xfaff,
        'Hangul Jamo' => 0x11ff,
        'Hangul Ext A' => 0xa97f,
        'Hangul Ext B' => 0xd7ff,
        'Hangul Compat' => 0x318e,
        'Half non-punct 1' => 0xff3a,
        'Half non-punct 2' => 0xff5a,
        'Half non-punct 3' => 0xffdc,
        'Hangul Syllables' => 0xd7af,
        'Hiragana' => 0x309f,
        'Katakana' => 0x30ff,
        'Katakana Ext' => 0x31ff,
        'CJK Reserve 1' => 0x2fffd,
        'CJK Reserve 2' => 0x3fffd,
        'Bomofo' => 0x312f,
        'Bomofo Ext' => 0x31b7,
        'Lisu' => 0xa4fd,
        'Yi' => 0xa48f,
      );

      // Generate characters consisting of starts, midpoints, and ends, in that
      // order for every range.
      $chars = array();
      foreach ($starts as $key => $start) {
        $end = $ends[$key];
        // round() returns a float; cast it so that code2utf() applies its bit
        // operations to a real integer code point.
        $mid = (int) round(0.5 * ($start + $end));
        $chars[] = $this->code2utf($start);
        $chars[] = $this->code2utf($mid);
        $chars[] = $this->code2utf($end);
      }

      // Merge into a string and tokenize.
      $string = implode('', $chars);
      $out = trim(search_simplify($string));

      // The expected output is every character separated by a single space,
      // passed through drupal_strtolower() (presumably because
      // search_simplify() lower-cases its input - confirm against
      // search.module).
      $expected = drupal_strtolower(implode(' ', $chars));

      // Verify that the output matches what we expect.
      $this->assertEqual($out, $expected, 'CJK tokenizer worked on all supplied CJK characters');
    }

    /**
     * Verifies that strings of non-CJK characters are not tokenized.
     *
     * This is just a sanity check - it verifies that strings of letters are
     * not tokenized.
     */
    public function testNoTokenizer() {
      // Set the minimum word size to 1 (to split all CJK characters) and make
      // sure CJK tokenizing is turned on. The keys carry the 'index.' prefix,
      // exactly as in testTokenizer(), so that both tests configure the same
      // settings.
      \Drupal::config('search.settings')
        ->set('index.minimum_word_size', 1)
        ->set('index.overlap_cjk', TRUE)
        ->save();
      $this->refreshVariables();

      $letters = 'abcdefghijklmnopqrstuvwxyz';
      $out = trim(search_simplify($letters));

      $this->assertEqual($letters, $out, 'Letters are not CJK tokenized');
    }

    /**
     * Like PHP chr() function, but for unicode characters.
     *
     * chr() only works for ASCII characters up to character 255. This function
     * converts a number to the corresponding unicode character. Adapted from
     * functions supplied in comments on several functions on php.net.
     *
     * @param int $num
     *   The code point to encode.
     *
     * @return string
     *   The UTF-8 encoding of $num (one to four bytes), or an empty string if
     *   $num is 2097152 (0x200000) or greater.
     */
    public function code2utf($num) {
      // One byte: 0xxxxxxx.
      if ($num < 128) {
        return chr($num);
      }
      // Two bytes: 110xxxxx 10xxxxxx.
      if ($num < 2048) {
        return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
      }
      // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
      if ($num < 65536) {
        return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
      }
      // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      if ($num < 2097152) {
        return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
      }
      // Too large to fit in four bytes.
      return '';
    }

  }