PageRenderTime 1ms CodeModel.GetById 3ms app.highlight 7ms RepoModel.GetById 5ms app.codeStats 0ms

/core/modules/search/tests/src/Functional/SearchTokenizerTest.php

https://github.com/drupal/drupal
PHP | 157 lines | 97 code | 20 blank | 40 comment | 4 complexity | 75bb542c44b0d9543a3d90a74529574c MD5 | raw file
  1<?php
  2
  3namespace Drupal\Tests\search\Functional;
  4
  5use Drupal\Tests\BrowserTestBase;
  6
  7/**
  8 * Tests that CJK tokenizer works as intended.
  9 *
 10 * @group search
 11 */
 12class SearchTokenizerTest extends BrowserTestBase {
 13
 14  /**
 15   * {@inheritdoc}
 16   */
 17  protected static $modules = ['search'];
 18
 19  /**
 20   * Verifies that strings of CJK characters are tokenized.
 21   *
 22   * The search_simplify() function does special things with numbers, symbols,
 23   * and punctuation. So we only test that CJK characters that are not in these
 24   * character classes are tokenized properly. See PREG_CLASS_CKJ for more
 25   * information.
 26   */
 27  public function testTokenizer() {
 28    // Set the minimum word size to 1 (to split all CJK characters) and make
 29    // sure CJK tokenizing is turned on.
 30    $this->config('search.settings')
 31      ->set('index.minimum_word_size', 1)
 32      ->set('index.overlap_cjk', TRUE)
 33      ->save();
 34    $this->refreshVariables();
 35
 36    // Create a string of CJK characters from various character ranges in
 37    // the Unicode tables.
 38
 39    // Beginnings of the character ranges.
 40    $starts = [
 41      'CJK unified' => 0x4e00,
 42      'CJK Ext A' => 0x3400,
 43      'CJK Compat' => 0xf900,
 44      'Hangul Jamo' => 0x1100,
 45      'Hangul Ext A' => 0xa960,
 46      'Hangul Ext B' => 0xd7b0,
 47      'Hangul Compat' => 0x3131,
 48      'Half non-punct 1' => 0xff21,
 49      'Half non-punct 2' => 0xff41,
 50      'Half non-punct 3' => 0xff66,
 51      'Hangul Syllables' => 0xac00,
 52      'Hiragana' => 0x3040,
 53      'Katakana' => 0x30a1,
 54      'Katakana Ext' => 0x31f0,
 55      'CJK Reserve 1' => 0x20000,
 56      'CJK Reserve 2' => 0x30000,
 57      'Bomofo' => 0x3100,
 58      'Bomofo Ext' => 0x31a0,
 59      'Lisu' => 0xa4d0,
 60      'Yi' => 0xa000,
 61    ];
 62
 63    // Ends of the character ranges.
 64    $ends = [
 65      'CJK unified' => 0x9fcf,
 66      'CJK Ext A' => 0x4dbf,
 67      'CJK Compat' => 0xfaff,
 68      'Hangul Jamo' => 0x11ff,
 69      'Hangul Ext A' => 0xa97f,
 70      'Hangul Ext B' => 0xd7ff,
 71      'Hangul Compat' => 0x318e,
 72      'Half non-punct 1' => 0xff3a,
 73      'Half non-punct 2' => 0xff5a,
 74      'Half non-punct 3' => 0xffdc,
 75      'Hangul Syllables' => 0xd7af,
 76      'Hiragana' => 0x309f,
 77      'Katakana' => 0x30ff,
 78      'Katakana Ext' => 0x31ff,
 79      'CJK Reserve 1' => 0x2fffd,
 80      'CJK Reserve 2' => 0x3fffd,
 81      'Bomofo' => 0x312f,
 82      'Bomofo Ext' => 0x31b7,
 83      'Lisu' => 0xa4fd,
 84      'Yi' => 0xa48f,
 85    ];
 86
 87    // Generate characters consisting of starts, midpoints, and ends.
 88    $chars = [];
 89    $charcodes = [];
 90    foreach ($starts as $key => $value) {
 91      $charcodes[] = $starts[$key];
 92      $chars[] = $this->code2utf($starts[$key]);
 93      $mid = round(0.5 * ($starts[$key] + $ends[$key]));
 94      $charcodes[] = $mid;
 95      $chars[] = $this->code2utf($mid);
 96      $charcodes[] = $ends[$key];
 97      $chars[] = $this->code2utf($ends[$key]);
 98    }
 99
100    // Merge into a string and tokenize.
101    $string = implode('', $chars);
102    $out = trim(search_simplify($string));
103    $expected = mb_strtolower(implode(' ', $chars));
104
105    // Verify that the output matches what we expect.
106    $this->assertEqual($out, $expected, 'CJK tokenizer worked on all supplied CJK characters');
107  }
108
109  /**
110   * Verifies that strings of non-CJK characters are not tokenized.
111   *
112   * This is just a sanity check - it verifies that strings of letters are
113   * not tokenized.
114   */
115  public function testNoTokenizer() {
116    // Set the minimum word size to 1 (to split all CJK characters) and make
117    // sure CJK tokenizing is turned on.
118    $this->config('search.settings')
119      ->set('index.minimum_word_size', 1)
120      ->set('index.overlap_cjk', TRUE)
121      ->save();
122    $this->refreshVariables();
123
124    $letters = 'abcdefghijklmnopqrstuvwxyz';
125    $out = trim(search_simplify($letters));
126
127    $this->assertEqual($letters, $out, 'Letters are not CJK tokenized');
128  }
129
130  /**
131   * Like PHP chr() function, but for unicode characters.
132   *
133   * Function chr() only works for ASCII characters up to character 255. This
134   * function converts a number to the corresponding unicode character. Adapted
135   * from functions supplied in comments on several functions on php.net.
136   */
137  public function code2utf($num) {
138    if ($num < 128) {
139      return chr($num);
140    }
141
142    if ($num < 2048) {
143      return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
144    }
145
146    if ($num < 65536) {
147      return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
148    }
149
150    if ($num < 2097152) {
151      return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
152    }
153
154    return '';
155  }
156
157}