PageRenderTime 23ms CodeModel.GetById 10ms app.highlight 9ms RepoModel.GetById 1ms app.codeStats 0ms

/core/modules/search/tests/src/Functional/SearchTokenizerTest.php

http://github.com/drupal/drupal
PHP | 162 lines | 98 code | 21 blank | 43 comment | 4 complexity | f11b8db793c90a5795726eb5a40dd960 MD5 | raw file
  1<?php
  2
  3namespace Drupal\Tests\search\Functional;
  4
  5use Drupal\Tests\BrowserTestBase;
  6
  7/**
  8 * Tests that CJK tokenizer works as intended.
  9 *
 10 * @group search
 11 */
 12class SearchTokenizerTest extends BrowserTestBase {
 13
 14  /**
 15   * {@inheritdoc}
 16   */
 17  protected static $modules = ['search'];
 18
 19  /**
 20   * {@inheritdoc}
 21   */
 22  protected $defaultTheme = 'stark';
 23
 24  /**
 25   * Verifies that strings of CJK characters are tokenized.
 26   *
 27   * The search_simplify() function does special things with numbers, symbols,
 28   * and punctuation. So we only test that CJK characters that are not in these
 29   * character classes are tokenized properly. See PREG_CLASS_CKJ for more
 30   * information.
 31   */
 32  public function testTokenizer() {
 33    // Set the minimum word size to 1 (to split all CJK characters) and make
 34    // sure CJK tokenizing is turned on.
 35    $this->config('search.settings')
 36      ->set('index.minimum_word_size', 1)
 37      ->set('index.overlap_cjk', TRUE)
 38      ->save();
 39    $this->refreshVariables();
 40
 41    // Create a string of CJK characters from various character ranges in
 42    // the Unicode tables.
 43
 44    // Beginnings of the character ranges.
 45    $starts = [
 46      'CJK unified' => 0x4e00,
 47      'CJK Ext A' => 0x3400,
 48      'CJK Compat' => 0xf900,
 49      'Hangul Jamo' => 0x1100,
 50      'Hangul Ext A' => 0xa960,
 51      'Hangul Ext B' => 0xd7b0,
 52      'Hangul Compat' => 0x3131,
 53      'Half non-punct 1' => 0xff21,
 54      'Half non-punct 2' => 0xff41,
 55      'Half non-punct 3' => 0xff66,
 56      'Hangul Syllables' => 0xac00,
 57      'Hiragana' => 0x3040,
 58      'Katakana' => 0x30a1,
 59      'Katakana Ext' => 0x31f0,
 60      'CJK Reserve 1' => 0x20000,
 61      'CJK Reserve 2' => 0x30000,
 62      'Bomofo' => 0x3100,
 63      'Bomofo Ext' => 0x31a0,
 64      'Lisu' => 0xa4d0,
 65      'Yi' => 0xa000,
 66    ];
 67
 68    // Ends of the character ranges.
 69    $ends = [
 70      'CJK unified' => 0x9fcf,
 71      'CJK Ext A' => 0x4dbf,
 72      'CJK Compat' => 0xfaff,
 73      'Hangul Jamo' => 0x11ff,
 74      'Hangul Ext A' => 0xa97f,
 75      'Hangul Ext B' => 0xd7ff,
 76      'Hangul Compat' => 0x318e,
 77      'Half non-punct 1' => 0xff3a,
 78      'Half non-punct 2' => 0xff5a,
 79      'Half non-punct 3' => 0xffdc,
 80      'Hangul Syllables' => 0xd7af,
 81      'Hiragana' => 0x309f,
 82      'Katakana' => 0x30ff,
 83      'Katakana Ext' => 0x31ff,
 84      'CJK Reserve 1' => 0x2fffd,
 85      'CJK Reserve 2' => 0x3fffd,
 86      'Bomofo' => 0x312f,
 87      'Bomofo Ext' => 0x31b7,
 88      'Lisu' => 0xa4fd,
 89      'Yi' => 0xa48f,
 90    ];
 91
 92    // Generate characters consisting of starts, midpoints, and ends.
 93    $chars = [];
 94    $charcodes = [];
 95    foreach ($starts as $key => $value) {
 96      $charcodes[] = $starts[$key];
 97      $chars[] = $this->code2utf($starts[$key]);
 98      $mid = round(0.5 * ($starts[$key] + $ends[$key]));
 99      $charcodes[] = $mid;
100      $chars[] = $this->code2utf($mid);
101      $charcodes[] = $ends[$key];
102      $chars[] = $this->code2utf($ends[$key]);
103    }
104
105    // Merge into a string and tokenize.
106    $string = implode('', $chars);
107    $out = trim(search_simplify($string));
108    $expected = mb_strtolower(implode(' ', $chars));
109
110    // Verify that the output matches what we expect.
111    $this->assertEqual($out, $expected, 'CJK tokenizer worked on all supplied CJK characters');
112  }
113
114  /**
115   * Verifies that strings of non-CJK characters are not tokenized.
116   *
117   * This is just a sanity check - it verifies that strings of letters are
118   * not tokenized.
119   */
120  public function testNoTokenizer() {
121    // Set the minimum word size to 1 (to split all CJK characters) and make
122    // sure CJK tokenizing is turned on.
123    $this->config('search.settings')
124      ->set('index.minimum_word_size', 1)
125      ->set('index.overlap_cjk', TRUE)
126      ->save();
127    $this->refreshVariables();
128
129    $letters = 'abcdefghijklmnopqrstuvwxyz';
130    $out = trim(search_simplify($letters));
131
132    $this->assertEqual($letters, $out, 'Letters are not CJK tokenized');
133  }
134
135  /**
136   * Like PHP chr() function, but for unicode characters.
137   *
138   * Function chr() only works for ASCII characters up to character 255. This
139   * function converts a number to the corresponding unicode character. Adapted
140   * from functions supplied in comments on several functions on php.net.
141   */
142  public function code2utf($num) {
143    if ($num < 128) {
144      return chr($num);
145    }
146
147    if ($num < 2048) {
148      return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
149    }
150
151    if ($num < 65536) {
152      return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
153    }
154
155    if ($num < 2097152) {
156      return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
157    }
158
159    return '';
160  }
161
162}