PageRenderTime 2ms CodeModel.GetById 23ms app.highlight 5ms RepoModel.GetById 1ms app.codeStats 0ms

/core/modules/search/src/Tests/SearchTokenizerTest.php

https://gitlab.com/longphu/drupal8
PHP | 155 lines | 96 code | 18 blank | 41 comment | 4 complexity | 69d131b2806465dc54cad837ed227c36 MD5 | raw file
  1<?php
  2
  3/**
  4 * @file
  5 * Definition of Drupal\search\Tests\SearchTokenizerTest.
  6 */
  7
  8namespace Drupal\search\Tests;
  9use Drupal\Component\Utility\Unicode;
 10
 11/**
  12 * Tests that the CJK tokenizer works as intended.
 13 *
 14 * @group search
 15 */
 16class SearchTokenizerTest extends SearchTestBase {
 17
 18  /**
 19   * Verifies that strings of CJK characters are tokenized.
 20   *
 21   * The search_simplify() function does special things with numbers, symbols,
 22   * and punctuation. So we only test that CJK characters that are not in these
 23   * character classes are tokenized properly. See PREG_CLASS_CKJ for more
 24   * information.
 25   */
 26  function testTokenizer() {
 27    // Set the minimum word size to 1 (to split all CJK characters) and make
 28    // sure CJK tokenizing is turned on.
 29    $this->config('search.settings')
 30      ->set('index.minimum_word_size', 1)
 31      ->set('index.overlap_cjk', TRUE)
 32      ->save();
 33    $this->refreshVariables();
 34
 35    // Create a string of CJK characters from various character ranges in
 36    // the Unicode tables.
 37
 38    // Beginnings of the character ranges.
 39    $starts = array(
 40      'CJK unified' => 0x4e00,
 41      'CJK Ext A' => 0x3400,
 42      'CJK Compat' => 0xf900,
 43      'Hangul Jamo' => 0x1100,
 44      'Hangul Ext A' => 0xa960,
 45      'Hangul Ext B' => 0xd7b0,
 46      'Hangul Compat' => 0x3131,
 47      'Half non-punct 1' => 0xff21,
 48      'Half non-punct 2' => 0xff41,
 49      'Half non-punct 3' => 0xff66,
 50      'Hangul Syllables' => 0xac00,
 51      'Hiragana' => 0x3040,
 52      'Katakana' => 0x30a1,
 53      'Katakana Ext' => 0x31f0,
 54      'CJK Reserve 1' => 0x20000,
 55      'CJK Reserve 2' => 0x30000,
 56      'Bomofo' => 0x3100,
 57      'Bomofo Ext' => 0x31a0,
 58      'Lisu' => 0xa4d0,
 59      'Yi' => 0xa000,
 60    );
 61
 62    // Ends of the character ranges.
 63    $ends = array(
 64      'CJK unified' => 0x9fcf,
 65      'CJK Ext A' => 0x4dbf,
 66      'CJK Compat' => 0xfaff,
 67      'Hangul Jamo' => 0x11ff,
 68      'Hangul Ext A' => 0xa97f,
 69      'Hangul Ext B' => 0xd7ff,
 70      'Hangul Compat' => 0x318e,
 71      'Half non-punct 1' => 0xff3a,
 72      'Half non-punct 2' => 0xff5a,
 73      'Half non-punct 3' => 0xffdc,
 74      'Hangul Syllables' => 0xd7af,
 75      'Hiragana' => 0x309f,
 76      'Katakana' => 0x30ff,
 77      'Katakana Ext' => 0x31ff,
 78      'CJK Reserve 1' => 0x2fffd,
 79      'CJK Reserve 2' => 0x3fffd,
 80      'Bomofo' => 0x312f,
 81      'Bomofo Ext' => 0x31b7,
 82      'Lisu' => 0xa4fd,
 83      'Yi' => 0xa48f,
 84    );
 85
 86    // Generate characters consisting of starts, midpoints, and ends.
 87    $chars = array();
 88    $charcodes = array();
 89    foreach ($starts as $key => $value) {
 90      $charcodes[] = $starts[$key];
 91      $chars[] = $this->code2utf($starts[$key]);
 92      $mid = round(0.5 * ($starts[$key] + $ends[$key]));
 93      $charcodes[] = $mid;
 94      $chars[] = $this->code2utf($mid);
 95      $charcodes[] = $ends[$key];
 96      $chars[] = $this->code2utf($ends[$key]);
 97    }
 98
 99    // Merge into a string and tokenize.
100    $string = implode('', $chars);
101    $out = trim(search_simplify($string));
102    $expected = Unicode::strtolower(implode(' ', $chars));
103
104    // Verify that the output matches what we expect.
105    $this->assertEqual($out, $expected, 'CJK tokenizer worked on all supplied CJK characters');
106  }
107
108  /**
109   * Verifies that strings of non-CJK characters are not tokenized.
110   *
111   * This is just a sanity check - it verifies that strings of letters are
112   * not tokenized.
113   */
114  function testNoTokenizer() {
115    // Set the minimum word size to 1 (to split all CJK characters) and make
116    // sure CJK tokenizing is turned on.
117    $this->config('search.settings')
118      ->set('index.minimum_word_size', 1)
119      ->set('index.overlap_cjk', TRUE)
120      ->save();
121    $this->refreshVariables();
122
123    $letters = 'abcdefghijklmnopqrstuvwxyz';
124    $out = trim(search_simplify($letters));
125
126    $this->assertEqual($letters, $out, 'Letters are not CJK tokenized');
127  }
128
129  /**
130   * Like PHP chr() function, but for unicode characters.
131   *
132   * chr() only works for ASCII characters up to character 255. This function
133   * converts a number to the corresponding unicode character. Adapted from
134   * functions supplied in comments on several functions on php.net.
135   */
136  function code2utf($num) {
137    if ($num < 128) {
138      return chr($num);
139    }
140
141    if ($num < 2048) {
142      return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
143    }
144
145    if ($num < 65536) {
146      return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
147    }
148
149    if ($num < 2097152) {
150      return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
151    }
152
153    return '';
154  }
155}