PageRenderTime 34ms CodeModel.GetById 11ms app.highlight 16ms RepoModel.GetById 1ms app.codeStats 1ms

/core/modules/search/src/Tests/SearchTokenizerTest.php

https://github.com/build2be/drupal
PHP | 153 lines | 95 code | 17 blank | 41 comment | 4 complexity | f373c897272931d7df6546f8c78ef606 MD5 | raw file
  1<?php
  2
  3/**
  4 * @file
  5 * Definition of Drupal\search\Tests\SearchTokenizerTest.
  6 */
  7
  8namespace Drupal\search\Tests;
  9
/**
 * Tests that the CJK tokenizer works as intended.
 *
 * @group search
 */
 15class SearchTokenizerTest extends SearchTestBase {
 16  /**
 17   * Verifies that strings of CJK characters are tokenized.
 18   *
 19   * The search_simplify() function does special things with numbers, symbols,
 20   * and punctuation. So we only test that CJK characters that are not in these
 21   * character classes are tokenized properly. See PREG_CLASS_CKJ for more
 22   * information.
 23   */
 24  function testTokenizer() {
 25    // Set the minimum word size to 1 (to split all CJK characters) and make
 26    // sure CJK tokenizing is turned on.
 27    \Drupal::config('search.settings')
 28      ->set('index.minimum_word_size', 1)
 29      ->set('index.overlap_cjk', TRUE)
 30      ->save();
 31    $this->refreshVariables();
 32
 33    // Create a string of CJK characters from various character ranges in
 34    // the Unicode tables.
 35
 36    // Beginnings of the character ranges.
 37    $starts = array(
 38      'CJK unified' => 0x4e00,
 39      'CJK Ext A' => 0x3400,
 40      'CJK Compat' => 0xf900,
 41      'Hangul Jamo' => 0x1100,
 42      'Hangul Ext A' => 0xa960,
 43      'Hangul Ext B' => 0xd7b0,
 44      'Hangul Compat' => 0x3131,
 45      'Half non-punct 1' => 0xff21,
 46      'Half non-punct 2' => 0xff41,
 47      'Half non-punct 3' => 0xff66,
 48      'Hangul Syllables' => 0xac00,
 49      'Hiragana' => 0x3040,
 50      'Katakana' => 0x30a1,
 51      'Katakana Ext' => 0x31f0,
 52      'CJK Reserve 1' => 0x20000,
 53      'CJK Reserve 2' => 0x30000,
 54      'Bomofo' => 0x3100,
 55      'Bomofo Ext' => 0x31a0,
 56      'Lisu' => 0xa4d0,
 57      'Yi' => 0xa000,
 58    );
 59
 60    // Ends of the character ranges.
 61    $ends = array(
 62      'CJK unified' => 0x9fcf,
 63      'CJK Ext A' => 0x4dbf,
 64      'CJK Compat' => 0xfaff,
 65      'Hangul Jamo' => 0x11ff,
 66      'Hangul Ext A' => 0xa97f,
 67      'Hangul Ext B' => 0xd7ff,
 68      'Hangul Compat' => 0x318e,
 69      'Half non-punct 1' => 0xff3a,
 70      'Half non-punct 2' => 0xff5a,
 71      'Half non-punct 3' => 0xffdc,
 72      'Hangul Syllables' => 0xd7af,
 73      'Hiragana' => 0x309f,
 74      'Katakana' => 0x30ff,
 75      'Katakana Ext' => 0x31ff,
 76      'CJK Reserve 1' => 0x2fffd,
 77      'CJK Reserve 2' => 0x3fffd,
 78      'Bomofo' => 0x312f,
 79      'Bomofo Ext' => 0x31b7,
 80      'Lisu' => 0xa4fd,
 81      'Yi' => 0xa48f,
 82    );
 83
 84    // Generate characters consisting of starts, midpoints, and ends.
 85    $chars = array();
 86    $charcodes = array();
 87    foreach ($starts as $key => $value) {
 88      $charcodes[] = $starts[$key];
 89      $chars[] = $this->code2utf($starts[$key]);
 90      $mid = round(0.5 * ($starts[$key] + $ends[$key]));
 91      $charcodes[] = $mid;
 92      $chars[] = $this->code2utf($mid);
 93      $charcodes[] = $ends[$key];
 94      $chars[] = $this->code2utf($ends[$key]);
 95    }
 96
 97    // Merge into a string and tokenize.
 98    $string = implode('', $chars);
 99    $out = trim(search_simplify($string));
100    $expected = drupal_strtolower(implode(' ', $chars));
101
102    // Verify that the output matches what we expect.
103    $this->assertEqual($out, $expected, 'CJK tokenizer worked on all supplied CJK characters');
104  }
105
106  /**
107   * Verifies that strings of non-CJK characters are not tokenized.
108   *
109   * This is just a sanity check - it verifies that strings of letters are
110   * not tokenized.
111   */
112  function testNoTokenizer() {
113    // Set the minimum word size to 1 (to split all CJK characters) and make
114    // sure CJK tokenizing is turned on.
115    \Drupal::config('search.settings')
116      ->set('minimum_word_size', 1)
117      ->set('overlap_cjk', TRUE)
118      ->save();
119    $this->refreshVariables();
120
121    $letters = 'abcdefghijklmnopqrstuvwxyz';
122    $out = trim(search_simplify($letters));
123
124    $this->assertEqual($letters, $out, 'Letters are not CJK tokenized');
125  }
126
127  /**
128   * Like PHP chr() function, but for unicode characters.
129   *
130   * chr() only works for ASCII characters up to character 255. This function
131   * converts a number to the corresponding unicode character. Adapted from
132   * functions supplied in comments on several functions on php.net.
133   */
134  function code2utf($num) {
135    if ($num < 128) {
136      return chr($num);
137    }
138
139    if ($num < 2048) {
140      return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
141    }
142
143    if ($num < 65536) {
144      return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
145    }
146
147    if ($num < 2097152) {
148      return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
149    }
150
151    return '';
152  }
153}