PageRenderTime 25ms CodeModel.GetById 12ms app.highlight 10ms RepoModel.GetById 1ms app.codeStats 0ms

/core/modules/search/src/Tests/SearchTokenizerTest.php

https://github.com/besutra/freeraum
PHP | 159 lines | 102 code | 18 blank | 39 comment | 4 complexity | c179c9c8241753046944afb107f80f0c MD5 | raw file
  1<?php
  2
  3/**
  4 * @file
  5 * Definition of Drupal\search\Tests\SearchTokenizerTest.
  6 */
  7
  8namespace Drupal\search\Tests;
  9
 10/**
 11 * Test the CJK tokenizer.
 12 */
 13class SearchTokenizerTest extends SearchTestBase {
 14  public static function getInfo() {
 15    return array(
 16      'name' => 'CJK tokenizer',
 17      'description' => 'Check that CJK tokenizer works as intended.',
 18      'group' => 'Search',
 19    );
 20  }
 21
 22  /**
 23   * Verifies that strings of CJK characters are tokenized.
 24   *
 25   * The search_simplify() function does special things with numbers, symbols,
 26   * and punctuation. So we only test that CJK characters that are not in these
 27   * character classes are tokenized properly. See PREG_CLASS_CKJ for more
 28   * information.
 29   */
 30  function testTokenizer() {
 31    // Set the minimum word size to 1 (to split all CJK characters) and make
 32    // sure CJK tokenizing is turned on.
 33    \Drupal::config('search.settings')
 34      ->set('index.minimum_word_size', 1)
 35      ->set('index.overlap_cjk', TRUE)
 36      ->save();
 37    $this->refreshVariables();
 38
 39    // Create a string of CJK characters from various character ranges in
 40    // the Unicode tables.
 41
 42    // Beginnings of the character ranges.
 43    $starts = array(
 44      'CJK unified' => 0x4e00,
 45      'CJK Ext A' => 0x3400,
 46      'CJK Compat' => 0xf900,
 47      'Hangul Jamo' => 0x1100,
 48      'Hangul Ext A' => 0xa960,
 49      'Hangul Ext B' => 0xd7b0,
 50      'Hangul Compat' => 0x3131,
 51      'Half non-punct 1' => 0xff21,
 52      'Half non-punct 2' => 0xff41,
 53      'Half non-punct 3' => 0xff66,
 54      'Hangul Syllables' => 0xac00,
 55      'Hiragana' => 0x3040,
 56      'Katakana' => 0x30a1,
 57      'Katakana Ext' => 0x31f0,
 58      'CJK Reserve 1' => 0x20000,
 59      'CJK Reserve 2' => 0x30000,
 60      'Bomofo' => 0x3100,
 61      'Bomofo Ext' => 0x31a0,
 62      'Lisu' => 0xa4d0,
 63      'Yi' => 0xa000,
 64    );
 65
 66    // Ends of the character ranges.
 67    $ends = array(
 68      'CJK unified' => 0x9fcf,
 69      'CJK Ext A' => 0x4dbf,
 70      'CJK Compat' => 0xfaff,
 71      'Hangul Jamo' => 0x11ff,
 72      'Hangul Ext A' => 0xa97f,
 73      'Hangul Ext B' => 0xd7ff,
 74      'Hangul Compat' => 0x318e,
 75      'Half non-punct 1' => 0xff3a,
 76      'Half non-punct 2' => 0xff5a,
 77      'Half non-punct 3' => 0xffdc,
 78      'Hangul Syllables' => 0xd7af,
 79      'Hiragana' => 0x309f,
 80      'Katakana' => 0x30ff,
 81      'Katakana Ext' => 0x31ff,
 82      'CJK Reserve 1' => 0x2fffd,
 83      'CJK Reserve 2' => 0x3fffd,
 84      'Bomofo' => 0x312f,
 85      'Bomofo Ext' => 0x31b7,
 86      'Lisu' => 0xa4fd,
 87      'Yi' => 0xa48f,
 88    );
 89
 90    // Generate characters consisting of starts, midpoints, and ends.
 91    $chars = array();
 92    $charcodes = array();
 93    foreach ($starts as $key => $value) {
 94      $charcodes[] = $starts[$key];
 95      $chars[] = $this->code2utf($starts[$key]);
 96      $mid = round(0.5 * ($starts[$key] + $ends[$key]));
 97      $charcodes[] = $mid;
 98      $chars[] = $this->code2utf($mid);
 99      $charcodes[] = $ends[$key];
100      $chars[] = $this->code2utf($ends[$key]);
101    }
102
103    // Merge into a string and tokenize.
104    $string = implode('', $chars);
105    $out = trim(search_simplify($string));
106    $expected = drupal_strtolower(implode(' ', $chars));
107
108    // Verify that the output matches what we expect.
109    $this->assertEqual($out, $expected, 'CJK tokenizer worked on all supplied CJK characters');
110  }
111
112  /**
113   * Verifies that strings of non-CJK characters are not tokenized.
114   *
115   * This is just a sanity check - it verifies that strings of letters are
116   * not tokenized.
117   */
118  function testNoTokenizer() {
119    // Set the minimum word size to 1 (to split all CJK characters) and make
120    // sure CJK tokenizing is turned on.
121    \Drupal::config('search.settings')
122      ->set('minimum_word_size', 1)
123      ->set('overlap_cjk', TRUE)
124      ->save();
125    $this->refreshVariables();
126
127    $letters = 'abcdefghijklmnopqrstuvwxyz';
128    $out = trim(search_simplify($letters));
129
130    $this->assertEqual($letters, $out, 'Letters are not CJK tokenized');
131  }
132
133  /**
134   * Like PHP chr() function, but for unicode characters.
135   *
136   * chr() only works for ASCII characters up to character 255. This function
137   * converts a number to the corresponding unicode character. Adapted from
138   * functions supplied in comments on several functions on php.net.
139   */
140  function code2utf($num) {
141    if ($num < 128) {
142      return chr($num);
143    }
144
145    if ($num < 2048) {
146      return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
147    }
148
149    if ($num < 65536) {
150      return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
151    }
152
153    if ($num < 2097152) {
154      return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
155    }
156
157    return '';
158  }
159}