PageRenderTime 92ms CodeModel.GetById 89ms app.highlight 16ms RepoModel.GetById 94ms app.codeStats 1ms

/tests/src/Plugin/Processor/TokenizerTest.php

https://github.com/prateeksachan/search-api-sandbox
PHP | 307 lines | 185 code | 34 blank | 88 comment | 11 complexity | 4c004d3f70e46d8ba72f1df3adeb18a5 MD5 | raw file
  1<?php
  2
  3/**
  4 * @file
  5 * Contains \Drupal\search_api\Tests\Plugin\Processor\TokenizerTest.
  6 */
  7
  8namespace Drupal\search_api\Tests\Plugin\Processor;
  9
 10use Drupal\Component\Utility\Unicode;
 11use Drupal\search_api\Plugin\SearchApi\Processor\Tokenizer;
 12use Drupal\search_api\Utility\Utility;
 13use Drupal\Tests\UnitTestCase;
 14
 15/**
 16 * Tests the Tokenizer processor plugin.
 17 *
 18 * @group Drupal
 19 * @group search_api
 20 */
 21class TokenizerTest extends UnitTestCase {
 22
 23  use ProcessorTestTrait;
 24
 25  /**
 26   * {@inheritdoc}
 27   */
 28  public static function getInfo() {
 29    return array(
 30      'name' => 'Tokenizer processor test',
 31      'description' => 'Test if Tokenizer processor works.',
 32      'group' => 'Search API',
 33    );
 34  }
 35
 36  /**
 37   * {@inheritdoc}
 38   */
 39  protected function setUp() {
 40    parent::setUp();
 41
 42    $this->processor = new Tokenizer(array(), 'tokenizer', array());
 43  }
 44
 45  /**
 46   * Tests the processFieldValue() method.
 47   *
 48   * @dataProvider textDataProvider
 49   */
 50  public function testProcessFieldValue($passedString, $expectedValue, array $config = array()) {
 51    if ($config) {
 52      $this->processor->setConfiguration($config);
 53    }
 54    $this->invokeMethod('processFieldValue', array(&$passedString, 'text'));
 55    $this->assertEquals($expectedValue, $passedString);
 56  }
 57
 58  /**
 59   * Data provider for testValueConfiguration().
 60   */
 61  public function textDataProvider() {
 62    $word_token = Utility::createTextToken('word');
 63    return array(
 64      // Simple cases.
 65      array('word', array($word_token)),
 66      array('word word', array($word_token, $word_token)),
 67      // Default splits on special characters, too.
 68      array('words!word', array(Utility::createTextToken('words'), $word_token)),
 69      array('words$word', array(Utility::createTextToken('words'), $word_token)),
 70      // Overriding the default works and is case-insensitive.
 71      array('wordXwordxword', array($word_token, Utility::createTextToken('wordxword')), array('spaces' => 'X')),
 72      array('word3word!word', array($word_token, Utility::createTextToken('word!word')), array('spaces' => '\d')),
 73      array('wordXwordRword', array($word_token, $word_token, $word_token), array('spaces' => 'R-Z')),
 74      array('wordXwordRword', array($word_token, $word_token, $word_token), array('spaces' => 'R-TW-Z')),
 75      array('wordXword word', array($word_token, $word_token, $word_token), array('spaces' => 'R-Z')),
 76      // Minimum word size works.
 77      array('wordSwo', array($word_token), array('spaces' => 'R-Z')),
 78      array('wordSwo', array($word_token, Utility::createTextToken('wo')), array('spaces' => 'R-Z', 'minimum_word_size' => 2)),
 79      array('word w', array($word_token), array('minimum_word_size' => 2)),
 80      array('word w', array($word_token, Utility::createTextToken('w')), array('minimum_word_size' => 1)),
 81      array('word wordword', array(), array('minimum_word_size' => 10)),
 82    );
 83  }
 84
 85
 86  /**
 87   * Tests that the simplifyText() method handles CJK characters properly.
 88   *
 89   * The simplifyText() method does special things with numbers, symbols and
 90   * punctuation. So we only test that CJK characters that are not in these
 91   * character classes are tokenized properly. See PREG_CLASS_CJK for more
 92   * information.
 93   */
 94  public function testCjkSupport() {
 95    $this->invokeMethod('prepare');
 96    // Create a string of CJK characters from various character ranges in
 97    // the Unicode tables.
 98
 99    // Beginnings of the character ranges.
100    $starts = array(
101      'CJK unified' => 0x4e00,
102      'CJK Ext A' => 0x3400,
103      'CJK Compat' => 0xf900,
104      'Hangul Jamo' => 0x1100,
105      'Hangul Ext A' => 0xa960,
106      'Hangul Ext B' => 0xd7b0,
107      'Hangul Compat' => 0x3131,
108      'Half non-punct 1' => 0xff21,
109      'Half non-punct 2' => 0xff41,
110      'Half non-punct 3' => 0xff66,
111      'Hangul Syllables' => 0xac00,
112      'Hiragana' => 0x3040,
113      'Katakana' => 0x30a1,
114      'Katakana Ext' => 0x31f0,
115      'CJK Reserve 1' => 0x20000,
116      'CJK Reserve 2' => 0x30000,
117      'Bomofo' => 0x3100,
118      'Bomofo Ext' => 0x31a0,
119      'Lisu' => 0xa4d0,
120      'Yi' => 0xa000,
121    );
122
123    // Ends of the character ranges.
124    $ends = array(
125      'CJK unified' => 0x9fcf,
126      'CJK Ext A' => 0x4dbf,
127      'CJK Compat' => 0xfaff,
128      'Hangul Jamo' => 0x11ff,
129      'Hangul Ext A' => 0xa97f,
130      'Hangul Ext B' => 0xd7ff,
131      'Hangul Compat' => 0x318e,
132      'Half non-punct 1' => 0xff3a,
133      'Half non-punct 2' => 0xff5a,
134      'Half non-punct 3' => 0xffdc,
135      'Hangul Syllables' => 0xd7af,
136      'Hiragana' => 0x309f,
137      'Katakana' => 0x30ff,
138      'Katakana Ext' => 0x31ff,
139      'CJK Reserve 1' => 0x2fffd,
140      'CJK Reserve 2' => 0x3fffd,
141      'Bomofo' => 0x312f,
142      'Bomofo Ext' => 0x31b7,
143      'Lisu' => 0xa4fd,
144      'Yi' => 0xa48f,
145    );
146
147    // Generate characters consisting of starts, midpoints, and ends.
148    $chars = array();
149    foreach ($starts as $key => $value) {
150      $chars[] = self::codepointToUtf8($starts[$key]);
151      $mid = round(0.5 * ($starts[$key] + $ends[$key]));
152      $chars[] = self::codepointToUtf8($mid);
153      $chars[] = self::codepointToUtf8($ends[$key]);
154    }
155
156    // Merge into a string and tokenize.
157    $text = implode('', $chars);
158
159    $simplified_text = $this->invokeMethod('simplifyText', array($text));
160    $expected = '';
161    for ($i = 2; $i < count($chars); ++$i) {
162      $expected .= $chars[$i - 2];
163      $expected .= $chars[$i - 1];
164      $expected .= $chars[$i];
165      $expected .= ' ';
166    }
167    $expected = trim($expected);
168
169    // Verify that the output matches what we expect.
170    $this->assertEquals($expected, $simplified_text, 'CJK tokenizer worked on all supplied CJK characters');
171
172    $this->processor->setConfiguration(array('overlap_cjk' => FALSE));
173    $this->invokeMethod('prepare');
174    $simplified_text = $this->invokeMethod('simplifyText', array($text));
175    $this->assertEquals($text, $simplified_text, 'CJK tokenizing is successfully disabled');
176  }
177
178  /**
179   * Verifies that strings of non-CJK characters are not tokenized.
180   *
181   * This is just a sanity check - it verifies that strings of letters are
182   * not tokenized.
183   */
184  public function testNoTokenizer() {
185    // Set the minimum word size to 1 (to split all CJK characters).
186    $this->processor->setConfiguration(array('minimum_word_size' => 1));
187    $this->invokeMethod('prepare');
188
189    $letters = 'abcdefghijklmnopqrstuvwxyz';
190    $out = $this->invokeMethod('simplifyText', array($letters));
191
192    $this->assertEquals($letters, $out, 'Latin letters are not CJK tokenized');
193  }
194
195  /**
196   * Like PHP chr() function, but for unicode characters.
197   *
198   * chr() only works for ASCII characters up to character 255. This function
199   * converts a number to the corresponding unicode character. Adapted from
200   * functions supplied in comments on several functions on php.net.
201   */
202  public static function codepointToUtf8($num) {
203    if ($num < 128) {
204      return chr($num);
205    }
206
207    if ($num < 2048) {
208      return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
209    }
210
211    if ($num < 65536) {
212      return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
213    }
214
215    if ($num < 2097152) {
216      return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
217    }
218
219    return '';
220  }
221
222  /**
223   * Tests that all Unicode characters simplify correctly.
224   *
225   * This test uses a Drupal core search file that was constructed so that the
226   * even lines are boundary characters, and the odd lines are valid word
227   * characters. (It was generated as a sequence of all the Unicode characters,
228   * and then the boundary chararacters (punctuation, spaces, etc.) were split
229   * off into their own lines).  So the even-numbered lines should simplify to
230   * nothing, and the odd-numbered lines we need to split into shorter chunks
231   * and verify that simplification doesn't lose any characters.
232   *
233   */
234  public function testSearchSimplifyUnicode() {
235    // Set the minimum word size to 1 (to split all CJK characters).
236    $this->processor->setConfiguration(array('minimum_word_size' => 1));
237    $this->invokeMethod('prepare');
238
239    $input = file_get_contents(DRUPAL_ROOT . '/core/modules/search/tests/UnicodeTest.txt');
240    $basestrings = explode(chr(10), $input);
241    $strings = array();
242    foreach ($basestrings as $key => $string) {
243      if ($key %2) {
244        // Even line - should simplify down to a space.
245        $simplified = $this->invokeMethod('simplifyText', array($string));
246        $this->assertEquals('', $simplified, "Line $key is excluded from the index");
247      }
248      else {
249        // Odd line, should be word characters.
250        // Split this into 30-character chunks, so we don't run into limits
251        // of truncation in search_simplify().
252        $start = 0;
253        while ($start < Unicode::strlen($string)) {
254          $newstr = Unicode::substr($string, $start, 30);
255          // Special case: leading zeros are removed from numeric strings,
256          // and there's one string in this file that is numbers starting with
257          // zero, so prepend a 1 on that string.
258          if (preg_match('/^[0-9]+$/', $newstr)) {
259            $newstr = '1' . $newstr;
260          }
261          $strings[] = $newstr;
262          $start += 30;
263        }
264      }
265    }
266    foreach ($strings as $key => $string) {
267      $simplified = $this->invokeMethod('simplifyText', array($string));
268      $this->assertTrue(Unicode::strlen($simplified) >= Unicode::strlen($string), "Nothing is removed from string $key.");
269    }
270
271    // Test the low-numbered ASCII control characters separately. They are not
272    // in the text file because they are problematic for diff, especially \0.
273    $string = '';
274    for ($i = 0; $i < 32; $i++) {
275      $string .= chr($i);
276    }
277    $this->assertEquals('', $this->invokeMethod('simplifyText', array($string)), 'Search simplify works for ASCII control characters.');
278  }
279
280  /**
281   * Tests whether punctuation is treated correctly.
282   *
283   * @dataProvider searchSimplifyPunctuationProvider
284   */
285  public function testSearchSimplifyPunctuation($passedString, $expectedValue, $message) {
286    // Set the minimum word size to 1 (to split all CJK characters).
287    $this->processor->setConfiguration(array('minimum_word_size' => 1));
288    $this->invokeMethod('prepare');
289
290    $out = $this->invokeMethod('simplifyText', array($passedString));
291    $this->assertEquals($expectedValue, $out, $message);
292  }
293
294  /**
295   * Data provider for testSearchSimplifyPunctuation().
296   */
297  public function searchSimplifyPunctuationProvider() {
298    $cases = array(
299      array('20.03/94-28,876', '20039428876', 'Punctuation removed from numbers'),
300      array('great...drupal--module', 'great drupal module', 'Multiple dot and dashes are word boundaries'),
301      array('very_great-drupal.module', 'verygreatdrupalmodule', 'Single dot, dash, underscore are removed'),
302      array('regular,punctuation;word', 'regular punctuation word', 'Punctuation is a word boundary'),
303    );
304    return $cases;
305  }
306
307}