PageRenderTime 64ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/tests/src/Plugin/Processor/TokenizerTest.php

https://github.com/prateeksachan/search-api-sandbox
PHP | 307 lines | 185 code | 34 blank | 88 comment | 11 complexity | 4c004d3f70e46d8ba72f1df3adeb18a5 MD5 | raw file
  1. <?php
  2. /**
  3. * @file
  4. * Contains \Drupal\search_api\Tests\Plugin\Processor\TokenizerTest.
  5. */
  6. namespace Drupal\search_api\Tests\Plugin\Processor;
  7. use Drupal\Component\Utility\Unicode;
  8. use Drupal\search_api\Plugin\SearchApi\Processor\Tokenizer;
  9. use Drupal\search_api\Utility\Utility;
  10. use Drupal\Tests\UnitTestCase;
  11. /**
  12. * Tests the Tokenizer processor plugin.
  13. *
  14. * @group Drupal
  15. * @group search_api
  16. */
  17. class TokenizerTest extends UnitTestCase {
  18. use ProcessorTestTrait;
  19. /**
  20. * {@inheritdoc}
  21. */
  public static function getInfo() {
    // Assemble the test's name, description and group one entry at a time.
    $info = [];
    $info['name'] = 'Tokenizer processor test';
    $info['description'] = 'Test if Tokenizer processor works.';
    $info['group'] = 'Search API';

    return $info;
  }
  29. /**
  30. * {@inheritdoc}
  31. */
  protected function setUp() {
    parent::setUp();

    // Every test starts out with a freshly constructed Tokenizer.
    $tokenizer = new Tokenizer(array(), 'tokenizer', array());
    $this->processor = $tokenizer;
  }
  36. /**
  37. * Tests the processFieldValue() method.
  38. *
  39. * @dataProvider textDataProvider
  40. */
  public function testProcessFieldValue($passedString, $expectedValue, array $config = array()) {
    // Only touch the processor's configuration when the data set supplies one.
    if (!empty($config)) {
      $this->processor->setConfiguration($config);
    }

    // The value is passed by reference, so whatever processFieldValue() does
    // to it is visible in $passedString afterwards.
    $arguments = array(&$passedString, 'text');
    $this->invokeMethod('processFieldValue', $arguments);

    $this->assertEquals($expectedValue, $passedString);
  }
  48. /**
  49. * Data provider for testProcessFieldValue().
  50. */
  public function textDataProvider() {
    // Tokens that show up in several of the expected results.
    $word = Utility::createTextToken('word');
    $words = Utility::createTextToken('words');

    // Each data set is: input string, expected tokens, optional configuration.
    $cases = [];

    // Plain input, default configuration.
    $cases[] = ['word', [$word]];
    $cases[] = ['word word', [$word, $word]];

    // With the defaults, special characters separate words as well.
    $cases[] = ['words!word', [$words, $word]];
    $cases[] = ['words$word', [$words, $word]];

    // A custom "spaces" setting overrides the default. The expected tokens
    // show it being applied case-sensitively: "X" splits, "x" does not.
    $cases[] = ['wordXwordxword', [$word, Utility::createTextToken('wordxword')], ['spaces' => 'X']];
    $cases[] = ['word3word!word', [$word, Utility::createTextToken('word!word')], ['spaces' => '\d']];
    $cases[] = ['wordXwordRword', [$word, $word, $word], ['spaces' => 'R-Z']];
    $cases[] = ['wordXwordRword', [$word, $word, $word], ['spaces' => 'R-TW-Z']];
    $cases[] = ['wordXword word', [$word, $word, $word], ['spaces' => 'R-Z']];

    // Tokens shorter than the minimum word size are dropped.
    $cases[] = ['wordSwo', [$word], ['spaces' => 'R-Z']];
    $cases[] = ['wordSwo', [$word, Utility::createTextToken('wo')], ['spaces' => 'R-Z', 'minimum_word_size' => 2]];
    $cases[] = ['word w', [$word], ['minimum_word_size' => 2]];
    $cases[] = ['word w', [$word, Utility::createTextToken('w')], ['minimum_word_size' => 1]];
    $cases[] = ['word wordword', [], ['minimum_word_size' => 10]];

    return $cases;
  }
  74. /**
  75. * Tests that the simplifyText() method handles CJK characters properly.
  76. *
  77. * The simplifyText() method does special things with numbers, symbols and
  78. * punctuation. So we only test that CJK characters that are not in these
  79. * character classes are tokenized properly. See PREG_CLASS_CJK for more
  80. * information.
  81. */
  public function testCjkSupport() {
    $this->invokeMethod('prepare');

    // First and last code point of every Unicode range exercised by this
    // test, keyed by a descriptive label.
    $ranges = [
      'CJK unified' => [0x4e00, 0x9fcf],
      'CJK Ext A' => [0x3400, 0x4dbf],
      'CJK Compat' => [0xf900, 0xfaff],
      'Hangul Jamo' => [0x1100, 0x11ff],
      'Hangul Ext A' => [0xa960, 0xa97f],
      'Hangul Ext B' => [0xd7b0, 0xd7ff],
      'Hangul Compat' => [0x3131, 0x318e],
      'Half non-punct 1' => [0xff21, 0xff3a],
      'Half non-punct 2' => [0xff41, 0xff5a],
      'Half non-punct 3' => [0xff66, 0xffdc],
      'Hangul Syllables' => [0xac00, 0xd7af],
      'Hiragana' => [0x3040, 0x309f],
      'Katakana' => [0x30a1, 0x30ff],
      'Katakana Ext' => [0x31f0, 0x31ff],
      'CJK Reserve 1' => [0x20000, 0x2fffd],
      'CJK Reserve 2' => [0x30000, 0x3fffd],
      'Bomofo' => [0x3100, 0x312f],
      'Bomofo Ext' => [0x31a0, 0x31b7],
      'Lisu' => [0xa4d0, 0xa4fd],
      'Yi' => [0xa000, 0xa48f],
    ];

    // For each range, take its first, middle and last character.
    $chars = [];
    foreach ($ranges as $range) {
      list($first, $last) = $range;
      $middle = round(0.5 * ($first + $last));
      $chars[] = self::codepointToUtf8($first);
      $chars[] = self::codepointToUtf8($middle);
      $chars[] = self::codepointToUtf8($last);
    }

    // Glue everything into one string and run it through the tokenizer.
    $text = implode('', $chars);
    $simplified_text = $this->invokeMethod('simplifyText', [$text]);

    // The expected output consists of every run of three consecutive
    // characters, with the runs separated by single spaces.
    $trigrams = [];
    $total = count($chars);
    for ($last_index = 2; $last_index < $total; ++$last_index) {
      $trigrams[] = $chars[$last_index - 2] . $chars[$last_index - 1] . $chars[$last_index];
    }
    $expected = implode(' ', $trigrams);

    $this->assertEquals($expected, $simplified_text, 'CJK tokenizer worked on all supplied CJK characters');

    // With "overlap_cjk" switched off, the text has to come back unchanged.
    $this->processor->setConfiguration(['overlap_cjk' => FALSE]);
    $this->invokeMethod('prepare');
    $simplified_text = $this->invokeMethod('simplifyText', [$text]);
    $this->assertEquals($text, $simplified_text, 'CJK tokenizing is successfully disabled');
  }
  158. /**
  159. * Verifies that strings of non-CJK characters are not tokenized.
  160. *
  161. * This is just a sanity check - it verifies that strings of letters are
  162. * not tokenized.
  163. */
  public function testNoTokenizer() {
    // Run with a minimum word size of 1.
    $this->processor->setConfiguration(['minimum_word_size' => 1]);
    $this->invokeMethod('prepare');

    // A plain run of Latin letters has to pass through simplifyText()
    // untouched.
    $alphabet = 'abcdefghijklmnopqrstuvwxyz';
    $simplified = $this->invokeMethod('simplifyText', [$alphabet]);

    $this->assertEquals($alphabet, $simplified, 'Latin letters are not CJK tokenized');
  }
  172. /**
  173. * Like PHP's chr() function, but for Unicode characters.
  174. *
  175. * chr() only works for single-byte values up to 255. This function
  176. * converts a code point number to its UTF-8 encoded character. Adapted from
  177. * functions supplied in comments on several functions on php.net.
  178. */
  public static function codepointToUtf8($num) {
    // Callers may hand in a float (for instance the result of round()); the
    // bit operations below need an integer.
    $num = (int) $num;

    // Negative numbers, UTF-16 surrogates (U+D800..U+DFFF) and values beyond
    // U+10FFFF have no valid UTF-8 encoding. Return an empty string for them
    // instead of emitting a malformed byte sequence.
    if ($num < 0 || $num > 0x10FFFF || ($num >= 0xD800 && $num <= 0xDFFF)) {
      return '';
    }

    // One byte: 0xxxxxxx.
    if ($num < 0x80) {
      return chr($num);
    }

    // Two bytes: 110xxxxx 10xxxxxx.
    if ($num < 0x800) {
      return chr(0xC0 | ($num >> 6))
        . chr(0x80 | ($num & 0x3F));
    }

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
    if ($num < 0x10000) {
      return chr(0xE0 | ($num >> 12))
        . chr(0x80 | (($num >> 6) & 0x3F))
        . chr(0x80 | ($num & 0x3F));
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    return chr(0xF0 | ($num >> 18))
      . chr(0x80 | (($num >> 12) & 0x3F))
      . chr(0x80 | (($num >> 6) & 0x3F))
      . chr(0x80 | ($num & 0x3F));
  }
  194. /**
  195. * Tests that all Unicode characters simplify correctly.
  196. *
  197. * This test uses a Drupal core search file that was constructed so that the
  198. * even lines are boundary characters, and the odd lines are valid word
  199. * characters. (It was generated as a sequence of all the Unicode characters,
  200. * and then the boundary characters (punctuation, spaces, etc.) were split
  201. * off into their own lines). So the even-numbered lines should simplify to
  202. * nothing, and the odd-numbered lines we need to split into shorter chunks
  203. * and verify that simplification doesn't lose any characters.
  204. *
  205. */
  public function testSearchSimplifyUnicode() {
    // Run with a minimum word size of 1.
    $this->processor->setConfiguration(array('minimum_word_size' => 1));
    $this->invokeMethod('prepare');

    // Load the fixture and fail explicitly when it cannot be read. Otherwise
    // explode() would be handed FALSE and most of this test would pass
    // without checking anything.
    $fixture = DRUPAL_ROOT . '/core/modules/search/tests/UnicodeTest.txt';
    $input = is_readable($fixture) ? file_get_contents($fixture) : FALSE;
    if ($input === FALSE) {
      $this->fail("Unicode fixture file $fixture is not readable.");
    }

    $basestrings = explode(chr(10), $input);
    $strings = array();
    foreach ($basestrings as $key => $string) {
      if ($key % 2) {
        // Odd zero-based index, i.e. an even-numbered line of the file: only
        // boundary characters, so it must simplify to an empty string.
        $simplified = $this->invokeMethod('simplifyText', array($string));
        $this->assertEquals('', $simplified, "Line $key is excluded from the index");
        continue;
      }

      // Odd-numbered line of the file: word characters. Split it into
      // 30-character chunks so that no single string runs into truncation
      // limits during simplification.
      $length = Unicode::strlen($string);
      for ($start = 0; $start < $length; $start += 30) {
        $newstr = Unicode::substr($string, $start, 30);
        // Special case: leading zeros are removed from numeric strings, and
        // there's one string in this file that is numbers starting with zero,
        // so prepend a 1 to any purely numeric chunk.
        if (preg_match('/^[0-9]+$/', $newstr)) {
          $newstr = '1' . $newstr;
        }
        $strings[] = $newstr;
      }
    }

    // Simplifying a chunk of word characters must never make it shorter.
    foreach ($strings as $key => $string) {
      $simplified = $this->invokeMethod('simplifyText', array($string));
      $this->assertTrue(Unicode::strlen($simplified) >= Unicode::strlen($string), "Nothing is removed from string $key.");
    }

    // Test the low-numbered ASCII control characters separately. They are not
    // in the text file because they are problematic for diff, especially \0.
    $string = '';
    for ($i = 0; $i < 32; $i++) {
      $string .= chr($i);
    }
    $this->assertEquals('', $this->invokeMethod('simplifyText', array($string)), 'Search simplify works for ASCII control characters.');
  }
  249. /**
  250. * Tests whether punctuation is treated correctly.
  251. *
  252. * @dataProvider searchSimplifyPunctuationProvider
  253. */
  public function testSearchSimplifyPunctuation($passedString, $expectedValue, $message) {
    // Run with a minimum word size of 1.
    $settings = ['minimum_word_size' => 1];
    $this->processor->setConfiguration($settings);
    $this->invokeMethod('prepare');

    // Simplify the input and compare it with the expected result.
    $simplified = $this->invokeMethod('simplifyText', [$passedString]);
    $this->assertEquals($expectedValue, $simplified, $message);
  }
  261. /**
  262. * Data provider for testSearchSimplifyPunctuation().
  263. */
  public function searchSimplifyPunctuationProvider() {
    // Each data set is: input string, expected simplified string, assertion
    // message.
    return [
      ['20.03/94-28,876', '20039428876', 'Punctuation removed from numbers'],
      ['great...drupal--module', 'great drupal module', 'Multiple dot and dashes are word boundaries'],
      ['very_great-drupal.module', 'verygreatdrupalmodule', 'Single dot, dash, underscore are removed'],
      ['regular,punctuation;word', 'regular punctuation word', 'Punctuation is a word boundary'],
    ];
  }
  273. }