/tests/Zend/Search/Lucene/AnalysisTest.php

https://github.com/jtai/zf2 · PHP · 382 lines · 231 code · 89 blank · 62 comment · 12 complexity · 2d2cc2ce5f519350a368aad3fab70159 MD5 · raw file

  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage UnitTests
  18. * @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. */
  21. /**
  22. * @namespace
  23. */
  24. namespace ZendTest\Search\Lucene;
  25. use Zend\Search\Lucene\Analysis\Analyzer;
  26. use Zend\Search\Lucene\Analysis\Analyzer\Common;
  27. use Zend\Search\Lucene\Analysis\Analyzer\Common\Text;
  28. use Zend\Search\Lucene\Analysis\Analyzer\Common\TextNum;
  29. use Zend\Search\Lucene\Analysis\Analyzer\Common\Utf8;
  30. use Zend\Search\Lucene\Analysis\Analyzer\Common\Utf8Num;
  31. /**
  32. * Zend_Search_Lucene
  33. */
  34. /**
  35. * PHPUnit test case
  36. */
  37. /**
  38. * @category Zend
  39. * @package Zend_Search_Lucene
  40. * @subpackage UnitTests
  41. * @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
  42. * @license http://framework.zend.com/license/new-bsd New BSD License
  43. * @group Zend_Search_Lucene
  44. */
  45. class AnalysisTest extends \PHPUnit_Framework_TestCase
  46. {
  47. public function testAnalyzer()
  48. {
  49. $currentAnalyzer = Analyzer\Analyzer::getDefault();
  50. $this->assertTrue($currentAnalyzer instanceof Analyzer);
  51. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
  52. $newAnalyzer = new Common\Utf8Num();
  53. Analyzer\Analyzer::setDefault($newAnalyzer);
  54. $this->assertTrue(Analyzer\Analyzer::getDefault() === $newAnalyzer);
  55. // Set analyzer to the default value (used in other tests)
  56. Analyzer\Analyzer::setDefault($currentAnalyzer);
  57. }
  58. public function testText()
  59. {
  60. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
  61. $analyzer = new Common\Text();
  62. $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');
  63. $this->assertEquals(count($tokenList), 3);
  64. $this->assertEquals($tokenList[0]->getTermText(), 'Word');
  65. $this->assertEquals($tokenList[0]->getStartOffset(), 0);
  66. $this->assertEquals($tokenList[0]->getEndOffset(), 4);
  67. $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
  68. $this->assertEquals($tokenList[1]->getTermText(), 'Word');
  69. $this->assertEquals($tokenList[1]->getStartOffset(), 6);
  70. $this->assertEquals($tokenList[1]->getEndOffset(), 10);
  71. $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);
  72. $this->assertEquals($tokenList[2]->getTermText(), 'anotherWord');
  73. $this->assertEquals($tokenList[2]->getStartOffset(), 12);
  74. $this->assertEquals($tokenList[2]->getEndOffset(), 23);
  75. $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
  76. }
  77. public function testTextCaseInsensitive()
  78. {
  79. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
  80. $analyzer = new Text\CaseInsensitive();
  81. $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');
  82. $this->assertEquals(count($tokenList), 3);
  83. $this->assertEquals($tokenList[0]->getTermText(), 'word');
  84. $this->assertEquals($tokenList[0]->getStartOffset(), 0);
  85. $this->assertEquals($tokenList[0]->getEndOffset(), 4);
  86. $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
  87. $this->assertEquals($tokenList[1]->getTermText(), 'word');
  88. $this->assertEquals($tokenList[1]->getStartOffset(), 6);
  89. $this->assertEquals($tokenList[1]->getEndOffset(), 10);
  90. $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);
  91. $this->assertEquals($tokenList[2]->getTermText(), 'anotherword');
  92. $this->assertEquals($tokenList[2]->getStartOffset(), 12);
  93. $this->assertEquals($tokenList[2]->getEndOffset(), 23);
  94. $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
  95. }
  96. public function testTextNum()
  97. {
  98. /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */
  99. $analyzer = new Common\TextNum();
  100. $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');
  101. $this->assertEquals(count($tokenList), 3);
  102. $this->assertEquals($tokenList[0]->getTermText(), 'Word1');
  103. $this->assertEquals($tokenList[0]->getStartOffset(), 0);
  104. $this->assertEquals($tokenList[0]->getEndOffset(), 5);
  105. $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
  106. $this->assertEquals($tokenList[1]->getTermText(), 'Word2');
  107. $this->assertEquals($tokenList[1]->getStartOffset(), 6);
  108. $this->assertEquals($tokenList[1]->getEndOffset(), 11);
  109. $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);
  110. $this->assertEquals($tokenList[2]->getTermText(), 'anotherWord');
  111. $this->assertEquals($tokenList[2]->getStartOffset(), 12);
  112. $this->assertEquals($tokenList[2]->getEndOffset(), 23);
  113. $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
  114. }
  115. public function testTextNumCaseInsensitive()
  116. {
  117. /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive */
  118. $analyzer = new TextNum\CaseInsensitive();
  119. $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');
  120. $this->assertEquals(count($tokenList), 3);
  121. $this->assertEquals($tokenList[0]->getTermText(), 'word1');
  122. $this->assertEquals($tokenList[0]->getStartOffset(), 0);
  123. $this->assertEquals($tokenList[0]->getEndOffset(), 5);
  124. $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
  125. $this->assertEquals($tokenList[1]->getTermText(), 'word2');
  126. $this->assertEquals($tokenList[1]->getStartOffset(), 6);
  127. $this->assertEquals($tokenList[1]->getEndOffset(), 11);
  128. $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);
  129. $this->assertEquals($tokenList[2]->getTermText(), 'anotherword');
  130. $this->assertEquals($tokenList[2]->getStartOffset(), 12);
  131. $this->assertEquals($tokenList[2]->getEndOffset(), 23);
  132. $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
  133. }
  134. public function testUtf8()
  135. {
  136. if (@preg_match('/\pL/u', 'a') != 1) {
  137. // PCRE unicode support is turned off
  138. return;
  139. }
  140. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
  141. $analyzer = new Common\Utf8();
  142. // UTF-8 text with a cyrillic symbols
  143. $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');
  144. $this->assertEquals(count($tokenList), 3);
  145. $this->assertEquals($tokenList[0]->getTermText(), 'Слово');
  146. $this->assertEquals($tokenList[0]->getStartOffset(), 0);
  147. $this->assertEquals($tokenList[0]->getEndOffset(), 5);
  148. $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
  149. $this->assertEquals($tokenList[1]->getTermText(), 'Слово');
  150. $this->assertEquals($tokenList[1]->getStartOffset(), 7);
  151. $this->assertEquals($tokenList[1]->getEndOffset(), 12);
  152. $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);
  153. $this->assertEquals($tokenList[2]->getTermText(), 'ДругоеСлово');
  154. $this->assertEquals($tokenList[2]->getStartOffset(), 14);
  155. $this->assertEquals($tokenList[2]->getEndOffset(), 25);
  156. $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
  157. }
  158. public function testUtf8Num()
  159. {
  160. if (@preg_match('/\pL/u', 'a') != 1) {
  161. // PCRE unicode support is turned off
  162. return;
  163. }
  164. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
  165. $analyzer = new Common\Utf8Num();
  166. // UTF-8 text with a cyrillic symbols
  167. $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');
  168. $this->assertEquals(count($tokenList), 3);
  169. $this->assertEquals($tokenList[0]->getTermText(), 'Слово1');
  170. $this->assertEquals($tokenList[0]->getStartOffset(), 0);
  171. $this->assertEquals($tokenList[0]->getEndOffset(), 6);
  172. $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
  173. $this->assertEquals($tokenList[1]->getTermText(), 'Слово2');
  174. $this->assertEquals($tokenList[1]->getStartOffset(), 7);
  175. $this->assertEquals($tokenList[1]->getEndOffset(), 13);
  176. $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);
  177. $this->assertEquals($tokenList[2]->getTermText(), 'ДругоеСлово');
  178. $this->assertEquals($tokenList[2]->getStartOffset(), 14);
  179. $this->assertEquals($tokenList[2]->getEndOffset(), 25);
  180. $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
  181. }
  182. public function testUtf8CaseInsensitive()
  183. {
  184. if (@preg_match('/\pL/u', 'a') != 1) {
  185. // PCRE unicode support is turned off
  186. return;
  187. }
  188. if (!function_exists('mb_strtolower')) {
  189. // mbstring extension is disabled
  190. return;
  191. }
  192. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive */
  193. $analyzer = new Utf8\CaseInsensitive();
  194. // UTF-8 text with a cyrillic symbols
  195. $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');
  196. $this->assertEquals(count($tokenList), 3);
  197. $this->assertEquals($tokenList[0]->getTermText(), 'слово');
  198. $this->assertEquals($tokenList[0]->getStartOffset(), 0);
  199. $this->assertEquals($tokenList[0]->getEndOffset(), 5);
  200. $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
  201. $this->assertEquals($tokenList[1]->getTermText(), 'слово');
  202. $this->assertEquals($tokenList[1]->getStartOffset(), 7);
  203. $this->assertEquals($tokenList[1]->getEndOffset(), 12);
  204. $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);
  205. $this->assertEquals($tokenList[2]->getTermText(), 'другоеслово');
  206. $this->assertEquals($tokenList[2]->getStartOffset(), 14);
  207. $this->assertEquals($tokenList[2]->getEndOffset(), 25);
  208. $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
  209. }
  210. public function testUtf8NumCaseInsensitive()
  211. {
  212. if (@preg_match('/\pL/u', 'a') != 1) {
  213. // PCRE unicode support is turned off
  214. return;
  215. }
  216. if (!function_exists('mb_strtolower')) {
  217. // mbstring extension is disabled
  218. return;
  219. }
  220. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive */
  221. $analyzer = new Utf8Num\CaseInsensitive();
  222. // UTF-8 text with a cyrillic symbols
  223. $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');
  224. $this->assertEquals(count($tokenList), 3);
  225. $this->assertEquals($tokenList[0]->getTermText(), 'слово1');
  226. $this->assertEquals($tokenList[0]->getStartOffset(), 0);
  227. $this->assertEquals($tokenList[0]->getEndOffset(), 6);
  228. $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
  229. $this->assertEquals($tokenList[1]->getTermText(), 'слово2');
  230. $this->assertEquals($tokenList[1]->getStartOffset(), 7);
  231. $this->assertEquals($tokenList[1]->getEndOffset(), 13);
  232. $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);
  233. $this->assertEquals($tokenList[2]->getTermText(), 'другоеслово');
  234. $this->assertEquals($tokenList[2]->getStartOffset(), 14);
  235. $this->assertEquals($tokenList[2]->getEndOffset(), 25);
  236. $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
  237. }
  238. public function testEncoding()
  239. {
  240. if (PHP_OS == 'AIX') {
  241. $this->markTestSkipped('Test not available on AIX');
  242. }
  243. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
  244. $analyzer = new Common\Utf8();
  245. // UTF-8 text with a cyrillic symbols
  246. $tokenList = $analyzer->tokenize(iconv('UTF-8', 'Windows-1251', 'Слово1 Слово2 ДругоеСлово'), 'Windows-1251');
  247. $this->assertEquals(count($tokenList), 3);
  248. $this->assertEquals($tokenList[0]->getTermText(), 'Слово');
  249. $this->assertEquals($tokenList[0]->getStartOffset(), 0);
  250. $this->assertEquals($tokenList[0]->getEndOffset(), 5);
  251. $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
  252. $this->assertEquals($tokenList[1]->getTermText(), 'Слово');
  253. $this->assertEquals($tokenList[1]->getStartOffset(), 7);
  254. $this->assertEquals($tokenList[1]->getEndOffset(), 12);
  255. $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);
  256. $this->assertEquals($tokenList[2]->getTermText(), 'ДругоеСлово');
  257. $this->assertEquals($tokenList[2]->getStartOffset(), 14);
  258. $this->assertEquals($tokenList[2]->getEndOffset(), 25);
  259. $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
  260. }
  261. public function testStopWords()
  262. {
  263. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
  264. /** Zend_Search_Lucene_Analysis_TokenFilter_StopWords */
  265. $analyzer = new Text\CaseInsensitive();
  266. $stopWordsFilter = new \Zend\Search\Lucene\Analysis\TokenFilter\StopWords(array('word', 'and', 'or'));
  267. $analyzer->addFilter($stopWordsFilter);
  268. $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');
  269. $this->assertEquals(count($tokenList), 1);
  270. $this->assertEquals($tokenList[0]->getTermText(), 'anotherword');
  271. $this->assertEquals($tokenList[0]->getStartOffset(), 12);
  272. $this->assertEquals($tokenList[0]->getEndOffset(), 23);
  273. $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
  274. }
  275. public function testShortWords()
  276. {
  277. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
  278. /** Zend_Search_Lucene_Analysis_TokenFilter_ShortWords */
  279. $analyzer = new Text\CaseInsensitive();
  280. $stopWordsFilter = new \Zend\Search\Lucene\Analysis\TokenFilter\ShortWords(4 /* Minimal length */);
  281. $analyzer->addFilter($stopWordsFilter);
  282. $tokenList = $analyzer->tokenize('Word1 and anotherWord');
  283. $this->assertEquals(count($tokenList), 2);
  284. $this->assertEquals($tokenList[0]->getTermText(), 'word');
  285. $this->assertEquals($tokenList[0]->getStartOffset(), 0);
  286. $this->assertEquals($tokenList[0]->getEndOffset(), 4);
  287. $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
  288. $this->assertEquals($tokenList[1]->getTermText(), 'anotherword');
  289. $this->assertEquals($tokenList[1]->getStartOffset(), 10);
  290. $this->assertEquals($tokenList[1]->getEndOffset(), 21);
  291. $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);
  292. }
  293. }