/tests/Zend/Search/Lucene/AnalysisTest.php

https://github.com/WebTricks/WebTricks-CMS · PHP · 388 lines · 240 code · 88 blank · 60 comment · 12 complexity · bbb3356c1beacf9d5a3acc0c9e56da7c MD5 · raw file

  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage UnitTests
  18. * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id: AnalysisTest.php 20096 2010-01-06 02:05:09Z bkarwin $
  21. */
  22. /**
  23. * Zend_Search_Lucene
  24. */
  25. require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
  26. /**
  27. * PHPUnit test case
  28. */
  29. require_once 'PHPUnit/Framework/TestCase.php';
  30. /**
  31. * @category Zend
  32. * @package Zend_Search_Lucene
  33. * @subpackage UnitTests
  34. * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
  35. * @license http://framework.zend.com/license/new-bsd New BSD License
  36. * @group Zend_Search_Lucene
  37. */
  38. class Zend_Search_Lucene_AnalysisTest extends PHPUnit_Framework_TestCase
  39. {
  40. public function testAnalyzer()
  41. {
  42. $currentAnalyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
  43. $this->assertTrue($currentAnalyzer instanceof Zend_Search_Lucene_Analysis_Analyzer);
  44. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
  45. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php';
  46. $newAnalyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num();
  47. Zend_Search_Lucene_Analysis_Analyzer::setDefault($newAnalyzer);
  48. $this->assertTrue(Zend_Search_Lucene_Analysis_Analyzer::getDefault() === $newAnalyzer);
  49. // Set analyzer to the default value (used in other tests)
  50. Zend_Search_Lucene_Analysis_Analyzer::setDefault($currentAnalyzer);
  51. }
  52. public function testText()
  53. {
  54. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
  55. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
  56. $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text();
  57. $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');
  58. $this->assertEquals(count($tokenList), 3);
  59. $this->assertEquals($tokenList[0]->getTermText(), 'Word');
  60. $this->assertEquals($tokenList[0]->getStartOffset(), 0);
  61. $this->assertEquals($tokenList[0]->getEndOffset(), 4);
  62. $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
  63. $this->assertEquals($tokenList[1]->getTermText(), 'Word');
  64. $this->assertEquals($tokenList[1]->getStartOffset(), 6);
  65. $this->assertEquals($tokenList[1]->getEndOffset(), 10);
  66. $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);
  67. $this->assertEquals($tokenList[2]->getTermText(), 'anotherWord');
  68. $this->assertEquals($tokenList[2]->getStartOffset(), 12);
  69. $this->assertEquals($tokenList[2]->getEndOffset(), 23);
  70. $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
  71. }
  72. public function testTextCaseInsensitive()
  73. {
  74. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
  75. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
  76. $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
  77. $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');
  78. $this->assertEquals(count($tokenList), 3);
  79. $this->assertEquals($tokenList[0]->getTermText(), 'word');
  80. $this->assertEquals($tokenList[0]->getStartOffset(), 0);
  81. $this->assertEquals($tokenList[0]->getEndOffset(), 4);
  82. $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
  83. $this->assertEquals($tokenList[1]->getTermText(), 'word');
  84. $this->assertEquals($tokenList[1]->getStartOffset(), 6);
  85. $this->assertEquals($tokenList[1]->getEndOffset(), 10);
  86. $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);
  87. $this->assertEquals($tokenList[2]->getTermText(), 'anotherword');
  88. $this->assertEquals($tokenList[2]->getStartOffset(), 12);
  89. $this->assertEquals($tokenList[2]->getEndOffset(), 23);
  90. $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
  91. }
  92. public function testTextNum()
  93. {
  94. /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */
  95. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php';
  96. $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum();
  97. $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');
  98. $this->assertEquals(count($tokenList), 3);
  99. $this->assertEquals($tokenList[0]->getTermText(), 'Word1');
  100. $this->assertEquals($tokenList[0]->getStartOffset(), 0);
  101. $this->assertEquals($tokenList[0]->getEndOffset(), 5);
  102. $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
  103. $this->assertEquals($tokenList[1]->getTermText(), 'Word2');
  104. $this->assertEquals($tokenList[1]->getStartOffset(), 6);
  105. $this->assertEquals($tokenList[1]->getEndOffset(), 11);
  106. $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);
  107. $this->assertEquals($tokenList[2]->getTermText(), 'anotherWord');
  108. $this->assertEquals($tokenList[2]->getStartOffset(), 12);
  109. $this->assertEquals($tokenList[2]->getEndOffset(), 23);
  110. $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
  111. }
  112. public function testTextNumCaseInsensitive()
  113. {
  114. /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive */
  115. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php';
  116. $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive();
  117. $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');
  118. $this->assertEquals(count($tokenList), 3);
  119. $this->assertEquals($tokenList[0]->getTermText(), 'word1');
  120. $this->assertEquals($tokenList[0]->getStartOffset(), 0);
  121. $this->assertEquals($tokenList[0]->getEndOffset(), 5);
  122. $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
  123. $this->assertEquals($tokenList[1]->getTermText(), 'word2');
  124. $this->assertEquals($tokenList[1]->getStartOffset(), 6);
  125. $this->assertEquals($tokenList[1]->getEndOffset(), 11);
  126. $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);
  127. $this->assertEquals($tokenList[2]->getTermText(), 'anotherword');
  128. $this->assertEquals($tokenList[2]->getStartOffset(), 12);
  129. $this->assertEquals($tokenList[2]->getEndOffset(), 23);
  130. $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
  131. }
  132. public function testUtf8()
  133. {
  134. if (@preg_match('/\pL/u', 'a') != 1) {
  135. // PCRE unicode support is turned off
  136. return;
  137. }
  138. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
  139. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php';
  140. $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8();
  141. // UTF-8 text with a cyrillic symbols
  142. $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');
  143. $this->assertEquals(count($tokenList), 3);
  144. $this->assertEquals($tokenList[0]->getTermText(), 'Слово');
  145. $this->assertEquals($tokenList[0]->getStartOffset(), 0);
  146. $this->assertEquals($tokenList[0]->getEndOffset(), 5);
  147. $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
  148. $this->assertEquals($tokenList[1]->getTermText(), 'Слово');
  149. $this->assertEquals($tokenList[1]->getStartOffset(), 7);
  150. $this->assertEquals($tokenList[1]->getEndOffset(), 12);
  151. $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);
  152. $this->assertEquals($tokenList[2]->getTermText(), 'ДругоеСлово');
  153. $this->assertEquals($tokenList[2]->getStartOffset(), 14);
  154. $this->assertEquals($tokenList[2]->getEndOffset(), 25);
  155. $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
  156. }
  157. public function testUtf8Num()
  158. {
  159. if (@preg_match('/\pL/u', 'a') != 1) {
  160. // PCRE unicode support is turned off
  161. return;
  162. }
  163. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
  164. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php';
  165. $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num();
  166. // UTF-8 text with a cyrillic symbols
  167. $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');
  168. $this->assertEquals(count($tokenList), 3);
  169. $this->assertEquals($tokenList[0]->getTermText(), 'Слово1');
  170. $this->assertEquals($tokenList[0]->getStartOffset(), 0);
  171. $this->assertEquals($tokenList[0]->getEndOffset(), 6);
  172. $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
  173. $this->assertEquals($tokenList[1]->getTermText(), 'Слово2');
  174. $this->assertEquals($tokenList[1]->getStartOffset(), 7);
  175. $this->assertEquals($tokenList[1]->getEndOffset(), 13);
  176. $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);
  177. $this->assertEquals($tokenList[2]->getTermText(), 'ДругоеСлово');
  178. $this->assertEquals($tokenList[2]->getStartOffset(), 14);
  179. $this->assertEquals($tokenList[2]->getEndOffset(), 25);
  180. $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
  181. }
  182. public function testUtf8CaseInsensitive()
  183. {
  184. if (@preg_match('/\pL/u', 'a') != 1) {
  185. // PCRE unicode support is turned off
  186. return;
  187. }
  188. if (!function_exists('mb_strtolower')) {
  189. // mbstring extension is disabled
  190. return;
  191. }
  192. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive */
  193. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8/CaseInsensitive.php';
  194. $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive();
  195. // UTF-8 text with a cyrillic symbols
  196. $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');
  197. $this->assertEquals(count($tokenList), 3);
  198. $this->assertEquals($tokenList[0]->getTermText(), 'слово');
  199. $this->assertEquals($tokenList[0]->getStartOffset(), 0);
  200. $this->assertEquals($tokenList[0]->getEndOffset(), 5);
  201. $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
  202. $this->assertEquals($tokenList[1]->getTermText(), 'слово');
  203. $this->assertEquals($tokenList[1]->getStartOffset(), 7);
  204. $this->assertEquals($tokenList[1]->getEndOffset(), 12);
  205. $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);
  206. $this->assertEquals($tokenList[2]->getTermText(), 'другоеслово');
  207. $this->assertEquals($tokenList[2]->getStartOffset(), 14);
  208. $this->assertEquals($tokenList[2]->getEndOffset(), 25);
  209. $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
  210. }
  211. public function testUtf8NumCaseInsensitive()
  212. {
  213. if (@preg_match('/\pL/u', 'a') != 1) {
  214. // PCRE unicode support is turned off
  215. return;
  216. }
  217. if (!function_exists('mb_strtolower')) {
  218. // mbstring extension is disabled
  219. return;
  220. }
  221. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive */
  222. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num/CaseInsensitive.php';
  223. $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive();
  224. // UTF-8 text with a cyrillic symbols
  225. $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');
  226. $this->assertEquals(count($tokenList), 3);
  227. $this->assertEquals($tokenList[0]->getTermText(), 'слово1');
  228. $this->assertEquals($tokenList[0]->getStartOffset(), 0);
  229. $this->assertEquals($tokenList[0]->getEndOffset(), 6);
  230. $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
  231. $this->assertEquals($tokenList[1]->getTermText(), 'слово2');
  232. $this->assertEquals($tokenList[1]->getStartOffset(), 7);
  233. $this->assertEquals($tokenList[1]->getEndOffset(), 13);
  234. $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);
  235. $this->assertEquals($tokenList[2]->getTermText(), 'другоеслово');
  236. $this->assertEquals($tokenList[2]->getStartOffset(), 14);
  237. $this->assertEquals($tokenList[2]->getEndOffset(), 25);
  238. $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
  239. }
  240. public function testEncoding()
  241. {
  242. if (PHP_OS == 'AIX') {
  243. $this->markTestSkipped('Test not available on AIX');
  244. }
  245. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
  246. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php';
  247. $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8();
  248. // UTF-8 text with a cyrillic symbols
  249. $tokenList = $analyzer->tokenize(iconv('UTF-8', 'Windows-1251', 'Слово1 Слово2 ДругоеСлово'), 'Windows-1251');
  250. $this->assertEquals(count($tokenList), 3);
  251. $this->assertEquals($tokenList[0]->getTermText(), 'Слово');
  252. $this->assertEquals($tokenList[0]->getStartOffset(), 0);
  253. $this->assertEquals($tokenList[0]->getEndOffset(), 5);
  254. $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
  255. $this->assertEquals($tokenList[1]->getTermText(), 'Слово');
  256. $this->assertEquals($tokenList[1]->getStartOffset(), 7);
  257. $this->assertEquals($tokenList[1]->getEndOffset(), 12);
  258. $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);
  259. $this->assertEquals($tokenList[2]->getTermText(), 'ДругоеСлово');
  260. $this->assertEquals($tokenList[2]->getStartOffset(), 14);
  261. $this->assertEquals($tokenList[2]->getEndOffset(), 25);
  262. $this->assertEquals($tokenList[2]->getPositionIncrement(), 1);
  263. }
  264. public function testStopWords()
  265. {
  266. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
  267. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
  268. /** Zend_Search_Lucene_Analysis_TokenFilter_StopWords */
  269. require_once 'Zend/Search/Lucene/Analysis/TokenFilter/StopWords.php';
  270. $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
  271. $stopWordsFilter = new Zend_Search_Lucene_Analysis_TokenFilter_StopWords(array('word', 'and', 'or'));
  272. $analyzer->addFilter($stopWordsFilter);
  273. $tokenList = $analyzer->tokenize('Word1 Word2 anotherWord');
  274. $this->assertEquals(count($tokenList), 1);
  275. $this->assertEquals($tokenList[0]->getTermText(), 'anotherword');
  276. $this->assertEquals($tokenList[0]->getStartOffset(), 12);
  277. $this->assertEquals($tokenList[0]->getEndOffset(), 23);
  278. $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
  279. }
  280. public function testShortWords()
  281. {
  282. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
  283. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
  284. /** Zend_Search_Lucene_Analysis_TokenFilter_ShortWords */
  285. require_once 'Zend/Search/Lucene/Analysis/TokenFilter/ShortWords.php';
  286. $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
  287. $stopWordsFilter = new Zend_Search_Lucene_Analysis_TokenFilter_ShortWords(4 /* Minimal length */);
  288. $analyzer->addFilter($stopWordsFilter);
  289. $tokenList = $analyzer->tokenize('Word1 and anotherWord');
  290. $this->assertEquals(count($tokenList), 2);
  291. $this->assertEquals($tokenList[0]->getTermText(), 'word');
  292. $this->assertEquals($tokenList[0]->getStartOffset(), 0);
  293. $this->assertEquals($tokenList[0]->getEndOffset(), 4);
  294. $this->assertEquals($tokenList[0]->getPositionIncrement(), 1);
  295. $this->assertEquals($tokenList[1]->getTermText(), 'anotherword');
  296. $this->assertEquals($tokenList[1]->getStartOffset(), 10);
  297. $this->assertEquals($tokenList[1]->getEndOffset(), 21);
  298. $this->assertEquals($tokenList[1]->getPositionIncrement(), 1);
  299. }
  300. }