/sbw2/ZendFramework-1.11.4/tests/Zend/Search/Lucene/AnalysisTest.php

https://github.com/nbcutech/o3drupal · PHP · 383 lines · 239 code · 87 blank · 57 comment · 12 complexity · d9e32ed1e443c92af7293c9c47c9ab6e MD5 · raw file

  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage UnitTests
  18. * @copyright Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id: AnalysisTest.php 23775 2011-03-01 17:25:24Z ralph $
  21. */
  22. /**
  23. * Zend_Search_Lucene
  24. */
  25. require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
  26. /**
  27. * @category Zend
  28. * @package Zend_Search_Lucene
  29. * @subpackage UnitTests
  30. * @copyright Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
  31. * @license http://framework.zend.com/license/new-bsd New BSD License
  32. * @group Zend_Search_Lucene
  33. */
  class Zend_Search_Lucene_AnalysisTest extends PHPUnit_Framework_TestCase
  {
      /**
       * Sample input shared by the single-byte analyzer tests.
       */
      const LATIN_TEXT = 'Word1 Word2 anotherWord';

      /**
       * Sample input (UTF-8 encoded, cyrillic) shared by the UTF-8 analyzer tests.
       */
      const CYRILLIC_TEXT = 'Слово1 Слово2 ДругоеСлово';

      /**
       * Marks the running test as skipped when PCRE was built without unicode support.
       *
       * @return void
       */
      protected function _skipUnlessPcreUnicode()
      {
          // "@" is deliberate: the probe itself may emit a warning on a PCRE
          // build that does not understand the /u modifier or \pL.
          if (@preg_match('/\pL/u', 'a') != 1) {
              $this->markTestSkipped('PCRE unicode support is turned off');
          }
      }

      /**
       * Marks the running test as skipped when the mbstring extension is unavailable.
       *
       * @return void
       */
      protected function _skipUnlessMbstring()
      {
          if (!function_exists('mb_strtolower')) {
              $this->markTestSkipped('mbstring extension is disabled');
          }
      }

      /**
       * Asserts that a token list matches the expected tokens one-for-one.
       *
       * Each expected entry is array(termText, startOffset, endOffset). Every
       * token is additionally required to have a position increment of 1, which
       * is what all tests in this class expect.
       *
       * @param  array $expected  List of array(termText, startOffset, endOffset)
       * @param  array $tokenList Tokens returned by the analyzer's tokenize()
       * @return void
       */
      protected function _assertTokens(array $expected, $tokenList)
      {
          $this->assertEquals(count($expected), count($tokenList), 'Unexpected number of tokens');

          foreach ($expected as $index => $expectedToken) {
              list($termText, $startOffset, $endOffset) = $expectedToken;
              $token = $tokenList[$index];

              $this->assertEquals($termText,    $token->getTermText(),          "Token #$index: term text");
              $this->assertEquals($startOffset, $token->getStartOffset(),       "Token #$index: start offset");
              $this->assertEquals($endOffset,   $token->getEndOffset(),         "Token #$index: end offset");
              $this->assertEquals(1,            $token->getPositionIncrement(), "Token #$index: position increment");
          }
      }

      /**
       * The default analyzer can be read, replaced and read back.
       */
      public function testAnalyzer()
      {
          $currentAnalyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
          $this->assertTrue($currentAnalyzer instanceof Zend_Search_Lucene_Analysis_Analyzer);

          /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
          require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php';

          $newAnalyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num();
          Zend_Search_Lucene_Analysis_Analyzer::setDefault($newAnalyzer);
          $installedAnalyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();

          // Restore the original default (used in other tests) BEFORE asserting,
          // so a failing assertion cannot leak the replacement into later tests.
          Zend_Search_Lucene_Analysis_Analyzer::setDefault($currentAnalyzer);

          $this->assertSame($newAnalyzer, $installedAnalyzer);
      }

      /**
       * Text analyzer: letters only, case preserved.
       */
      public function testText()
      {
          /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
          require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';

          $analyzer  = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text();
          $tokenList = $analyzer->tokenize(self::LATIN_TEXT);

          $this->_assertTokens(
              array(
                  array('Word',         0,  4),
                  array('Word',         6, 10),
                  array('anotherWord', 12, 23),
              ),
              $tokenList
          );
      }

      /**
       * Text analyzer, case-insensitive variant: letters only, lower-cased.
       */
      public function testTextCaseInsensitive()
      {
          /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
          require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';

          $analyzer  = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
          $tokenList = $analyzer->tokenize(self::LATIN_TEXT);

          $this->_assertTokens(
              array(
                  array('word',         0,  4),
                  array('word',         6, 10),
                  array('anotherword', 12, 23),
              ),
              $tokenList
          );
      }

      /**
       * TextNum analyzer: letters and digits, case preserved.
       */
      public function testTextNum()
      {
          /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */
          require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php';

          $analyzer  = new Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum();
          $tokenList = $analyzer->tokenize(self::LATIN_TEXT);

          $this->_assertTokens(
              array(
                  array('Word1',        0,  5),
                  array('Word2',        6, 11),
                  array('anotherWord', 12, 23),
              ),
              $tokenList
          );
      }

      /**
       * TextNum analyzer, case-insensitive variant: letters and digits, lower-cased.
       */
      public function testTextNumCaseInsensitive()
      {
          /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive */
          require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php';

          $analyzer  = new Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive();
          $tokenList = $analyzer->tokenize(self::LATIN_TEXT);

          $this->_assertTokens(
              array(
                  array('word1',        0,  5),
                  array('word2',        6, 11),
                  array('anotherword', 12, 23),
              ),
              $tokenList
          );
      }

      /**
       * Utf8 analyzer: letters only, offsets counted in characters.
       */
      public function testUtf8()
      {
          $this->_skipUnlessPcreUnicode();

          /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
          require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php';

          $analyzer  = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8();
          $tokenList = $analyzer->tokenize(self::CYRILLIC_TEXT, 'UTF-8');

          $this->_assertTokens(
              array(
                  array('Слово',        0,  5),
                  array('Слово',        7, 12),
                  array('ДругоеСлово', 14, 25),
              ),
              $tokenList
          );
      }

      /**
       * Utf8Num analyzer: letters and digits, offsets counted in characters.
       */
      public function testUtf8Num()
      {
          $this->_skipUnlessPcreUnicode();

          /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
          require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php';

          $analyzer  = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num();
          $tokenList = $analyzer->tokenize(self::CYRILLIC_TEXT, 'UTF-8');

          $this->_assertTokens(
              array(
                  array('Слово1',       0,  6),
                  array('Слово2',       7, 13),
                  array('ДругоеСлово', 14, 25),
              ),
              $tokenList
          );
      }

      /**
       * Utf8 analyzer, case-insensitive variant: letters only, lower-cased.
       */
      public function testUtf8CaseInsensitive()
      {
          $this->_skipUnlessPcreUnicode();
          $this->_skipUnlessMbstring();

          /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive */
          require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8/CaseInsensitive.php';

          $analyzer  = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive();
          $tokenList = $analyzer->tokenize(self::CYRILLIC_TEXT, 'UTF-8');

          $this->_assertTokens(
              array(
                  array('слово',        0,  5),
                  array('слово',        7, 12),
                  array('другоеслово', 14, 25),
              ),
              $tokenList
          );
      }

      /**
       * Utf8Num analyzer, case-insensitive variant: letters and digits, lower-cased.
       */
      public function testUtf8NumCaseInsensitive()
      {
          $this->_skipUnlessPcreUnicode();
          $this->_skipUnlessMbstring();

          /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive */
          require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num/CaseInsensitive.php';

          $analyzer  = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive();
          $tokenList = $analyzer->tokenize(self::CYRILLIC_TEXT, 'UTF-8');

          $this->_assertTokens(
              array(
                  array('слово1',       0,  6),
                  array('слово2',       7, 13),
                  array('другоеслово', 14, 25),
              ),
              $tokenList
          );
      }

      /**
       * Utf8 analyzer fed with non-UTF-8 (Windows-1251) input and an explicit encoding.
       */
      public function testEncoding()
      {
          if (PHP_OS == 'AIX') {
              $this->markTestSkipped('Test not available on AIX');
          }
          // Uses the same Utf8 analyzer as testUtf8(), so guard it the same way.
          $this->_skipUnlessPcreUnicode();
          // iconv() is called directly below to prepare the input.
          if (!function_exists('iconv')) {
              $this->markTestSkipped('iconv extension is disabled');
          }

          /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
          require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php';

          $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8();

          // Cyrillic text converted from UTF-8 to Windows-1251 before tokenizing
          $tokenList = $analyzer->tokenize(iconv('UTF-8', 'Windows-1251', self::CYRILLIC_TEXT), 'Windows-1251');

          $this->_assertTokens(
              array(
                  array('Слово',        0,  5),
                  array('Слово',        7, 12),
                  array('ДругоеСлово', 14, 25),
              ),
              $tokenList
          );
      }

      /**
       * StopWords filter removes listed words; remaining token keeps its original offsets.
       */
      public function testStopWords()
      {
          /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
          require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
          /** Zend_Search_Lucene_Analysis_TokenFilter_StopWords */
          require_once 'Zend/Search/Lucene/Analysis/TokenFilter/StopWords.php';

          $analyzer        = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
          $stopWordsFilter = new Zend_Search_Lucene_Analysis_TokenFilter_StopWords(array('word', 'and', 'or'));
          $analyzer->addFilter($stopWordsFilter);

          $tokenList = $analyzer->tokenize(self::LATIN_TEXT);

          $this->_assertTokens(
              array(
                  array('anotherword', 12, 23),
              ),
              $tokenList
          );
      }

      /**
       * ShortWords filter removes tokens shorter than the configured minimal length.
       */
      public function testShortWords()
      {
          /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
          require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
          /** Zend_Search_Lucene_Analysis_TokenFilter_ShortWords */
          require_once 'Zend/Search/Lucene/Analysis/TokenFilter/ShortWords.php';

          $analyzer         = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
          $shortWordsFilter = new Zend_Search_Lucene_Analysis_TokenFilter_ShortWords(4 /* Minimal length */);
          $analyzer->addFilter($shortWordsFilter);

          $tokenList = $analyzer->tokenize('Word1 and anotherWord');

          $this->_assertTokens(
              array(
                  array('word',         0,  4),
                  array('anotherword', 10, 21),
              ),
              $tokenList
          );
      }
  }