PageRenderTime 60ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/ZendFramework/tests/Zend/Search/Lucene/DocumentTest.php

https://bitbucket.org/Dal-Papa/is-340-publish-base
PHP | 344 lines | 225 code | 62 blank | 57 comment | 16 complexity | b4c71d67c1602baf987418cb5c01216f MD5 | raw file
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage UnitTests
  18. * @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id: DocumentTest.php 24593 2012-01-05 20:35:02Z matthew $
  21. */
  22. /**
  23. * Zend_Search_Lucene
  24. */
  25. require_once 'Zend/Search/Lucene.php';
  26. /**
  27. * Zend_Search_Lucene_Document
  28. */
  29. require_once 'Zend/Search/Lucene/Document.php';
  30. /**
  31. * Zend_Search_Lucene_Document_Docx
  32. */
  33. require_once 'Zend/Search/Lucene/Document/Docx.php';
  34. /**
  35. * Zend_Search_Lucene_Document_Pptx
  36. */
  37. require_once 'Zend/Search/Lucene/Document/Pptx.php';
  38. /**
  39. * Zend_Search_Lucene_Document_Xlsx
  40. */
  41. require_once 'Zend/Search/Lucene/Document/Xlsx.php';
  42. /**
  43. * Zend_Search_Lucene_Document_Html
  44. */
  45. require_once 'Zend/Search/Lucene/Document/Html.php';
  46. /**
  47. * @category Zend
  48. * @package Zend_Search_Lucene
  49. * @subpackage UnitTests
  50. * @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
  51. * @license http://framework.zend.com/license/new-bsd New BSD License
  52. * @group Zend_Search_Lucene
  53. */
  class Zend_Search_Lucene_DocumentTest extends PHPUnit_Framework_TestCase
  {
      /**
       * Removes every non-directory entry found directly inside $dirName.
       *
       * Sub-directories are left untouched and unlink() failures are
       * deliberately suppressed (best-effort cleanup of a temporary index).
       *
       * @param string $dirName Directory to empty.
       * @return void
       */
      private function _clearDirectory($dirName)
      {
          if (!file_exists($dirName) || !is_dir($dirName)) {
              return;
          }

          // remove files from temporary directory
          $dir = opendir($dirName);
          if ($dir === false) {
              // opendir() failed (e.g. permissions); readdir()/closedir()
              // must not be called with a non-resource handle.
              return;
          }

          while (($file = readdir($dir)) !== false) {
              if (!is_dir($dirName . '/' . $file)) {
                  @unlink($dirName . '/' . $file);
              }
          }
          closedir($dir);
      }

      /**
       * A freshly created document has a boost of 1.
       */
      public function testCreate()
      {
          $document = new Zend_Search_Lucene_Document();

          $this->assertEquals(1, $document->boost);
      }

      /**
       * Fields added to a document are reachable through property access,
       * getField() and getFieldValue(); non-UTF-8 field values are converted
       * by getFieldUtf8Value().
       */
      public function testFields()
      {
          $document = new Zend_Search_Lucene_Document();

          $document->addField(Zend_Search_Lucene_Field::Text('title', 'Title'));
          $document->addField(Zend_Search_Lucene_Field::Text('annotation', 'Annotation'));
          $document->addField(Zend_Search_Lucene_Field::Text('body', 'Document body, document body, document body...'));

          // array_diff() is one-directional, so also pin the number of fields;
          // otherwise an empty field list would pass.
          $fieldNames = $document->getFieldNames();
          $fieldnamesDiffArray = array_diff($fieldNames, array('title', 'annotation', 'body'));
          $this->assertTrue(is_array($fieldnamesDiffArray));
          $this->assertEquals(0, count($fieldnamesDiffArray));
          $this->assertEquals(3, count($fieldNames));

          $this->assertEquals('Title', $document->title);
          $this->assertEquals('Annotation', $document->annotation);
          $this->assertEquals('Document body, document body, document body...', $document->body);

          $this->assertEquals('Title', $document->getField('title')->value);
          $this->assertEquals('Annotation', $document->getField('annotation')->value);
          $this->assertEquals('Document body, document body, document body...', $document->getField('body')->value);

          $this->assertEquals('Title', $document->getFieldValue('title'));
          $this->assertEquals('Annotation', $document->getFieldValue('annotation'));
          $this->assertEquals('Document body, document body, document body...', $document->getFieldValue('body'));

          if (PHP_OS == 'AIX') {
              return; // tests below here not valid on AIX
          }

          $wordsWithUmlautsIso88591 = iconv('UTF-8', 'ISO-8859-1', 'Words with umlauts: åãü...');
          $document->addField(Zend_Search_Lucene_Field::Text('description', $wordsWithUmlautsIso88591, 'ISO-8859-1'));
          $this->assertEquals($wordsWithUmlautsIso88591, $document->description);
          $this->assertEquals('Words with umlauts: åãü...', $document->getFieldUtf8Value('description'));
      }

      /**
       * addField() returns the document so that calls can be chained.
       */
      public function testAddFieldMethodChaining()
      {
          $document = new Zend_Search_Lucene_Document();
          $this->assertTrue($document->addField(Zend_Search_Lucene_Field::Text('title', 'Title')) instanceof Zend_Search_Lucene_Document);

          $document = new Zend_Search_Lucene_Document();
          $document->addField(Zend_Search_Lucene_Field::Text('title', 'Title'))
                   ->addField(Zend_Search_Lucene_Field::Text('annotation', 'Annotation'))
                   ->addField(Zend_Search_Lucene_Field::Text('body', 'Document body, document body, document body...'));

          // Verify that every link of the chain really registered its field.
          $fieldNames = $document->getFieldNames();
          $this->assertEquals(3, count($fieldNames));
          $this->assertEquals(0, count(array_diff(array('title', 'annotation', 'body'), $fieldNames)));
      }

      /**
       * highlight() wraps a matching word in a coloured <b> element.
       */
      public function testHtmlHighlighting()
      {
          $doc = Zend_Search_Lucene_Document_Html::loadHTML('<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>');
          $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);

          $doc->highlight('document', '#66ffff');
          $this->assertTrue(strpos($doc->getHTML(), '<b style="color:black;background-color:#66ffff">Document</b> body.') !== false);
      }

      /**
       * highlightExtended() delegates the markup of a match to a user callback
       * and passes the extra parameters through to it.
       */
      public function testHtmlExtendedHighlighting()
      {
          $doc = Zend_Search_Lucene_Document_Html::loadHTML('<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>');
          $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);

          $doc->highlightExtended('document',
                                  array('Zend_Search_Lucene_DocumentTest_DocHighlightingContainer',
                                        'extendedHighlightingCallback'),
                                  array('style="color:black;background-color:#ff66ff"',
                                        '(!!!)'));
          $this->assertTrue(strpos($doc->getHTML(), '<b style="color:black;background-color:#ff66ff">Document</b>(!!!) body.') !== false);
      }

      /**
       * highlight() accepts a list of words and highlights each of them.
       */
      public function testHtmlWordsHighlighting()
      {
          $doc = Zend_Search_Lucene_Document_Html::loadHTML('<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>');
          $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);

          $doc->highlight(array('document', 'body'), '#66ffff');
          $highlightedHTML = $doc->getHTML();
          $this->assertTrue(strpos($highlightedHTML, '<b style="color:black;background-color:#66ffff">Document</b>') !== false);
          $this->assertTrue(strpos($highlightedHTML, '<b style="color:black;background-color:#66ffff">body</b>') !== false);
      }

      /**
       * Malformed HTML returned by the highlighting callback ends up
       * well-formed in the resulting document.
       */
      public function testHtmlExtendedHighlightingCorrectWrongHtml()
      {
          $doc = Zend_Search_Lucene_Document_Html::loadHTML('<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>');
          $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);

          $doc->highlightExtended('document',
                                  array('Zend_Search_Lucene_DocumentTest_DocHighlightingContainer',
                                        'extendedHighlightingCallback'),
                                  array('style="color:black;background-color:#ff66ff"',
                                        '<h3>(!!!)' /* Wrong HTML here, <h3> tag is not closed */));
          $this->assertTrue(strpos($doc->getHTML(), '<b style="color:black;background-color:#ff66ff">Document</b><h3>(!!!)</h3> body.') !== false);
      }

      /**
       * Header links and body links are extracted from an HTML file.
       */
      public function testHtmlLinksProcessing()
      {
          $doc = Zend_Search_Lucene_Document_Html::loadHTMLFile(dirname(__FILE__) . '/_indexSource/_files/contributing.documentation.html', true);
          $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);

          // assertEquals() instead of assertTrue(a == b) so that a failure
          // reports which links differ.
          $this->assertEquals(
              array('index.html', 'contributing.html', 'contributing.bugs.html', 'contributing.wishlist.html'),
              array_values($doc->getHeaderLinks())
          );
          $this->assertEquals(
              array('contributing.bugs.html',
                    'contributing.wishlist.html',
                    'developers.documentation.html',
                    'faq.translators-revision-tracking.html',
                    'index.html',
                    'contributing.html'),
              array_values($doc->getLinks())
          );
      }

      /**
       * Text separated only by inline tags is indexed as one term, while
       * text separated by block tags is not.
       *
       * @group ZF-4252
       */
      public function testHtmlInlineTagsIndexing()
      {
          $indexDir = dirname(__FILE__) . '/_index/_files';

          try {
              $index = Zend_Search_Lucene::create($indexDir);

              $htmlString = '<html><head><title>Hello World</title></head>'
                          . '<body><b>Zend</b>Framework' . "\n" . ' <div>Foo</div>Bar ' . "\n"
                          . ' <strong>Test</strong></body></html>';

              $doc = Zend_Search_Lucene_Document_Html::loadHTML($htmlString);
              $index->addDocument($doc);

              $hits = $index->find('FooBar');
              $this->assertEquals(0, count($hits));

              $hits = $index->find('ZendFramework');
              $this->assertEquals(1, count($hits));
          } catch (Exception $e) {
              // No "finally" on the PHP versions this suite targets: clean the
              // temporary index on failure too, then let the failure propagate.
              unset($index);
              $this->_clearDirectory($indexDir);
              throw $e;
          }

          unset($index);
          $this->_clearDirectory($indexDir);
      }

      /**
       * Links declared through <area> tags are collected along with <a> links.
       *
       * @group ZF-8740
       */
      public function testHtmlAreaTags()
      {
          $html = '<HTML>'
                . '<HEAD><TITLE>Page title</TITLE></HEAD>'
                . '<BODY>'
                . 'Document body.'
                . '<img src="img.png" width="640" height="480" alt="some image" usemap="#some_map" />'
                . '<map name="some_map">'
                . '<area shape="rect" coords="0,0,100,100" href="link3.html" alt="Link 3" />'
                . '<area shape="rect" coords="200,200,300,300" href="link4.html" alt="Link 4" />'
                . '</map>'
                . '<a href="link1.html">Link 1</a>.'
                . '<a href="link2.html" rel="nofollow">Link 1</a>.'
                . '</BODY>'
                . '</HTML>';

          $oldNoFollowValue = Zend_Search_Lucene_Document_Html::getExcludeNoFollowLinks();
          Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(false);

          try {
              $doc1 = Zend_Search_Lucene_Document_Html::loadHTML($html);
              $this->assertTrue($doc1 instanceof Zend_Search_Lucene_Document_Html);

              $links = array('link1.html', 'link2.html', 'link3.html', 'link4.html');
              $this->assertEquals($links, array_values($doc1->getLinks()));
          } catch (Exception $e) {
              // The flag is static state: restore it even when the test fails.
              Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks($oldNoFollowValue);
              throw $e;
          }

          Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks($oldNoFollowValue);
      }

      /**
       * rel="nofollow" links are reported or dropped depending on the
       * exclude-nofollow setting.
       */
      public function testHtmlNoFollowLinks()
      {
          $html = '<HTML>'
                . '<HEAD><TITLE>Page title</TITLE></HEAD>'
                . '<BODY>'
                . 'Document body.'
                . '<a href="link1.html">Link 1</a>.'
                . '<a href="link2.html" rel="nofollow">Link 1</a>.'
                . '</BODY>'
                . '</HTML>';

          $oldNoFollowValue = Zend_Search_Lucene_Document_Html::getExcludeNoFollowLinks();

          try {
              Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(false);
              $doc1 = Zend_Search_Lucene_Document_Html::loadHTML($html);
              $this->assertTrue($doc1 instanceof Zend_Search_Lucene_Document_Html);
              $this->assertEquals(array('link1.html', 'link2.html'), array_values($doc1->getLinks()));

              Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(true);
              $doc2 = Zend_Search_Lucene_Document_Html::loadHTML($html);
              $this->assertTrue($doc2 instanceof Zend_Search_Lucene_Document_Html);
              $this->assertEquals(array('link1.html'), array_values($doc2->getLinks()));
          } catch (Exception $e) {
              // The flag is static state: restore it even when the test fails.
              Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks($oldNoFollowValue);
              throw $e;
          }

          Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks($oldNoFollowValue);
      }

      /**
       * A .docx file is loaded with title, description and body; a missing
       * file raises a "not readable" document exception.
       */
      public function testDocx()
      {
          if (!class_exists('ZipArchive')) {
              $this->markTestSkipped('ZipArchive class (Zip extension) is not loaded');
          }

          $docxDocument = Zend_Search_Lucene_Document_Docx::loadDocxFile(dirname(__FILE__) . '/_openXmlDocuments/test.docx', true);

          $this->assertTrue($docxDocument instanceof Zend_Search_Lucene_Document_Docx);
          $this->assertEquals('Test document', $docxDocument->getFieldValue('title'));
          $this->assertEquals('This is a test document which can be used to demonstrate something.', $docxDocument->getFieldValue('description'));
          $this->assertTrue($docxDocument->getFieldValue('body') != '');

          try {
              $docxDocument1 = Zend_Search_Lucene_Document_Docx::loadDocxFile(dirname(__FILE__) . '/_openXmlDocuments/dummy.docx', true);

              $this->fail('File not readable exception is expected.');
          } catch (Zend_Search_Lucene_Document_Exception $e) {
              if (strpos($e->getMessage(), 'is not readable') === false) {
                  // Passthrough exception
                  throw $e;
              }
          }
      }

      /**
       * A .pptx file is loaded with title, description and body.
       */
      public function testPptx()
      {
          if (!class_exists('ZipArchive')) {
              $this->markTestSkipped('ZipArchive class (Zip extension) is not loaded');
          }

          $pptxDocument = Zend_Search_Lucene_Document_Pptx::loadPptxFile(dirname(__FILE__) . '/_openXmlDocuments/test.pptx', true);

          $this->assertTrue($pptxDocument instanceof Zend_Search_Lucene_Document_Pptx);
          $this->assertEquals('Test document', $pptxDocument->getFieldValue('title'));
          $this->assertEquals('This is a test document which can be used to demonstrate something.', $pptxDocument->getFieldValue('description'));
          $this->assertTrue($pptxDocument->getFieldValue('body') != '');
      }

      /**
       * A .xlsx file is loaded with title, description and a body that
       * contains the spreadsheet text.
       */
      public function testXlsx()
      {
          if (!class_exists('ZipArchive')) {
              $this->markTestSkipped('ZipArchive class (Zip extension) is not loaded');
          }

          $xlsxDocument = Zend_Search_Lucene_Document_Xlsx::loadXlsxFile(dirname(__FILE__) . '/_openXmlDocuments/test.xlsx', true);

          $this->assertTrue($xlsxDocument instanceof Zend_Search_Lucene_Document_Xlsx);
          $this->assertEquals('Test document', $xlsxDocument->getFieldValue('title'));
          $this->assertEquals('This is a test document which can be used to demonstrate something.', $xlsxDocument->getFieldValue('description'));
          $this->assertTrue($xlsxDocument->getFieldValue('body') != '');
          $this->assertTrue(strpos($xlsxDocument->getFieldValue('body'), 'ipsum') !== false);
      }

      /**
       * Attributes on the root <HTML> tag do not prevent the title from
       * being extracted.
       *
       * @group ZF-10686
       */
      public function testLoadHtmlWithAttributesInTagHTML()
      {
          $doc = Zend_Search_Lucene_Document_Html::loadHTML('<HTML lang="en_US"><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>');

          $this->assertEquals('Page title ', $doc->title);
      }
  }
  class Zend_Search_Lucene_DocumentTest_DocHighlightingContainer {
      /**
       * Highlighting callback used by the highlightExtended() tests.
       *
       * Wraps the matched text in a <b> element carrying the given raw
       * attribute string and appends a trailing snippet after the element.
       *
       * @param string $stringToHighlight Matched text to wrap.
       * @param string $param1            Raw attribute text placed inside the opening <b> tag.
       * @param string $param2            Text appended right after the closing </b> tag.
       * @return string
       */
      public static function extendedHighlightingCallback($stringToHighlight, $param1, $param2)
      {
          return sprintf('<b %s>%s</b>%s', $param1, $stringToHighlight, $param2);
      }
  }