PageRenderTime 47ms CodeModel.GetById 24ms RepoModel.GetById 1ms app.codeStats 0ms

/protected/vendors/Zend/Search/Lucene/Index/SegmentWriter/DocumentWriter.php

https://bitbucket.org/thomasvandiepen/uva-searchengines
PHP | 230 lines | 121 code | 37 blank | 72 comment | 24 complexity | 25cd7bcf4d256c65d82a02e8a1eae39d MD5 | raw file
Possible License(s): BSD-2-Clause, Apache-2.0, LGPL-2.1, BSD-3-Clause
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Index
  18. * @copyright Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id: DocumentWriter.php 23775 2011-03-01 17:25:24Z ralph $
  21. */
  22. /** Zend_Search_Lucene_Index_SegmentWriter */
  23. require_once 'Zend/Search/Lucene/Index/SegmentWriter.php';
  24. /**
  25. * @category Zend
  26. * @package Zend_Search_Lucene
  27. * @subpackage Index
  28. * @copyright Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
  29. * @license http://framework.zend.com/license/new-bsd New BSD License
  30. */
  class Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter extends Zend_Search_Lucene_Index_SegmentWriter
  {
      /**
       * Term dictionary.
       *
       * Maps a term key (as returned by Zend_Search_Lucene_Index_Term::key())
       * to the corresponding Zend_Search_Lucene_Index_Term object.
       *
       * @var array
       */
      protected $_termDictionary;

      /**
       * Postings collected for every term.
       *
       * Indexed first by term key and then by document number; each leaf is
       * the list of positions at which the term occurs in that document.
       *
       * @var array
       */
      protected $_termDocs;

      /**
       * Object constructor.
       *
       * Delegates to the parent constructor and starts with an empty term
       * dictionary and empty postings.
       *
       * @param Zend_Search_Lucene_Storage_Directory $directory
       * @param string $name
       */
      public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
      {
          parent::__construct($directory, $name);

          $this->_termDocs       = array();
          $this->_termDictionary = array();
      }

      /**
       * Adds a document to this segment.
       *
       * For every field of the document the indexed terms are collected into
       * the in-memory term dictionary/postings, a norm byte is computed for
       * fields that produced at least one term, and the field is registered
       * through addField(). Stored fields are finally handed to
       * addStoredFields().
       *
       * An indexed field that yields no terms (no tokens, or an empty value)
       * is registered as a non-indexed, non-tokenized copy of itself.
       *
       * @param Zend_Search_Lucene_Document $document
       * @throws Zend_Search_Lucene_Exception if a field requests term vector storing
       */
      public function addDocument(Zend_Search_Lucene_Document $document)
      {
          /** Zend_Search_Lucene_Search_Similarity */
          require_once 'Zend/Search/Lucene/Search/Similarity.php';

          $storedFields = array();
          $docNorms     = array();
          $similarity   = Zend_Search_Lucene_Search_Similarity::getDefault();

          foreach ($document->getFieldNames() as $fieldName) {
              $field = $document->getField($fieldName);

              if ($field->storeTermVector) {
                  /**
                   * @todo term vector storing support
                   */
                  require_once 'Zend/Search/Lucene/Exception.php';
                  throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
              }

              if ($field->isIndexed) {
                  if ($field->isTokenized) {
                      $termCount = $this->_indexTokenizedField($field);
                  } else {
                      $termCount = $this->_indexUntokenizedField($field);
                  }

                  if ($termCount == 0) {
                      // Field contains empty value. Treat it as non-indexed and non-tokenized.
                      // Work on a copy so the caller's field object is left untouched.
                      $field = clone $field;
                      $field->isIndexed = $field->isTokenized = false;
                  } else {
                      $docNorms[$field->name] = chr($similarity->encodeNorm(
                          $similarity->lengthNorm($field->name, $termCount)
                          * $document->boost
                          * $field->boost
                      ));
                  }
              }

              if ($field->isStored) {
                  $storedFields[] = $field;
              }

              $this->addField($field);
          }

          // Append one norm byte per indexed field known to the segment, so that
          // every norms string grows by exactly one byte for this document.
          foreach ($this->_fields as $knownFieldName => $knownField) {
              if (!$knownField->isIndexed) {
                  continue;
              }

              if (!isset($this->_norms[$knownFieldName])) {
                  // First time this field is seen as indexed: back-fill the
                  // "zero terms" norm for all documents counted so far.
                  $this->_norms[$knownFieldName] = str_repeat(
                      chr($similarity->encodeNorm($similarity->lengthNorm($knownFieldName, 0))),
                      $this->_docCount
                  );
              }

              if (isset($docNorms[$knownFieldName])) {
                  $this->_norms[$knownFieldName] .= $docNorms[$knownFieldName];
              } else {
                  // Field is not indexed in this document: use the "zero terms" norm.
                  $this->_norms[$knownFieldName] .= chr($similarity->encodeNorm(
                      $similarity->lengthNorm($knownFieldName, 0)
                  ));
              }
          }

          $this->addStoredFields($storedFields);
      }

      /**
       * Runs the default analyzer over a tokenized field and records every
       * produced token as a term occurrence of the current document.
       *
       * Positions are accumulated from the tokens' position increments.
       *
       * @param object $field field object as returned by Zend_Search_Lucene_Document::getField()
       * @return integer number of tokens produced by the analyzer (0 for an empty field)
       */
      private function _indexTokenizedField($field)
      {
          /** Zend_Search_Lucene_Analysis_Analyzer */
          require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

          $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
          $analyzer->setInput($field->value, $field->encoding);

          $position   = 0;
          $tokenCount = 0;
          while (($token = $analyzer->nextToken()) !== null) {
              $tokenCount++;
              $position += $token->getPositionIncrement();

              $this->_addTermPosition(
                  new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name),
                  $position
              );
          }

          return $tokenCount;
      }

      /**
       * Records the whole UTF-8 value of an un-tokenized field as a single
       * term at position 0 of the current document.
       *
       * The value is compared as a string, so a numeric zero value (integer 0
       * or "0") is a regular, indexable value on every PHP version; only a
       * value whose string form is empty is skipped.
       *
       * @param object $field field object as returned by Zend_Search_Lucene_Document::getField()
       * @return integer 1 if a term was recorded, 0 if the value is empty
       */
      private function _indexUntokenizedField($field)
      {
          $text = (string) $field->getUtf8Value();
          if ($text === '') {
              return 0;
          }

          $this->_addTermPosition(new Zend_Search_Lucene_Index_Term($text, $field->name), 0);

          return 1;
      }

      /**
       * Registers the term in the dictionary (if it is new) and appends one
       * position to its postings for the current document ($this->_docCount).
       *
       * @param Zend_Search_Lucene_Index_Term $term
       * @param integer $position
       */
      private function _addTermPosition(Zend_Search_Lucene_Index_Term $term, $position)
      {
          $termKey = $term->key();

          if (!isset($this->_termDictionary[$termKey])) {
              // New term
              $this->_termDictionary[$termKey] = $term;
              $this->_termDocs[$termKey]       = array();
          }

          if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
              // First occurrence of the term in the current document
              $this->_termDocs[$termKey][$this->_docCount] = array();
          }

          $this->_termDocs[$termKey][$this->_docCount][] = $position;
      }

      /**
       * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
       *
       * Terms are written in ascending string order of their keys.
       */
      protected function _dumpDictionary()
      {
          ksort($this->_termDictionary, SORT_STRING);

          $this->initializeDictionaryFiles();

          foreach ($this->_termDictionary as $termId => $term) {
              $this->addTerm($term, $this->_termDocs[$termId]);
          }

          $this->closeDictionaryFiles();
      }

      /**
       * Close segment, write it to disk and return segment info
       *
       * Nothing is written when no documents have been counted for the
       * segment; null is returned in that case.
       *
       * @return Zend_Search_Lucene_Index_SegmentInfo|null
       */
      public function close()
      {
          if ($this->_docCount == 0) {
              return null;
          }

          $this->_dumpFNM();
          $this->_dumpDictionary();
          $this->_generateCFS();

          /** Zend_Search_Lucene_Index_SegmentInfo */
          require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';

          return new Zend_Search_Lucene_Index_SegmentInfo($this->_directory,
                                                          $this->_name,
                                                          $this->_docCount,
                                                          -1,
                                                          null,
                                                          true,
                                                          true);
      }
  }