/library/Zend/Search/Lucene/Index/SegmentWriter/DocumentWriter.php

https://github.com/Exercise/zf2 · PHP · 236 lines · 123 code · 34 blank · 79 comment · 24 complexity · 0a862d79a86ad65264928eba73f7d318 MD5 · raw file

  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Index
  18. * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id$
  21. */
  22. /**
  23. * @namespace
  24. */
  25. namespace Zend\Search\Lucene\Index\SegmentWriter;
  26. use Zend\Search\Lucene;
  27. use Zend\Search\Lucene\Index;
  28. use Zend\Search\Lucene\Analysis\Analyzer;
  29. use Zend\Search\Lucene\Storage\Directory;
  30. use Zend\Search\Lucene\Document;
  31. use Zend\Search\Lucene\Search\Similarity;
  32. /**
  33. * @uses \Zend\Search\Lucene\Analysis\Analyzer
  34. * @uses \Zend\Search\Lucene\Exception
  35. * @uses \Zend\Search\Lucene\Index\SegmentInfo
  36. * @uses \Zend\Search\Lucene\Index\SegmentWriter\AbstractSegmentWriter
  37. * @uses \Zend\Search\Lucene\Index\Term
  38. * @uses \Zend\Search\Lucene\Search\Similarity
  39. * @uses \Zend\Search\Lucene\Storage\Directory
  40. * @uses \Zend\Search\Lucene\Document;
  41. * @category Zend
  42. * @package Zend_Search_Lucene
  43. * @subpackage Index
  44. * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
  45. * @license http://framework.zend.com/license/new-bsd New BSD License
  46. */
  47. class DocumentWriter extends AbstractSegmentWriter
  48. {
  49. /**
  50. * Term Dictionary
  51. * Array of the Zend_Search_Lucene_Index_Term objects
  52. * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos
  53. *
  54. * @var array
  55. */
  56. protected $_termDictionary;
  57. /**
  58. * Documents, which contain the term
  59. *
  60. * @var array
  61. */
  62. protected $_termDocs;
  63. /**
  64. * Object constructor.
  65. *
  66. * @param \Zend\Search\Lucene\Storage\Directory $directory
  67. * @param string $name
  68. */
  69. public function __construct(Directory $directory, $name)
  70. {
  71. parent::__construct($directory, $name);
  72. $this->_termDocs = array();
  73. $this->_termDictionary = array();
  74. }
  75. /**
  76. * Adds a document to this segment.
  77. *
  78. * @param \Zend\Search\Lucene\Document $document
  79. * @throws \Zend\Search\Lucene\Exception
  80. */
  81. public function addDocument(Document $document)
  82. {
  83. $storedFields = array();
  84. $docNorms = array();
  85. $similarity = Similarity::getDefault();
  86. foreach ($document->getFieldNames() as $fieldName) {
  87. $field = $document->getField($fieldName);
  88. if ($field->storeTermVector) {
  89. /**
  90. * @todo term vector storing support
  91. */
  92. throw new Lucene\Exception('Store term vector functionality is not supported yet.');
  93. }
  94. if ($field->isIndexed) {
  95. if ($field->isTokenized) {
  96. $analyzer = Analyzer\Analyzer::getDefault();
  97. $analyzer->setInput($field->value, $field->encoding);
  98. $position = 0;
  99. $tokenCounter = 0;
  100. while (($token = $analyzer->nextToken()) !== null) {
  101. $tokenCounter++;
  102. $term = new Index\Term($token->getTermText(), $field->name);
  103. $termKey = $term->key();
  104. if (!isset($this->_termDictionary[$termKey])) {
  105. // New term
  106. $this->_termDictionary[$termKey] = $term;
  107. $this->_termDocs[$termKey] = array();
  108. $this->_termDocs[$termKey][$this->_docCount] = array();
  109. } else if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
  110. // Existing term, but new term entry
  111. $this->_termDocs[$termKey][$this->_docCount] = array();
  112. }
  113. $position += $token->getPositionIncrement();
  114. $this->_termDocs[$termKey][$this->_docCount][] = $position;
  115. }
  116. if ($tokenCounter == 0) {
  117. // Field contains empty value. Treat it as non-indexed and non-tokenized
  118. $field = clone($field);
  119. $field->isIndexed = $field->isTokenized = false;
  120. } else {
  121. $docNorms[$field->name] = chr($similarity->encodeNorm( $similarity->lengthNorm($field->name,
  122. $tokenCounter)*
  123. $document->boost*
  124. $field->boost ));
  125. }
  126. } else if (($fieldUtf8Value = $field->getUtf8Value()) == '') {
  127. // Field contains empty value. Treat it as non-indexed and non-tokenized
  128. $field = clone($field);
  129. $field->isIndexed = $field->isTokenized = false;
  130. } else {
  131. $term = new Index\Term($fieldUtf8Value, $field->name);
  132. $termKey = $term->key();
  133. if (!isset($this->_termDictionary[$termKey])) {
  134. // New term
  135. $this->_termDictionary[$termKey] = $term;
  136. $this->_termDocs[$termKey] = array();
  137. $this->_termDocs[$termKey][$this->_docCount] = array();
  138. } else if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
  139. // Existing term, but new term entry
  140. $this->_termDocs[$termKey][$this->_docCount] = array();
  141. }
  142. $this->_termDocs[$termKey][$this->_docCount][] = 0; // position
  143. $docNorms[$field->name] = chr($similarity->encodeNorm( $similarity->lengthNorm($field->name, 1)*
  144. $document->boost*
  145. $field->boost ));
  146. }
  147. }
  148. if ($field->isStored) {
  149. $storedFields[] = $field;
  150. }
  151. $this->addField($field);
  152. }
  153. foreach ($this->_fields as $fieldName => $field) {
  154. if (!$field->isIndexed) {
  155. continue;
  156. }
  157. if (!isset($this->_norms[$fieldName])) {
  158. $this->_norms[$fieldName] = str_repeat(chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) )),
  159. $this->_docCount);
  160. }
  161. if (isset($docNorms[$fieldName])){
  162. $this->_norms[$fieldName] .= $docNorms[$fieldName];
  163. } else {
  164. $this->_norms[$fieldName] .= chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) ));
  165. }
  166. }
  167. $this->addStoredFields($storedFields);
  168. }
  169. /**
  170. * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
  171. */
  172. protected function _dumpDictionary()
  173. {
  174. ksort($this->_termDictionary, SORT_STRING);
  175. $this->initializeDictionaryFiles();
  176. foreach ($this->_termDictionary as $termId => $term) {
  177. $this->addTerm($term, $this->_termDocs[$termId]);
  178. }
  179. $this->closeDictionaryFiles();
  180. }
  181. /**
  182. * Close segment, write it to disk and return segment info
  183. *
  184. * @return \Zend\Search\Lucene\Index\SegmentInfo
  185. */
  186. public function close()
  187. {
  188. if ($this->_docCount == 0) {
  189. return null;
  190. }
  191. $this->_dumpFNM();
  192. $this->_dumpDictionary();
  193. $this->_generateCFS();
  194. return new Index\SegmentInfo($this->_directory,
  195. $this->_name,
  196. $this->_docCount,
  197. -1,
  198. null,
  199. true,
  200. true);
  201. }
  202. }