PageRenderTime 55ms CodeModel.GetById 31ms RepoModel.GetById 0ms app.codeStats 0ms

/library/Zend/Search/Lucene/Index/SegmentWriter/DocumentWriter.php

https://bitbucket.org/Ebozavrik/test-application
PHP | 231 lines | 121 code | 37 blank | 73 comment | 24 complexity | c103c782967e685a1951696ecc0e2952 MD5 | raw file
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Index
  18. * @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id: DocumentWriter.php 24593 2012-01-05 20:35:02Z matthew $
  21. */
  22. /** Zend_Search_Lucene_Index_SegmentWriter */
  23. require_once 'Zend/Search/Lucene/Index/SegmentWriter.php';
  /**
   * Segment writer that collects the terms of added documents in memory
   * (term dictionary plus per-term postings) and dumps them when the segment
   * is closed.
   *
   * @category   Zend
   * @package    Zend_Search_Lucene
   * @subpackage Index
   * @copyright  Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
   * @license    http://framework.zend.com/license/new-bsd     New BSD License
   */
  class Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter extends Zend_Search_Lucene_Index_SegmentWriter
  {
      /**
       * Term Dictionary
       *
       * Maps a term key (Zend_Search_Lucene_Index_Term::key()) to its
       * Zend_Search_Lucene_Index_Term object. The postings of each term are
       * kept under the same key in $_termDocs.
       *
       * @var array
       */
      protected $_termDictionary;

      /**
       * Documents, which contain the term
       *
       * Layout: term key => array(document number => array of positions).
       *
       * @var array
       */
      protected $_termDocs;

      /**
       * Object constructor.
       *
       * @param Zend_Search_Lucene_Storage_Directory $directory
       * @param string $name
       */
      public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
      {
          parent::__construct($directory, $name);

          $this->_termDocs       = array();
          $this->_termDictionary = array();
      }

      /**
       * Adds a document to this segment.
       *
       * Every field of the document is validated before anything is written to
       * the in-memory dictionary, so a rejected document leaves the writer
       * untouched instead of leaving stray postings under the current document
       * number.
       *
       * @param Zend_Search_Lucene_Document $document
       *
       * @throws Zend_Search_Lucene_Exception if any field requests term vector storing
       */
      public function addDocument(Zend_Search_Lucene_Document $document)
      {
          /** Zend_Search_Lucene_Search_Similarity */
          require_once 'Zend/Search/Lucene/Search/Similarity.php';

          // Pass 1: collect and validate the fields without touching writer state.
          $fields = array();
          foreach ($document->getFieldNames() as $fieldName) {
              $field = $document->getField($fieldName);

              if ($field->storeTermVector) {
                  /**
                   * @todo term vector storing support
                   */
                  require_once 'Zend/Search/Lucene/Exception.php';
                  throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
              }

              $fields[] = $field;
          }

          $storedFields = array();
          $docNorms     = array();
          $similarity   = Zend_Search_Lucene_Search_Similarity::getDefault();

          // Pass 2: index the fields.
          foreach ($fields as $field) {
              if ($field->isIndexed) {
                  if ($field->isTokenized) {
                      /** Zend_Search_Lucene_Analysis_Analyzer */
                      require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

                      $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
                      $analyzer->setInput($field->value, $field->encoding);

                      $position     = 0;
                      $tokenCounter = 0;
                      while (($token = $analyzer->nextToken()) !== null) {
                          $tokenCounter++;

                          // Positions accumulate the increments reported by the analyzer.
                          $position += $token->getPositionIncrement();
                          $this->_addTermPosition(
                              new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name),
                              $position
                          );
                      }

                      if ($tokenCounter == 0) {
                          // Field contains empty value. Treat it as non-indexed and non-tokenized.
                          // Work on a copy so the caller's field object is not modified.
                          $field = clone($field);
                          $field->isIndexed = $field->isTokenized = false;
                      } else {
                          $docNorms[$field->name] = chr($similarity->encodeNorm(
                              $similarity->lengthNorm($field->name, $tokenCounter)
                              * $document->boost
                              * $field->boost
                          ));
                      }
                  } else if (($fieldUtf8Value = $field->getUtf8Value()) == '') {
                      // Field contains empty value. Treat it as non-indexed and non-tokenized.
                      // (Loose comparison on purpose: any empty-ish value counts as empty.)
                      $field = clone($field);
                      $field->isIndexed = $field->isTokenized = false;
                  } else {
                      // Un-tokenized field: the whole value is a single term at position 0.
                      $this->_addTermPosition(
                          new Zend_Search_Lucene_Index_Term($fieldUtf8Value, $field->name),
                          0
                      );

                      $docNorms[$field->name] = chr($similarity->encodeNorm(
                          $similarity->lengthNorm($field->name, 1)
                          * $document->boost
                          * $field->boost
                      ));
                  }
              }

              if ($field->isStored) {
                  $storedFields[] = $field;
              }

              $this->addField($field);
          }

          // Extend the norms string of every indexed field known to the segment by
          // exactly one byte for this document.
          foreach ($this->_fields as $fieldName => $fieldInfo) {
              if (!$fieldInfo->isIndexed) {
                  continue;
              }

              if (!isset($this->_norms[$fieldName])) {
                  // Field seen for the first time: back-fill the norm of an empty
                  // field for all documents added before this one.
                  $this->_norms[$fieldName] = str_repeat(
                      chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0))),
                      $this->_docCount
                  );
              }

              if (isset($docNorms[$fieldName])) {
                  $this->_norms[$fieldName] .= $docNorms[$fieldName];
              } else {
                  // This document has no indexed value for the field.
                  $this->_norms[$fieldName] .= chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0)));
              }
          }

          // NOTE(review): presumably this is where the parent advances $_docCount - confirm.
          $this->addStoredFields($storedFields);
      }

      /**
       * Records one occurrence of a term in the document that is currently being added.
       *
       * The term is registered in the dictionary on first sight; the position is
       * appended to the term's posting list for document number $this->_docCount.
       *
       * @param Zend_Search_Lucene_Index_Term $term
       * @param integer $position
       */
      private function _addTermPosition(Zend_Search_Lucene_Index_Term $term, $position)
      {
          $termKey = $term->key();

          if (!isset($this->_termDictionary[$termKey])) {
              // New term
              $this->_termDictionary[$termKey] = $term;
              $this->_termDocs[$termKey]       = array();
          }

          // The per-document position list is created on first write.
          $this->_termDocs[$termKey][$this->_docCount][] = $position;
      }

      /**
       * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
       */
      protected function _dumpDictionary()
      {
          // Terms are written in string order of their keys.
          ksort($this->_termDictionary, SORT_STRING);

          $this->initializeDictionaryFiles();

          foreach ($this->_termDictionary as $termId => $term) {
              $this->addTerm($term, $this->_termDocs[$termId]);
          }

          $this->closeDictionaryFiles();
      }

      /**
       * Close segment, write it to disk and return segment info
       *
       * Nothing is written when no document has been added.
       *
       * @return Zend_Search_Lucene_Index_SegmentInfo|null  null if the segment is empty
       */
      public function close()
      {
          if ($this->_docCount == 0) {
              return null;
          }

          $this->_dumpFNM();
          $this->_dumpDictionary();
          $this->_generateCFS();

          /** Zend_Search_Lucene_Index_SegmentInfo */
          require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';

          // NOTE(review): the trailing arguments (-1, null, true, true) are passed
          // positionally; check their meaning against the SegmentInfo constructor.
          return new Zend_Search_Lucene_Index_SegmentInfo(
              $this->_directory,
              $this->_name,
              $this->_docCount,
              -1,
              null,
              true,
              true
          );
      }
  }