/application/libraries/Zend/Search/Lucene/Index/SegmentWriter/DocumentWriter.php

https://github.com/grandison/budo16 · PHP · 214 lines · 108 code · 35 blank · 71 comment · 18 complexity · fec1cb4b00f4b4bdd1b4ab84bddc5108 MD5 · raw file

  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Index
  18. * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id: DocumentWriter.php 16541 2009-07-07 06:59:03Z bkarwin $
  21. */
  22. /** Zend_Search_Lucene_Analysis_Analyzer */
  23. // require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
  24. /** Zend_Search_Lucene_Index_SegmentWriter */
  25. // require_once 'Zend/Search/Lucene/Index/SegmentWriter.php';
  26. /**
  27. * @category Zend
  28. * @package Zend_Search_Lucene
  29. * @subpackage Index
  30. * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
  31. * @license http://framework.zend.com/license/new-bsd New BSD License
  32. */
  class Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter extends Zend_Search_Lucene_Index_SegmentWriter
  {
      /**
       * Term dictionary.
       *
       * Maps a term key (as returned by Zend_Search_Lucene_Index_Term::key())
       * to the corresponding Zend_Search_Lucene_Index_Term object.
       *
       * @var array
       */
      protected $_termDictionary;

      /**
       * Postings collected for each term.
       *
       * Structure: term key => array(document number => list of term positions).
       * Keys are the same term keys used by $_termDictionary.
       *
       * @var array
       */
      protected $_termDocs;

      /**
       * Object constructor.
       *
       * Delegates to the parent segment writer and starts with an empty
       * term dictionary and empty postings.
       *
       * @param Zend_Search_Lucene_Storage_Directory $directory
       * @param string $name
       */
      public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
      {
          parent::__construct($directory, $name);

          $this->_termDocs       = array();
          $this->_termDictionary = array();
      }

      /**
       * Adds a document to this segment.
       *
       * Every field of the document is registered through addField(). Indexed
       * fields contribute term positions to the in-memory postings and one norm
       * byte for this document; stored fields are handed to addStoredFields().
       *
       * All fields are validated before any writer state is touched, so a
       * rejected document leaves no partial field registrations or postings
       * behind under the current document number.
       *
       * @param Zend_Search_Lucene_Document $document
       * @throws Zend_Search_Lucene_Exception if any field requests term vector storing
       */
      public function addDocument(Zend_Search_Lucene_Document $document)
      {
          // Pass 1: validation only. Nothing on $this is modified here.
          $fields = array();
          foreach ($document->getFieldNames() as $fieldName) {
              $field = $document->getField($fieldName);

              if ($field->storeTermVector) {
                  /**
                   * @todo term vector storing support
                   */
                  throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
              }

              $fields[] = $field;
          }

          $storedFields = array();
          $docNorms     = array();
          $similarity   = Zend_Search_Lucene_Search_Similarity::getDefault();

          // Pass 2: register fields, collect postings and per-field norms.
          foreach ($fields as $field) {
              $this->addField($field);

              if ($field->isIndexed) {
                  if ($field->isTokenized) {
                      $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
                      $analyzer->setInput($field->value, $field->encoding);

                      $position     = 0;
                      $tokenCounter = 0;
                      while (($token = $analyzer->nextToken()) !== null) {
                          $tokenCounter++;

                          // Positions accumulate the increments reported by the analyzer.
                          $position += $token->getPositionIncrement();

                          $this->_addTermPosition(
                              new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name),
                              $position
                          );
                      }
                  } else {
                      // Un-tokenized field: the whole value is a single term at position 0.
                      $tokenCounter = 1;

                      $this->_addTermPosition(
                          new Zend_Search_Lucene_Index_Term($field->getUtf8Value(), $field->name),
                          0
                      );
                  }

                  // One encoded norm byte per indexed field of this document.
                  $docNorms[$field->name] = chr($similarity->encodeNorm(
                      $similarity->lengthNorm($field->name, $tokenCounter)
                      * $document->boost
                      * $field->boost
                  ));
              }

              if ($field->isStored) {
                  $storedFields[] = $field;
              }
          }

          // Append exactly one norm byte for this document to every indexed field
          // known to the segment, including fields this document does not have.
          foreach ($this->_fields as $fieldName => $field) {
              if (!$field->isIndexed) {
                  continue;
              }

              if (!isset($this->_norms[$fieldName])) {
                  // Field first seen now: backfill norms for all earlier documents.
                  $this->_norms[$fieldName] = str_repeat(
                      chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0))),
                      $this->_docCount
                  );
              }

              if (isset($docNorms[$fieldName])) {
                  $this->_norms[$fieldName] .= $docNorms[$fieldName];
              } else {
                  $this->_norms[$fieldName] .= chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0)));
              }
          }

          $this->addStoredFields($storedFields);
      }

      /**
       * Records one occurrence of a term in the document currently being added.
       *
       * Creates the dictionary entry and the per-document position list on
       * first use, then appends the position to the list kept under the
       * current document number ($this->_docCount).
       *
       * @param Zend_Search_Lucene_Index_Term $term
       * @param integer $position
       */
      private function _addTermPosition(Zend_Search_Lucene_Index_Term $term, $position)
      {
          $termKey = $term->key();

          if (!isset($this->_termDictionary[$termKey])) {
              // New term
              $this->_termDictionary[$termKey] = $term;
              $this->_termDocs[$termKey]       = array();
          }

          if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
              // First occurrence of this term in the current document
              $this->_termDocs[$termKey][$this->_docCount] = array();
          }

          $this->_termDocs[$termKey][$this->_docCount][] = $position;
      }

      /**
       * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
       *
       * Terms are written in string order of their term keys, each together
       * with the postings collected for it.
       */
      protected function _dumpDictionary()
      {
          ksort($this->_termDictionary, SORT_STRING);

          $this->initializeDictionaryFiles();

          foreach ($this->_termDictionary as $termId => $term) {
              $this->addTerm($term, $this->_termDocs[$termId]);
          }

          $this->closeDictionaryFiles();
      }

      /**
       * Close segment, write it to disk and return segment info
       *
       * Nothing is written when no documents have been added; null is
       * returned in that case.
       *
       * @return Zend_Search_Lucene_Index_SegmentInfo|null
       */
      public function close()
      {
          if ($this->_docCount == 0) {
              return null;
          }

          $this->_dumpFNM();
          $this->_dumpDictionary();
          $this->_generateCFS();

          return new Zend_Search_Lucene_Index_SegmentInfo($this->_directory,
                                                          $this->_name,
                                                          $this->_docCount,
                                                          -1,
                                                          null,
                                                          true,
                                                          true);
      }
  }