PageRenderTime 51ms CodeModel.GetById 16ms RepoModel.GetById 1ms app.codeStats 0ms

/strigi-0.7.7/libstreamanalyzer/plugins/indexers/clucenengindexer/cluceneindexwriter.cpp

#
C++ | 279 lines | 239 code | 15 blank | 25 comment | 31 complexity | ee8d00f6ae9e0bbf6da80dd30d00c654 MD5 | raw file
Possible License(s): LGPL-2.0
  1. /* This file is part of Strigi Desktop Search
  2. *
  3. * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
  4. *
  5. * This library is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU Library General Public
  7. * License as published by the Free Software Foundation; either
  8. * version 2 of the License, or (at your option) any later version.
  9. *
  10. * This library is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Library General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Library General Public License
  16. * along with this library; see the file COPYING.LIB. If not, write to
  17. * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  18. * Boston, MA 02110-1301, USA.
  19. */
  20. #include "cluceneindexwriter.h"
  21. #include "tcharutils.h"
  22. #include <CLucene.h>
  23. #include <CLucene/store/Lock.h>
  24. #include "cluceneindexreader.h"
  25. #include "cluceneindexmanager.h"
  26. #include <CLucene/search/PrefixQuery.h>
  27. #include <sstream>
  28. #include <assert.h>
  29. #include <iostream>
  30. using lucene::document::Document;
  31. using lucene::document::Field;
  32. using lucene::index::IndexWriter;
  33. using lucene::index::Term;
  34. using lucene::index::TermDocs;
  35. using lucene::search::BooleanQuery;
  36. using lucene::search::IndexSearcher;
  37. using lucene::search::Hits;
  38. using lucene::search::PrefixFilter;
  39. using lucene::search::TermQuery;
  40. using lucene::search::Query;
  41. using lucene::util::BitSet;
  42. using lucene::util::Reader;
  43. using namespace std;
  44. using namespace Strigi;
  45. struct CLuceneDocData {
  46. lucene::document::Document doc;
  47. std::string content;
  48. };
  49. CLuceneIndexWriter::CLuceneIndexWriter(CLuceneIndexManager* m):
  50. manager(m), doccount(0) {
  51. string contentID(FieldRegister::contentFieldName.c_str());
  52. wstring cID(utf8toucs2(contentID));
  53. addMapping(_T(""),cID.c_str());
  54. }
  55. CLuceneIndexWriter::~CLuceneIndexWriter() {
  56. }
  57. const wchar_t*
  58. CLuceneIndexWriter::systemlocation() {
  59. const static wstring s(utf8toucs2(FieldRegister::pathFieldName));
  60. return s.c_str();
  61. }
  62. namespace {
  63. const wchar_t*
  64. parentlocation() {
  65. const static wstring s(utf8toucs2(FieldRegister::parentLocationFieldName));
  66. return s.c_str();
  67. }
  68. }
  69. void
  70. CLuceneIndexWriter::addText(const AnalysisResult* idx, const char* text,
  71. int32_t length) {
  72. CLuceneDocData* doc = static_cast<CLuceneDocData*>(idx->writerData());
  73. doc->content.append(text, length);
  74. }
  // Table filled by addMapping() and consulted by mapId(): maps a field id
  // to the field name that should be used in its place.
  typedef std::map<std::wstring, std::wstring> CLuceneIndexWriterFieldMapType;
  CLuceneIndexWriterFieldMapType CLuceneIndexWriterFieldMap;
  77. void CLuceneIndexWriter::addMapping(const TCHAR* from, const TCHAR* to){
  78. CLuceneIndexWriterFieldMap[from] = to;
  79. }
  80. const TCHAR*
  81. CLuceneIndexWriter::mapId(const TCHAR* id) {
  82. if (id == 0) id = _T("");
  83. CLuceneIndexWriterFieldMapType::iterator itr
  84. = CLuceneIndexWriterFieldMap.find(id);
  85. if (itr == CLuceneIndexWriterFieldMap.end()) {
  86. return id;
  87. } else {
  88. return itr->second.c_str();
  89. }
  90. }
  91. void
  92. CLuceneIndexWriter::addValue(const AnalysisResult* idx,
  93. AnalyzerConfiguration::FieldType type, const TCHAR* name,
  94. const TCHAR* value) {
  95. CLuceneDocData* doc = static_cast<CLuceneDocData*>(idx->writerData());
  96. int config = 0;
  97. if ((type & AnalyzerConfiguration::Stored) == AnalyzerConfiguration::Stored){
  98. config |= Field::STORE_YES;
  99. } else {
  100. config |= Field::STORE_NO;
  101. }
  102. if ((type & AnalyzerConfiguration::Indexed)
  103. == AnalyzerConfiguration::Indexed) {
  104. if ((type & AnalyzerConfiguration::Tokenized)
  105. == AnalyzerConfiguration::Tokenized) {
  106. config |= Field::INDEX_TOKENIZED;
  107. } else {
  108. config |= Field::INDEX_UNTOKENIZED;
  109. }
  110. } else {
  111. config |= Field::INDEX_NO;
  112. }
  113. Field* field = new Field(name, value, config);
  114. doc->doc.add(*field);
  115. }
  116. void
  117. CLuceneIndexWriter::addValue(const AnalysisResult* idx,
  118. AnalyzerConfiguration::FieldType type, const TCHAR* fn,
  119. const std::string& value) {
  120. addValue(idx, type, CLuceneIndexWriter::mapId(fn), utf8toucs2(value).c_str());
  121. }
  122. void
  123. CLuceneIndexWriter::addValue(const Strigi::AnalysisResult* idx,
  124. const Strigi::RegisteredField* field, const std::string& value) {
  125. AnalyzerConfiguration::FieldType type
  126. = idx->config().indexType(field);
  127. if (type == AnalyzerConfiguration::None) return;
  128. addValue(idx, type, utf8toucs2(field->key()).c_str(), value);
  129. }
  130. void
  131. CLuceneIndexWriter::addValue(const Strigi::AnalysisResult* idx,
  132. const Strigi::RegisteredField* field, uint32_t value) {
  133. ostringstream o;
  134. o << value;
  135. addValue(idx, field, o.str());
  136. }
  137. void
  138. CLuceneIndexWriter::addValue(const Strigi::AnalysisResult* idx,
  139. const Strigi::RegisteredField* field, int32_t value) {
  140. ostringstream o;
  141. o << value;
  142. addValue(idx, field, o.str());
  143. }
  144. void
  145. CLuceneIndexWriter::addValue(const Strigi::AnalysisResult* idx,
  146. const Strigi::RegisteredField* field,
  147. const unsigned char* data, uint32_t size) {
  148. addValue(idx, field, string((const char*)data, (string::size_type)size));
  149. }
  150. void
  151. CLuceneIndexWriter::addValue(const Strigi::AnalysisResult* idx,
  152. const Strigi::RegisteredField* field, double value) {
  153. ostringstream o;
  154. o << value;
  155. addValue(idx, field, o.str());
  156. }
  157. void
  158. CLuceneIndexWriter::startAnalysis(const AnalysisResult* idx) {
  159. doccount++;
  160. CLuceneDocData*doc = new CLuceneDocData();
  161. idx->setWriterData(doc);
  162. }
  163. /*
  164. Close all left open indexwriters for this path.
  165. */
  166. void
  167. CLuceneIndexWriter::finishAnalysis(const AnalysisResult* idx) {
  168. CLuceneDocData* doc = static_cast<CLuceneDocData*>(idx->writerData());
  169. wstring c(utf8toucs2(doc->content));
  170. if (doc->content.length() > 0) {
  171. const TCHAR* mappedFn = mapId(_T(""));
  172. // add the stored field as compressed and indexed
  173. doc->doc.add(*new Field(mappedFn, c.c_str(), Field::STORE_YES | Field::STORE_COMPRESS | Field::INDEX_TOKENIZED));
  174. }
  175. lucene::index::IndexWriter* writer = manager->refWriter();
  176. if (writer) {
  177. try {
  178. writer->addDocument(&doc->doc);
  179. fprintf(stderr, "added %s\n", idx->path().c_str());
  180. } catch (CLuceneError& err) {
  181. fprintf(stderr, "%s: %s\n", idx->path().c_str(), err.what());
  182. }
  183. }
  184. manager->derefWriter();
  185. delete doc;
  186. }
  187. void
  188. CLuceneIndexWriter::deleteEntries(const std::vector<std::string>& entries) {
  189. // make sure the index reader is up to date
  190. lucene::index::IndexReader* reader = manager->checkReader(true);
  191. if (reader == NULL) {
  192. fprintf(stderr,"cannot delete entry: lucene reader cannot be opened\n");
  193. return;
  194. }
  195. lucene::index::IndexWriter* writer = manager->refWriter();
  196. for (uint i=0; i<entries.size(); ++i) {
  197. deleteEntry(entries[i], writer, reader);
  198. }
  199. writer->flush();
  200. reader->commit();
  201. manager->derefWriter();
  202. }
  203. void
  204. CLuceneIndexWriter::deleteEntry(const string& entry, lucene::index::IndexWriter* writer, lucene::index::IndexReader* reader) {
  205. wstring path(utf8toucs2(entry));
  206. {
  207. Term* t = _CLNEW Term(systemlocation(), path.c_str());
  208. writer->deleteDocuments(t);
  209. _CLDECDELETE(t);
  210. }
  211. {
  212. Term* t = _CLNEW Term(parentlocation(), path.c_str());
  213. writer->deleteDocuments(t);
  214. _CLDECDELETE(t);
  215. }
  216. {
  217. // delete all deeper nested files
  218. wstring v = utf8toucs2(entry+"/");
  219. Term* t(_CLNEW Term(parentlocation(), v.c_str()));
  220. PrefixFilter* filter = _CLNEW PrefixFilter(t);
  221. BitSet* b = filter->bits(reader);
  222. _CLDELETE(filter);
  223. int32_t size = b->size();
  224. for (int id = 0; id < size; ++id) {
  225. if (b->get(id) && !reader->isDeleted(id)) {
  226. reader->deleteDocument(id);
  227. }
  228. }
  229. _CLDELETE(b);
  230. _CLDECDELETE(t);
  231. }
  232. }
  233. void
  234. CLuceneIndexWriter::deleteAllEntries() {
  235. lucene::index::IndexReader* reader = manager->checkReader();
  236. if ( reader != NULL ){
  237. for ( int32_t i=0;i<reader->maxDoc();i++ ){
  238. reader->deleteDocument(i);
  239. }
  240. reader->flush();
  241. }
  242. }
  243. void
  244. CLuceneIndexWriter::commit() {
  245. lucene::index::IndexWriter* writer = manager->refWriter();
  246. writer->flush();
  247. manager->derefWriter();
  248. }
  249. void
  250. CLuceneIndexWriter::initWriterData(const FieldRegister& f) {
  251. map<string, RegisteredField*>::const_iterator i;
  252. map<string, RegisteredField*>::const_iterator end = f.fields().end();
  253. for (i = f.fields().begin(); i != end; ++i) {
  254. i->second->setWriterData(0);
  255. }
  256. }
  257. void
  258. CLuceneIndexWriter::releaseWriterData(const FieldRegister& f) {
  259. map<string, RegisteredField*>::const_iterator i;
  260. map<string, RegisteredField*>::const_iterator end = f.fields().end();
  261. for (i = f.fields().begin(); i != end; ++i) {
  262. delete static_cast<int*>(i->second->writerData());
  263. }
  264. }