/strigi-0.7.7/libstreamanalyzer/plugins/indexers/clucenengindexer/cluceneindexwriter.cpp
C++ | 279 lines | 239 code | 15 blank | 25 comment | 31 complexity | ee8d00f6ae9e0bbf6da80dd30d00c654 MD5 | raw file
Possible License(s): LGPL-2.0
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20
21#include "cluceneindexwriter.h"
22#include "tcharutils.h"
23#include <CLucene.h>
24#include <CLucene/store/Lock.h>
25#include "cluceneindexreader.h"
26#include "cluceneindexmanager.h"
27#include <CLucene/search/PrefixQuery.h>
28#include <sstream>
29#include <assert.h>
30#include <iostream>
31
32using lucene::document::Document;
33using lucene::document::Field;
34using lucene::index::IndexWriter;
35using lucene::index::Term;
36using lucene::index::TermDocs;
37using lucene::search::BooleanQuery;
38using lucene::search::IndexSearcher;
39using lucene::search::Hits;
40using lucene::search::PrefixFilter;
41using lucene::search::TermQuery;
42using lucene::search::Query;
43using lucene::util::BitSet;
44
45using lucene::util::Reader;
46using namespace std;
47using namespace Strigi;
48
49struct CLuceneDocData {
50 lucene::document::Document doc;
51 std::string content;
52};
53
54CLuceneIndexWriter::CLuceneIndexWriter(CLuceneIndexManager* m):
55 manager(m), doccount(0) {
56 string contentID(FieldRegister::contentFieldName.c_str());
57 wstring cID(utf8toucs2(contentID));
58 addMapping(_T(""),cID.c_str());
59}
60CLuceneIndexWriter::~CLuceneIndexWriter() {
61}
62const wchar_t*
63CLuceneIndexWriter::systemlocation() {
64 const static wstring s(utf8toucs2(FieldRegister::pathFieldName));
65 return s.c_str();
66}
67namespace {
68const wchar_t*
69parentlocation() {
70 const static wstring s(utf8toucs2(FieldRegister::parentLocationFieldName));
71 return s.c_str();
72}
73}
74void
75CLuceneIndexWriter::addText(const AnalysisResult* idx, const char* text,
76 int32_t length) {
77 CLuceneDocData* doc = static_cast<CLuceneDocData*>(idx->writerData());
78 doc->content.append(text, length);
79}
80
81typedef map<wstring, wstring> CLuceneIndexWriterFieldMapType;
82CLuceneIndexWriterFieldMapType CLuceneIndexWriterFieldMap;
83
84void CLuceneIndexWriter::addMapping(const TCHAR* from, const TCHAR* to){
85 CLuceneIndexWriterFieldMap[from] = to;
86}
87const TCHAR*
88CLuceneIndexWriter::mapId(const TCHAR* id) {
89 if (id == 0) id = _T("");
90 CLuceneIndexWriterFieldMapType::iterator itr
91 = CLuceneIndexWriterFieldMap.find(id);
92 if (itr == CLuceneIndexWriterFieldMap.end()) {
93 return id;
94 } else {
95 return itr->second.c_str();
96 }
97}
98void
99CLuceneIndexWriter::addValue(const AnalysisResult* idx,
100 AnalyzerConfiguration::FieldType type, const TCHAR* name,
101 const TCHAR* value) {
102 CLuceneDocData* doc = static_cast<CLuceneDocData*>(idx->writerData());
103 int config = 0;
104 if ((type & AnalyzerConfiguration::Stored) == AnalyzerConfiguration::Stored){
105 config |= Field::STORE_YES;
106 } else {
107 config |= Field::STORE_NO;
108 }
109
110 if ((type & AnalyzerConfiguration::Indexed)
111 == AnalyzerConfiguration::Indexed) {
112 if ((type & AnalyzerConfiguration::Tokenized)
113 == AnalyzerConfiguration::Tokenized) {
114 config |= Field::INDEX_TOKENIZED;
115 } else {
116 config |= Field::INDEX_UNTOKENIZED;
117 }
118 } else {
119 config |= Field::INDEX_NO;
120 }
121
122 Field* field = new Field(name, value, config);
123 doc->doc.add(*field);
124}
125void
126CLuceneIndexWriter::addValue(const AnalysisResult* idx,
127 AnalyzerConfiguration::FieldType type, const TCHAR* fn,
128 const std::string& value) {
129 addValue(idx, type, CLuceneIndexWriter::mapId(fn), utf8toucs2(value).c_str());
130}
131void
132CLuceneIndexWriter::addValue(const Strigi::AnalysisResult* idx,
133 const Strigi::RegisteredField* field, const std::string& value) {
134 AnalyzerConfiguration::FieldType type
135 = idx->config().indexType(field);
136 if (type == AnalyzerConfiguration::None) return;
137 addValue(idx, type, utf8toucs2(field->key()).c_str(), value);
138}
139void
140CLuceneIndexWriter::addValue(const Strigi::AnalysisResult* idx,
141 const Strigi::RegisteredField* field, uint32_t value) {
142 ostringstream o;
143 o << value;
144 addValue(idx, field, o.str());
145}
146void
147CLuceneIndexWriter::addValue(const Strigi::AnalysisResult* idx,
148 const Strigi::RegisteredField* field, int32_t value) {
149 ostringstream o;
150 o << value;
151 addValue(idx, field, o.str());
152}
153void
154CLuceneIndexWriter::addValue(const Strigi::AnalysisResult* idx,
155 const Strigi::RegisteredField* field,
156 const unsigned char* data, uint32_t size) {
157 addValue(idx, field, string((const char*)data, (string::size_type)size));
158}
159void
160CLuceneIndexWriter::addValue(const Strigi::AnalysisResult* idx,
161 const Strigi::RegisteredField* field, double value) {
162 ostringstream o;
163 o << value;
164 addValue(idx, field, o.str());
165}
166void
167CLuceneIndexWriter::startAnalysis(const AnalysisResult* idx) {
168 doccount++;
169 CLuceneDocData*doc = new CLuceneDocData();
170 idx->setWriterData(doc);
171}
172/*
173 Close all left open indexwriters for this path.
174*/
175void
176CLuceneIndexWriter::finishAnalysis(const AnalysisResult* idx) {
177 CLuceneDocData* doc = static_cast<CLuceneDocData*>(idx->writerData());
178 wstring c(utf8toucs2(doc->content));
179
180 if (doc->content.length() > 0) {
181 const TCHAR* mappedFn = mapId(_T(""));
182
183 // add the stored field as compressed and indexed
184 doc->doc.add(*new Field(mappedFn, c.c_str(), Field::STORE_YES | Field::STORE_COMPRESS | Field::INDEX_TOKENIZED));
185 }
186 lucene::index::IndexWriter* writer = manager->refWriter();
187 if (writer) {
188 try {
189 writer->addDocument(&doc->doc);
190 fprintf(stderr, "added %s\n", idx->path().c_str());
191 } catch (CLuceneError& err) {
192 fprintf(stderr, "%s: %s\n", idx->path().c_str(), err.what());
193 }
194 }
195 manager->derefWriter();
196 delete doc;
197}
198void
199CLuceneIndexWriter::deleteEntries(const std::vector<std::string>& entries) {
200 // make sure the index reader is up to date
201 lucene::index::IndexReader* reader = manager->checkReader(true);
202 if (reader == NULL) {
203 fprintf(stderr,"cannot delete entry: lucene reader cannot be opened\n");
204 return;
205 }
206
207 lucene::index::IndexWriter* writer = manager->refWriter();
208
209 for (uint i=0; i<entries.size(); ++i) {
210 deleteEntry(entries[i], writer, reader);
211 }
212 writer->flush();
213 reader->commit();
214
215 manager->derefWriter();
216}
217void
218CLuceneIndexWriter::deleteEntry(const string& entry, lucene::index::IndexWriter* writer, lucene::index::IndexReader* reader) {
219 wstring path(utf8toucs2(entry));
220{
221 Term* t = _CLNEW Term(systemlocation(), path.c_str());
222 writer->deleteDocuments(t);
223 _CLDECDELETE(t);
224}
225{
226 Term* t = _CLNEW Term(parentlocation(), path.c_str());
227 writer->deleteDocuments(t);
228 _CLDECDELETE(t);
229}
230{
231 // delete all deeper nested files
232 wstring v = utf8toucs2(entry+"/");
233 Term* t(_CLNEW Term(parentlocation(), v.c_str()));
234 PrefixFilter* filter = _CLNEW PrefixFilter(t);
235 BitSet* b = filter->bits(reader);
236 _CLDELETE(filter);
237 int32_t size = b->size();
238 for (int id = 0; id < size; ++id) {
239 if (b->get(id) && !reader->isDeleted(id)) {
240 reader->deleteDocument(id);
241 }
242 }
243 _CLDELETE(b);
244 _CLDECDELETE(t);
245}
246}
247void
248CLuceneIndexWriter::deleteAllEntries() {
249 lucene::index::IndexReader* reader = manager->checkReader();
250 if ( reader != NULL ){
251 for ( int32_t i=0;i<reader->maxDoc();i++ ){
252 reader->deleteDocument(i);
253 }
254 reader->flush();
255 }
256}
257void
258CLuceneIndexWriter::commit() {
259 lucene::index::IndexWriter* writer = manager->refWriter();
260 writer->flush();
261 manager->derefWriter();
262}
263
264void
265CLuceneIndexWriter::initWriterData(const FieldRegister& f) {
266 map<string, RegisteredField*>::const_iterator i;
267 map<string, RegisteredField*>::const_iterator end = f.fields().end();
268 for (i = f.fields().begin(); i != end; ++i) {
269 i->second->setWriterData(0);
270 }
271}
272void
273CLuceneIndexWriter::releaseWriterData(const FieldRegister& f) {
274 map<string, RegisteredField*>::const_iterator i;
275 map<string, RegisteredField*>::const_iterator end = f.fields().end();
276 for (i = f.fields().begin(); i != end; ++i) {
277 delete static_cast<int*>(i->second->writerData());
278 }
279}