PageRenderTime 80ms CodeModel.GetById 12ms app.highlight 61ms RepoModel.GetById 0ms app.codeStats 0ms

/strigi-0.7.7/libstreamanalyzer/plugins/indexers/clucenengindexer/cluceneindexwriter.cpp

#
C++ | 279 lines | 239 code | 15 blank | 25 comment | 31 complexity | ee8d00f6ae9e0bbf6da80dd30d00c654 MD5 | raw file
Possible License(s): LGPL-2.0
  1/* This file is part of Strigi Desktop Search
  2 *
  3 * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
  4 *
  5 * This library is free software; you can redistribute it and/or
  6 * modify it under the terms of the GNU Library General Public
  7 * License as published by the Free Software Foundation; either
  8 * version 2 of the License, or (at your option) any later version.
  9 *
 10 * This library is distributed in the hope that it will be useful,
 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 13 * Library General Public License for more details.
 14 *
 15 * You should have received a copy of the GNU Library General Public License
 16 * along with this library; see the file COPYING.LIB.  If not, write to
 17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 18 * Boston, MA 02110-1301, USA.
 19 */
 20
 21#include "cluceneindexwriter.h"
 22#include "tcharutils.h"
 23#include <CLucene.h>
 24#include <CLucene/store/Lock.h>
 25#include "cluceneindexreader.h"
 26#include "cluceneindexmanager.h"
 27#include <CLucene/search/PrefixQuery.h>
 28#include <sstream>
 29#include <assert.h>
 30#include <iostream>
 31
 32using lucene::document::Document;
 33using lucene::document::Field;
 34using lucene::index::IndexWriter;
 35using lucene::index::Term;
 36using lucene::index::TermDocs;
 37using lucene::search::BooleanQuery;
 38using lucene::search::IndexSearcher;
 39using lucene::search::Hits;
 40using lucene::search::PrefixFilter;
 41using lucene::search::TermQuery;
 42using lucene::search::Query;
 43using lucene::util::BitSet;
 44
 45using lucene::util::Reader;
 46using namespace std;
 47using namespace Strigi;
 48
 49struct CLuceneDocData {
 50    lucene::document::Document doc;
 51    std::string content;
 52};
 53
 54CLuceneIndexWriter::CLuceneIndexWriter(CLuceneIndexManager* m):
 55    manager(m), doccount(0) {
 56    string contentID(FieldRegister::contentFieldName.c_str());
 57    wstring cID(utf8toucs2(contentID));
 58    addMapping(_T(""),cID.c_str());
 59}
 60CLuceneIndexWriter::~CLuceneIndexWriter() {
 61}
 62const wchar_t*
 63CLuceneIndexWriter::systemlocation() {
 64    const static wstring s(utf8toucs2(FieldRegister::pathFieldName));
 65    return s.c_str();
 66}
 67namespace {
 68const wchar_t*
 69parentlocation() {
 70    const static wstring s(utf8toucs2(FieldRegister::parentLocationFieldName));
 71    return s.c_str();
 72}
 73}
 74void
 75CLuceneIndexWriter::addText(const AnalysisResult* idx, const char* text,
 76        int32_t length) {
 77    CLuceneDocData* doc = static_cast<CLuceneDocData*>(idx->writerData());
 78    doc->content.append(text, length);
 79}
 80
 81typedef map<wstring, wstring> CLuceneIndexWriterFieldMapType;
 82CLuceneIndexWriterFieldMapType CLuceneIndexWriterFieldMap;
 83
 84void CLuceneIndexWriter::addMapping(const TCHAR* from, const TCHAR* to){
 85    CLuceneIndexWriterFieldMap[from] = to;
 86}
 87const TCHAR*
 88CLuceneIndexWriter::mapId(const TCHAR* id) {
 89    if (id == 0) id = _T("");
 90    CLuceneIndexWriterFieldMapType::iterator itr
 91        = CLuceneIndexWriterFieldMap.find(id);
 92    if (itr == CLuceneIndexWriterFieldMap.end()) {
 93        return id;
 94    } else {
 95        return itr->second.c_str();
 96    }
 97}
 98void
 99CLuceneIndexWriter::addValue(const AnalysisResult* idx,
100        AnalyzerConfiguration::FieldType type, const TCHAR* name,
101        const TCHAR* value) {
102    CLuceneDocData* doc = static_cast<CLuceneDocData*>(idx->writerData());
103    int config = 0;
104    if ((type & AnalyzerConfiguration::Stored) == AnalyzerConfiguration::Stored){
105        config |= Field::STORE_YES;
106    } else {
107        config |= Field::STORE_NO;
108    }
109
110    if ((type & AnalyzerConfiguration::Indexed)
111            == AnalyzerConfiguration::Indexed) {
112        if ((type & AnalyzerConfiguration::Tokenized)
113                == AnalyzerConfiguration::Tokenized) {
114            config |= Field::INDEX_TOKENIZED;
115        } else {
116            config |= Field::INDEX_UNTOKENIZED;
117        }
118    } else {
119        config |= Field::INDEX_NO;
120    }
121
122    Field* field = new Field(name, value, config);
123    doc->doc.add(*field);
124}
125void
126CLuceneIndexWriter::addValue(const AnalysisResult* idx,
127        AnalyzerConfiguration::FieldType type, const TCHAR* fn,
128        const std::string& value) {
129    addValue(idx, type, CLuceneIndexWriter::mapId(fn), utf8toucs2(value).c_str());
130}
131void
132CLuceneIndexWriter::addValue(const Strigi::AnalysisResult* idx,
133        const Strigi::RegisteredField* field, const std::string& value) {
134    AnalyzerConfiguration::FieldType type
135        = idx->config().indexType(field);
136    if (type == AnalyzerConfiguration::None) return;
137    addValue(idx, type, utf8toucs2(field->key()).c_str(), value);
138}
139void
140CLuceneIndexWriter::addValue(const Strigi::AnalysisResult* idx,
141        const Strigi::RegisteredField* field, uint32_t value) {
142    ostringstream o;
143    o << value;
144    addValue(idx, field, o.str());
145}
146void
147CLuceneIndexWriter::addValue(const Strigi::AnalysisResult* idx,
148        const Strigi::RegisteredField* field, int32_t value) {
149    ostringstream o;
150    o << value;
151    addValue(idx, field, o.str());
152}
153void
154CLuceneIndexWriter::addValue(const Strigi::AnalysisResult* idx,
155        const Strigi::RegisteredField* field,
156        const unsigned char* data, uint32_t size) {
157    addValue(idx, field, string((const char*)data, (string::size_type)size));
158}
159void
160CLuceneIndexWriter::addValue(const Strigi::AnalysisResult* idx,
161        const Strigi::RegisteredField* field, double value) {
162    ostringstream o;
163    o << value;
164    addValue(idx, field, o.str());
165}
166void
167CLuceneIndexWriter::startAnalysis(const AnalysisResult* idx) {
168    doccount++;
169    CLuceneDocData*doc = new CLuceneDocData();
170    idx->setWriterData(doc);
171}
172/*
173    Close all left open indexwriters for this path.
174*/
175void
176CLuceneIndexWriter::finishAnalysis(const AnalysisResult* idx) {
177    CLuceneDocData* doc = static_cast<CLuceneDocData*>(idx->writerData());
178    wstring c(utf8toucs2(doc->content));
179
180    if (doc->content.length() > 0) {
181      const TCHAR* mappedFn = mapId(_T(""));
182
183      // add the stored field as compressed and indexed
184      doc->doc.add(*new Field(mappedFn, c.c_str(), Field::STORE_YES | Field::STORE_COMPRESS | Field::INDEX_TOKENIZED));
185    }
186    lucene::index::IndexWriter* writer = manager->refWriter();
187    if (writer) {
188        try {
189            writer->addDocument(&doc->doc);
190            fprintf(stderr, "added %s\n", idx->path().c_str());
191        } catch (CLuceneError& err) {
192            fprintf(stderr, "%s: %s\n", idx->path().c_str(), err.what());
193        }
194    }
195    manager->derefWriter();
196    delete doc;
197}
198void
199CLuceneIndexWriter::deleteEntries(const std::vector<std::string>& entries) {
200    // make sure the index reader is up to date
201    lucene::index::IndexReader* reader = manager->checkReader(true);
202    if (reader == NULL) {
203        fprintf(stderr,"cannot delete entry: lucene reader cannot be opened\n");
204        return;
205    }
206
207    lucene::index::IndexWriter* writer = manager->refWriter();
208
209    for (uint i=0; i<entries.size(); ++i) {
210        deleteEntry(entries[i], writer, reader);
211    }
212    writer->flush();
213    reader->commit();
214
215    manager->derefWriter();
216}
217void
218CLuceneIndexWriter::deleteEntry(const string& entry, lucene::index::IndexWriter* writer, lucene::index::IndexReader* reader) {
219    wstring path(utf8toucs2(entry));
220{
221    Term* t = _CLNEW Term(systemlocation(), path.c_str());
222    writer->deleteDocuments(t);
223    _CLDECDELETE(t);
224}
225{
226    Term* t = _CLNEW Term(parentlocation(), path.c_str());
227    writer->deleteDocuments(t);
228    _CLDECDELETE(t);
229}
230{
231    // delete all deeper nested files
232    wstring v = utf8toucs2(entry+"/");
233    Term* t(_CLNEW Term(parentlocation(), v.c_str()));
234    PrefixFilter* filter = _CLNEW PrefixFilter(t);
235    BitSet* b = filter->bits(reader);
236    _CLDELETE(filter);
237    int32_t size = b->size();
238    for (int id = 0; id < size; ++id) {
239        if (b->get(id) && !reader->isDeleted(id)) {
240            reader->deleteDocument(id);
241        }
242    }
243    _CLDELETE(b);
244    _CLDECDELETE(t);
245}
246}
247void
248CLuceneIndexWriter::deleteAllEntries() {
249    lucene::index::IndexReader* reader = manager->checkReader();
250    if ( reader != NULL ){
251      for ( int32_t i=0;i<reader->maxDoc();i++ ){
252        reader->deleteDocument(i);
253      }
254      reader->flush();
255    }
256}
257void
258CLuceneIndexWriter::commit() {
259    lucene::index::IndexWriter* writer = manager->refWriter();
260    writer->flush();
261    manager->derefWriter();
262}
263
264void
265CLuceneIndexWriter::initWriterData(const FieldRegister& f) {
266    map<string, RegisteredField*>::const_iterator i;
267    map<string, RegisteredField*>::const_iterator end = f.fields().end();
268    for (i = f.fields().begin(); i != end; ++i) {
269        i->second->setWriterData(0);
270    }
271}
272void
273CLuceneIndexWriter::releaseWriterData(const FieldRegister& f) {
274    map<string, RegisteredField*>::const_iterator i;
275    map<string, RegisteredField*>::const_iterator end = f.fields().end();
276    for (i = f.fields().begin(); i != end; ++i) {
277        delete static_cast<int*>(i->second->writerData());
278    }
279}