PageRenderTime 52ms CodeModel.GetById 21ms app.highlight 26ms RepoModel.GetById 0ms app.codeStats 1ms

/strigi-0.7.7/libstreamanalyzer/lib/streamanalyzer.cpp

#
C++ | 513 lines | 447 code | 15 blank | 51 comment | 109 complexity | 15841476a48b36a0c27d5f3419158447 MD5 | raw file
Possible License(s): LGPL-2.0
  1/* This file is part of Strigi Desktop Search
  2 *
  3 * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
  4 *
  5 * This library is free software; you can redistribute it and/or
  6 * modify it under the terms of the GNU Library General Public
  7 * License as published by the Free Software Foundation; either
  8 * version 2 of the License, or (at your option) any later version.
  9 *
 10 * This library is distributed in the hope that it will be useful,
 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 13 * Library General Public License for more details.
 14 *
 15 * You should have received a copy of the GNU Library General Public License
 16 * along with this library; see the file COPYING.LIB.  If not, write to
 17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 18 * Boston, MA 02110-1301, USA.
 19 */
 20#include <strigi/streamanalyzer.h>
 21#include <strigi/fileinputstream.h>
 22#include <strigi/streamendanalyzer.h>
 23#include <strigi/streamthroughanalyzer.h>
 24#include <strigi/streamlineanalyzer.h>
 25#include <strigi/streameventanalyzer.h>
 26#include <strigi/streamsaxanalyzer.h>
 27#include "endanalyzers/bz2endanalyzer.h"
 28#include "endanalyzers/lzmaendanalyzer.h"
 29#include "eventanalyzers/mimeeventanalyzer.h"
 30#include "endanalyzers/bmpendanalyzer.h"
 31#include "endanalyzers/textendanalyzer.h"
 32#include "endanalyzers/tarendanalyzer.h"
 33#include "endanalyzers/arendanalyzer.h"
 34#include "endanalyzers/zipexeendanalyzer.h"
 35#include "endanalyzers/odfendanalyzer.h"
 36#include "endanalyzers/oleendanalyzer.h"
 37#include "endanalyzers/rpmendanalyzer.h"
 38#include "endanalyzers/cpioendanalyzer.h"
 39#include "endanalyzers/pdfendanalyzer.h"
 40#include "endanalyzers/sdfendanalyzer.h"
 41#include "endanalyzers/pngendanalyzer.h"
 42#include "endanalyzers/gzipendanalyzer.h"
 43#include "lineanalyzers/m3ustreamanalyzer.h"
 44#include "endanalyzers/mailendanalyzer.h"
 45#include "endanalyzers/mpegendanalyzer.h"
 46#include "endanalyzers/helperendanalyzer.h"
 47#include <strigi/dataeventinputstream.h>
 48#include "endanalyzers/id3endanalyzer.h"
 49#include "throughanalyzers/oggthroughanalyzer.h"
 50#include "endanalyzers/flacendanalyzer.h"
 51#include <strigi/analysisresult.h>
 52#include <strigi/indexwriter.h>
 53#include <strigi/analyzerconfiguration.h>
 54#include <strigi/textutils.h>
 55#include "analyzerloader.h"
 56#include "eventthroughanalyzer.h"
 57#include "saxanalyzers/htmlsaxanalyzer.h"
 58#include <strigi/indexpluginloader.h>
 59#include <sys/stat.h>
 60#ifdef WIN32
 61 //#include "ifilterendanalyzer.h"
 62#endif
 63#include <iostream>
 64#include <config.h>
 65
 66using namespace std;
 67using namespace Strigi;
 68
 69namespace Strigi {
 70
 71class StreamAnalyzerPrivate {
 72public:
 73    AnalyzerConfiguration& conf;
 74    vector<StreamThroughAnalyzerFactory*> throughfactories;
 75    vector<StreamEndAnalyzerFactory*> endfactories;
 76    vector<StreamSaxAnalyzerFactory*> saxfactories;
 77    vector<StreamLineAnalyzerFactory*> linefactories;
 78    vector<StreamEventAnalyzerFactory*> eventfactories;
 79    vector<vector<StreamEndAnalyzer*> > end;
 80    vector<vector<StreamThroughAnalyzer*> > through;
 81    IndexWriter* writer;
 82
 83    AnalyzerLoader* moduleLoader;
 84    const RegisteredField* sizefield;
 85    const RegisteredField* errorfield;
 86    void initializeThroughFactories();
 87    void initializeEndFactories();
 88    void initializeSaxFactories();
 89    void initializeLineFactories();
 90    void initializeEventFactories();
 91    void addFactory(StreamThroughAnalyzerFactory* f);
 92    void addFactory(StreamEndAnalyzerFactory* f);
 93    void addFactory(StreamSaxAnalyzerFactory* f);
 94    void addFactory(StreamLineAnalyzerFactory* f);
 95    void addFactory(StreamEventAnalyzerFactory* f);
 96    void addThroughAnalyzers();
 97    void addEndAnalyzers();
 98    void removeIndexable(unsigned depth);
 99    signed char analyze(AnalysisResult& idx, StreamBase<char>* input);
100
101    StreamAnalyzerPrivate(AnalyzerConfiguration& c);
102    ~StreamAnalyzerPrivate();
103};
104
105} // namespace Strigi
106StreamAnalyzerPrivate::StreamAnalyzerPrivate(AnalyzerConfiguration& c)
107        :conf(c), writer(0) {
108    moduleLoader = new AnalyzerLoader();
109    sizefield = c.fieldRegister().sizeField;
110    errorfield = c.fieldRegister().parseErrorField;
111
112    // load the plugins from the environment setting
113    const char* strigipluginpath(getenv("STRIGI_PLUGIN_PATH"));
114    if (strigipluginpath) {
115        vector<string> strigipluginpaths = getdirs(strigipluginpath);
116        for (uint i=0; i<strigipluginpaths.size(); ++i) {
117            moduleLoader->loadPlugins(strigipluginpaths[i].c_str());
118        }
119    } else {
120        moduleLoader->loadPlugins( LIBINSTALLDIR "/strigi");
121    }
122
123    initializeSaxFactories();
124    initializeLineFactories();
125    initializeEventFactories();
126    initializeThroughFactories();
127    initializeEndFactories();
128}
129StreamAnalyzerPrivate::~StreamAnalyzerPrivate() {
130    // delete all factories
131    vector<StreamThroughAnalyzerFactory*>::iterator ta;
132    for (ta = throughfactories.begin(); ta != throughfactories.end(); ++ta) {
133        delete *ta;
134    }
135    vector<StreamEndAnalyzerFactory*>::iterator ea;
136    for (ea = endfactories.begin(); ea != endfactories.end(); ++ea) {
137        delete *ea;
138    }
139    vector<StreamSaxAnalyzerFactory*>::iterator sa;
140    for (sa = saxfactories.begin(); sa != saxfactories.end(); ++sa) {
141        delete *sa;
142    }
143    vector<StreamLineAnalyzerFactory*>::iterator la;
144    for (la = linefactories.begin(); la != linefactories.end(); ++la) {
145        delete *la;
146    }
147    vector<StreamEventAnalyzerFactory*>::iterator da;
148    for (da = eventfactories.begin(); da != eventfactories.end(); ++da) {
149        delete *da;
150    }
151    // delete the through analyzers and end analyzers
152    vector<vector<StreamThroughAnalyzer*> >::iterator tIter;
153    for (tIter = through.begin(); tIter != through.end(); ++tIter) {
154        vector<StreamThroughAnalyzer*>::iterator t;
155        for (t = tIter->begin(); t != tIter->end(); ++t) {
156            delete *t;
157        }
158    }
159    vector<vector<StreamEndAnalyzer*> >::iterator eIter;
160    for (eIter = end.begin(); eIter != end.end(); ++eIter) {
161        vector<StreamEndAnalyzer*>::iterator e;
162        for (e = eIter->begin(); e != eIter->end(); ++e) {
163            delete *e;
164        }
165    }
166    delete moduleLoader;
167    if (writer) {
168        writer->releaseWriterData(conf.fieldRegister());
169    }
170}
171
172StreamAnalyzer::StreamAnalyzer(AnalyzerConfiguration& c)
173        :p(new StreamAnalyzerPrivate(c)) {
174}
175StreamAnalyzer::~StreamAnalyzer() {
176    delete p;
177}
178void
179StreamAnalyzer::setIndexWriter(IndexWriter& w) {
180    if (p->writer != 0) {
181        p->writer->releaseWriterData(p->conf.fieldRegister());
182    }
183    p->writer = &w;
184    p->writer->initWriterData(p->conf.fieldRegister());
185}
186signed char
187StreamAnalyzer::indexFile(const char *filepath) {
188    string path(filepath);
189    return indexFile(path);
190}
191signed char
192StreamAnalyzer::indexFile(const string& filepath) {
193    if (!checkUtf8(filepath.c_str())) {
194        return 1;
195    }
196    if (p->writer == 0) {
197        return 1;
198    }
199    struct stat s;
200    stat(filepath.c_str(), &s);
201    // ensure a decent buffer size
202    string name;
203    AnalysisResult analysisresult(filepath, s.st_mtime, *p->writer, *this);
204    InputStream* file = FileInputStream::open(filepath.c_str());
205    signed char r;
206    if (file->status() == Ok) {
207        r = analysisresult.index(file);
208    } else {
209        r = analysisresult.index(0);
210    }
211    delete file;
212    return r;
213}
214void
215StreamAnalyzerPrivate::addFactory(StreamThroughAnalyzerFactory* f) {
216    f->registerFields(conf.fieldRegister());
217    if (conf.useFactory(f)) {
218        throughfactories.push_back(f);
219    } else {
220        delete f;
221    }
222}
223void
224StreamAnalyzerPrivate::initializeSaxFactories() {
225    list<StreamSaxAnalyzerFactory*> plugins
226        = moduleLoader->streamSaxAnalyzerFactories();
227    list<StreamSaxAnalyzerFactory*>::iterator i;
228    for (i = plugins.begin(); i != plugins.end(); ++i) {
229        addFactory(*i);
230    }
231    addFactory(new HtmlSaxAnalyzerFactory());
232}
233void
234StreamAnalyzerPrivate::initializeLineFactories() {
235    list<StreamLineAnalyzerFactory*> plugins
236        = moduleLoader->streamLineAnalyzerFactories();
237    list<StreamLineAnalyzerFactory*>::iterator i;
238    for (i = plugins.begin(); i != plugins.end(); ++i) {
239        addFactory(*i);
240    }
241//    addFactory(new OdfMimeTypeLineAnalyzerFactory());
242    addFactory(new M3uLineAnalyzerFactory());
243}
244void
245StreamAnalyzerPrivate::initializeEventFactories() {
246    list<StreamEventAnalyzerFactory*> plugins
247        = moduleLoader->streamEventAnalyzerFactories();
248    list<StreamEventAnalyzerFactory*>::iterator i;
249    addFactory(new MimeEventAnalyzerFactory());
250    for (i = plugins.begin(); i != plugins.end(); ++i) {
251        addFactory(*i);
252    }
253}
254void
255StreamAnalyzerPrivate::initializeThroughFactories() {
256    list<StreamThroughAnalyzerFactory*> plugins
257        = moduleLoader->streamThroughAnalyzerFactories();
258    list<StreamThroughAnalyzerFactory*>::iterator i;
259    for (i = plugins.begin(); i != plugins.end(); ++i) {
260        addFactory(*i);
261    }
262    addFactory(new OggThroughAnalyzerFactory());
263    addFactory(new EventThroughAnalyzerFactory(saxfactories, linefactories,
264        eventfactories));
265}
266void
267StreamAnalyzerPrivate::addFactory(StreamEventAnalyzerFactory* f) {
268    f->registerFields(conf.fieldRegister());
269    if (conf.useFactory(f)) {
270        eventfactories.push_back(f);
271    } else {
272        delete f;
273    }
274}
275void
276StreamAnalyzerPrivate::addFactory(StreamLineAnalyzerFactory* f) {
277    f->registerFields(conf.fieldRegister());
278    if (conf.useFactory(f)) {
279        linefactories.push_back(f);
280    } else {
281        delete f;
282    }
283}
284void
285StreamAnalyzerPrivate::addFactory(StreamSaxAnalyzerFactory* f) {
286    f->registerFields(conf.fieldRegister());
287    if (conf.useFactory(f)) {
288        saxfactories.push_back(f);
289    } else {
290        delete f;
291    }
292}
293void
294StreamAnalyzerPrivate::addFactory(StreamEndAnalyzerFactory* f) {
295    f->registerFields(conf.fieldRegister());
296    if (conf.useFactory(f)) {
297        endfactories.push_back(f);
298    } else {
299        delete f;
300    }
301}
302/**
303 * Instantiate factories for all analyzers.
304 **/
305void
306StreamAnalyzerPrivate::initializeEndFactories() {
307    list<StreamEndAnalyzerFactory*> plugins
308        = moduleLoader->streamEndAnalyzerFactories();
309    list<StreamEndAnalyzerFactory*>::iterator i;
310    for (i = plugins.begin(); i != plugins.end(); ++i) {
311        addFactory(*i);
312    }
313    addFactory(new Bz2EndAnalyzerFactory());
314    addFactory(new GZipEndAnalyzerFactory());
315    addFactory(new OleEndAnalyzerFactory());
316    addFactory(new TarEndAnalyzerFactory());
317    addFactory(new ArEndAnalyzerFactory());
318    addFactory(new MailEndAnalyzerFactory());
319//    addFactory(new MpegEndAnalyzerFactory()); //Xine fallback works so much better now
320    addFactory(new OdfEndAnalyzerFactory());
321    addFactory(new ZipEndAnalyzerFactory());
322    addFactory(new ZipExeEndAnalyzerFactory());
323    addFactory(new RpmEndAnalyzerFactory());
324    addFactory(new CpioEndAnalyzerFactory());
325    addFactory(new PngEndAnalyzerFactory());
326    addFactory(new BmpEndAnalyzerFactory());
327    addFactory(new FlacEndAnalyzerFactory());
328    addFactory(new ID3EndAnalyzerFactory());
329    addFactory(new PdfEndAnalyzerFactory());
330    addFactory(new SdfEndAnalyzerFactory());
331    addFactory(new LzmaEndAnalyzerFactory());
332#ifndef _MSC_VER
333    addFactory(new HelperEndAnalyzerFactory());
334#endif
335    addFactory(new TextEndAnalyzerFactory());
336}
337void
338StreamAnalyzerPrivate::addThroughAnalyzers() {
339    through.resize(through.size()+1);
340    vector<vector<StreamThroughAnalyzer*> >::reverse_iterator tIter;
341    tIter = through.rbegin();
342    vector<StreamThroughAnalyzerFactory*>::iterator ta;
343    for (ta = throughfactories.begin(); ta != throughfactories.end(); ++ta) {
344        tIter->push_back((*ta)->newInstance());
345    }
346}
347void
348StreamAnalyzerPrivate::addEndAnalyzers() {
349    end.resize(end.size()+1);
350    vector<vector<StreamEndAnalyzer*> >::reverse_iterator eIter;
351    eIter = end.rbegin();
352    vector<StreamEndAnalyzerFactory*>::iterator ea;
353    for (ea = endfactories.begin(); ea != endfactories.end(); ++ea) {
354        eIter->push_back((*ea)->newInstance());
355    }
356}
357signed char
358StreamAnalyzer::analyze(AnalysisResult& idx, StreamBase<char>* input) {
359    return p->analyze(idx, input);
360}
361signed char
362StreamAnalyzerPrivate::analyze(AnalysisResult& idx, StreamBase<char>* input) {
363    //cerr << "analyze " << idx.path().c_str() << endl;
364
365    // retrieve or construct the through analyzers and end analyzers
366    vector<vector<StreamThroughAnalyzer*> >::iterator tIter;
367    vector<vector<StreamEndAnalyzer*> >::iterator eIter;
368    while ((int)through.size() <= idx.depth()) {
369        addThroughAnalyzers();
370        addEndAnalyzers();
371    }
372    tIter = through.begin() + idx.depth();
373    eIter = end.begin() + idx.depth();
374
375    // read the headersize size before connecting the throughanalyzers
376    // This ensures that the first read is at least this size, even if the
377    // throughanalyzers read smaller chunks.
378    bool finished = false;
379    const char* header = 0;
380    int32_t headersize = 1024;
381    if (input) {
382        headersize = input->read(header, headersize, headersize);
383        input->reset(0);
384        if (headersize < 0) finished = true;
385    }
386
387    // insert the through analyzers
388    vector<StreamThroughAnalyzer*>::iterator ts;
389    for (ts = tIter->begin(); (input == 0 || input->status() == Ok)
390            && ts != tIter->end(); ++ts) {
391        (*ts)->setIndexable(&idx);
392        input = (*ts)->connectInputStream(input);
393        if (input && input->position() != 0) {
394            cerr << "Analyzer " << (*ts)->name() << " has left the stream in a bad state." << endl;
395        }
396    }
397
398    // reread the header so we can use it for the endanalyzers
399    if (input && headersize > 0) {
400        headersize = input->read(header, headersize, headersize);
401        if (headersize <= 0) {
402            finished = true;
403        } else if (input->reset(0) != 0) {
404            cerr << "resetting is impossible!! pos: " << input->position()
405                << " status: " << input->status() << endl;
406        }
407    } else {
408        // indicate that we have no data in the stream
409        headersize = -1;
410        finished = true;
411    }
412    size_t es = 0;
413    size_t itersize = eIter->size();
414    while (!finished && es != itersize) {
415        StreamEndAnalyzer* sea = (*eIter)[es];
416        if (sea->checkHeader(header, headersize)) {
417            idx.setEndAnalyzer(sea);
418            char ar = sea->analyze(idx, input);
419            if (ar) {
420// FIXME: find either a NIE-compliant way to report errors or use some API for this
421//                idx.addValue(errorfield, sea->name() + string(": ")
422//                    + sea->error());
423                if (!idx.config().indexMore()) {
424                    removeIndexable(idx.depth());
425                    return -1;
426                }
427                int64_t pos = input->reset(0);
428                if (pos != 0) { // could not reset
429                    cerr << "could not reset stream of " << idx.path().c_str()
430                        << " from pos " << input->position()
431                        << " to 0 after reading with " << sea->name()
432                        << ": " << sea->error().c_str() << endl;
433                    finished = true;
434                } else {
435                    // refresh the pointer to the start of the data
436                    headersize = input->read(header, headersize, headersize);
437    		    if (input->reset(0) != 0) {
438        		cerr << "resetting again is impossible!! pos: "
439                             << input->position() << " status: "
440                             << input->status() << endl;
441    		    }
442                    if (headersize < 0) finished = true;
443                }
444            } else {
445                finished = true;
446            }
447            eIter = end.begin() + idx.depth();
448        }
449        if (!finished) {
450            finished = !conf.indexMore();
451        }
452        es++;
453    }
454    idx.setEndAnalyzer(0);
455    if (input) {
456        // make sure the entire stream is read if the size is not known
457        bool ready;
458        tIter = through.begin() + idx.depth();
459        uint32_t skipsize = 4096;
460        do {
461            // ask the analyzerconfiguration if we should continue
462            int64_t max = idx.config().maximalStreamReadLength(idx);
463            if (!idx.config().indexMore()
464                    || (max != -1 && input->position() >= max)) {
465                // we are done
466                return 0;
467            }
468            ready = input->size() != -1;
469            vector<StreamThroughAnalyzer*>::iterator ts;
470            for (ts = tIter->begin(); ready && ts != tIter->end(); ++ts) {
471                ready = (*ts)->isReadyWithStream();
472            }
473            if (!ready) {
474                input->skip(skipsize);
475                if (skipsize < 131072) {
476                    skipsize *= 4;
477                }
478            }
479        } while (!ready && input->status() == Ok);
480        if (input->status() == Error) {
481            fprintf(stderr, "Error: %s\n", input->error());
482            removeIndexable(idx.depth());
483            return -2;
484        }
485    }
486
487    // store the size of the stream
488    if (input && input->status() != Error && input->size() >= 0) {
489        // TODO remove cast
490        idx.addValue(sizefield, (uint32_t)input->size());
491    }
492
493    // remove references to the analysisresult before it goes out of scope
494    removeIndexable(idx.depth());
495    return 0;
496}
497/**
498 * Remove references to the analysisresult before it goes out of scope.
499 **/
500void
501StreamAnalyzerPrivate::removeIndexable(uint depth) {
502    vector<vector<StreamThroughAnalyzer*> >::iterator tIter;
503    vector<StreamThroughAnalyzer*>::iterator ts;
504    tIter = through.begin() + depth;
505    for (ts = tIter->begin(); ts != tIter->end(); ++ts) {
506        // remove references to the analysisresult before it goes out of scope
507        (*ts)->setIndexable(0);
508    }
509}
510AnalyzerConfiguration&
511StreamAnalyzer::configuration() const {
512    return p->conf;
513}