/strigi-0.7.7/libstreamanalyzer/lib/streamanalyzer.cpp
C++ | 513 lines | 447 code | 15 blank | 51 comment | 109 complexity | 15841476a48b36a0c27d5f3419158447 MD5 | raw file
Possible License(s): LGPL-2.0
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20#include <strigi/streamanalyzer.h>
21#include <strigi/fileinputstream.h>
22#include <strigi/streamendanalyzer.h>
23#include <strigi/streamthroughanalyzer.h>
24#include <strigi/streamlineanalyzer.h>
25#include <strigi/streameventanalyzer.h>
26#include <strigi/streamsaxanalyzer.h>
27#include "endanalyzers/bz2endanalyzer.h"
28#include "endanalyzers/lzmaendanalyzer.h"
29#include "eventanalyzers/mimeeventanalyzer.h"
30#include "endanalyzers/bmpendanalyzer.h"
31#include "endanalyzers/textendanalyzer.h"
32#include "endanalyzers/tarendanalyzer.h"
33#include "endanalyzers/arendanalyzer.h"
34#include "endanalyzers/zipexeendanalyzer.h"
35#include "endanalyzers/odfendanalyzer.h"
36#include "endanalyzers/oleendanalyzer.h"
37#include "endanalyzers/rpmendanalyzer.h"
38#include "endanalyzers/cpioendanalyzer.h"
39#include "endanalyzers/pdfendanalyzer.h"
40#include "endanalyzers/sdfendanalyzer.h"
41#include "endanalyzers/pngendanalyzer.h"
42#include "endanalyzers/gzipendanalyzer.h"
43#include "lineanalyzers/m3ustreamanalyzer.h"
44#include "endanalyzers/mailendanalyzer.h"
45#include "endanalyzers/mpegendanalyzer.h"
46#include "endanalyzers/helperendanalyzer.h"
47#include <strigi/dataeventinputstream.h>
48#include "endanalyzers/id3endanalyzer.h"
49#include "throughanalyzers/oggthroughanalyzer.h"
50#include "endanalyzers/flacendanalyzer.h"
51#include <strigi/analysisresult.h>
52#include <strigi/indexwriter.h>
53#include <strigi/analyzerconfiguration.h>
54#include <strigi/textutils.h>
55#include "analyzerloader.h"
56#include "eventthroughanalyzer.h"
57#include "saxanalyzers/htmlsaxanalyzer.h"
58#include <strigi/indexpluginloader.h>
59#include <sys/stat.h>
60#ifdef WIN32
61 //#include "ifilterendanalyzer.h"
62#endif
63#include <iostream>
64#include <config.h>
65
66using namespace std;
67using namespace Strigi;
68
69namespace Strigi {
70
71class StreamAnalyzerPrivate {
72public:
73 AnalyzerConfiguration& conf;
74 vector<StreamThroughAnalyzerFactory*> throughfactories;
75 vector<StreamEndAnalyzerFactory*> endfactories;
76 vector<StreamSaxAnalyzerFactory*> saxfactories;
77 vector<StreamLineAnalyzerFactory*> linefactories;
78 vector<StreamEventAnalyzerFactory*> eventfactories;
79 vector<vector<StreamEndAnalyzer*> > end;
80 vector<vector<StreamThroughAnalyzer*> > through;
81 IndexWriter* writer;
82
83 AnalyzerLoader* moduleLoader;
84 const RegisteredField* sizefield;
85 const RegisteredField* errorfield;
86 void initializeThroughFactories();
87 void initializeEndFactories();
88 void initializeSaxFactories();
89 void initializeLineFactories();
90 void initializeEventFactories();
91 void addFactory(StreamThroughAnalyzerFactory* f);
92 void addFactory(StreamEndAnalyzerFactory* f);
93 void addFactory(StreamSaxAnalyzerFactory* f);
94 void addFactory(StreamLineAnalyzerFactory* f);
95 void addFactory(StreamEventAnalyzerFactory* f);
96 void addThroughAnalyzers();
97 void addEndAnalyzers();
98 void removeIndexable(unsigned depth);
99 signed char analyze(AnalysisResult& idx, StreamBase<char>* input);
100
101 StreamAnalyzerPrivate(AnalyzerConfiguration& c);
102 ~StreamAnalyzerPrivate();
103};
104
105} // namespace Strigi
106StreamAnalyzerPrivate::StreamAnalyzerPrivate(AnalyzerConfiguration& c)
107 :conf(c), writer(0) {
108 moduleLoader = new AnalyzerLoader();
109 sizefield = c.fieldRegister().sizeField;
110 errorfield = c.fieldRegister().parseErrorField;
111
112 // load the plugins from the environment setting
113 const char* strigipluginpath(getenv("STRIGI_PLUGIN_PATH"));
114 if (strigipluginpath) {
115 vector<string> strigipluginpaths = getdirs(strigipluginpath);
116 for (uint i=0; i<strigipluginpaths.size(); ++i) {
117 moduleLoader->loadPlugins(strigipluginpaths[i].c_str());
118 }
119 } else {
120 moduleLoader->loadPlugins( LIBINSTALLDIR "/strigi");
121 }
122
123 initializeSaxFactories();
124 initializeLineFactories();
125 initializeEventFactories();
126 initializeThroughFactories();
127 initializeEndFactories();
128}
129StreamAnalyzerPrivate::~StreamAnalyzerPrivate() {
130 // delete all factories
131 vector<StreamThroughAnalyzerFactory*>::iterator ta;
132 for (ta = throughfactories.begin(); ta != throughfactories.end(); ++ta) {
133 delete *ta;
134 }
135 vector<StreamEndAnalyzerFactory*>::iterator ea;
136 for (ea = endfactories.begin(); ea != endfactories.end(); ++ea) {
137 delete *ea;
138 }
139 vector<StreamSaxAnalyzerFactory*>::iterator sa;
140 for (sa = saxfactories.begin(); sa != saxfactories.end(); ++sa) {
141 delete *sa;
142 }
143 vector<StreamLineAnalyzerFactory*>::iterator la;
144 for (la = linefactories.begin(); la != linefactories.end(); ++la) {
145 delete *la;
146 }
147 vector<StreamEventAnalyzerFactory*>::iterator da;
148 for (da = eventfactories.begin(); da != eventfactories.end(); ++da) {
149 delete *da;
150 }
151 // delete the through analyzers and end analyzers
152 vector<vector<StreamThroughAnalyzer*> >::iterator tIter;
153 for (tIter = through.begin(); tIter != through.end(); ++tIter) {
154 vector<StreamThroughAnalyzer*>::iterator t;
155 for (t = tIter->begin(); t != tIter->end(); ++t) {
156 delete *t;
157 }
158 }
159 vector<vector<StreamEndAnalyzer*> >::iterator eIter;
160 for (eIter = end.begin(); eIter != end.end(); ++eIter) {
161 vector<StreamEndAnalyzer*>::iterator e;
162 for (e = eIter->begin(); e != eIter->end(); ++e) {
163 delete *e;
164 }
165 }
166 delete moduleLoader;
167 if (writer) {
168 writer->releaseWriterData(conf.fieldRegister());
169 }
170}
171
172StreamAnalyzer::StreamAnalyzer(AnalyzerConfiguration& c)
173 :p(new StreamAnalyzerPrivate(c)) {
174}
175StreamAnalyzer::~StreamAnalyzer() {
176 delete p;
177}
178void
179StreamAnalyzer::setIndexWriter(IndexWriter& w) {
180 if (p->writer != 0) {
181 p->writer->releaseWriterData(p->conf.fieldRegister());
182 }
183 p->writer = &w;
184 p->writer->initWriterData(p->conf.fieldRegister());
185}
186signed char
187StreamAnalyzer::indexFile(const char *filepath) {
188 string path(filepath);
189 return indexFile(path);
190}
191signed char
192StreamAnalyzer::indexFile(const string& filepath) {
193 if (!checkUtf8(filepath.c_str())) {
194 return 1;
195 }
196 if (p->writer == 0) {
197 return 1;
198 }
199 struct stat s;
200 stat(filepath.c_str(), &s);
201 // ensure a decent buffer size
202 string name;
203 AnalysisResult analysisresult(filepath, s.st_mtime, *p->writer, *this);
204 InputStream* file = FileInputStream::open(filepath.c_str());
205 signed char r;
206 if (file->status() == Ok) {
207 r = analysisresult.index(file);
208 } else {
209 r = analysisresult.index(0);
210 }
211 delete file;
212 return r;
213}
214void
215StreamAnalyzerPrivate::addFactory(StreamThroughAnalyzerFactory* f) {
216 f->registerFields(conf.fieldRegister());
217 if (conf.useFactory(f)) {
218 throughfactories.push_back(f);
219 } else {
220 delete f;
221 }
222}
223void
224StreamAnalyzerPrivate::initializeSaxFactories() {
225 list<StreamSaxAnalyzerFactory*> plugins
226 = moduleLoader->streamSaxAnalyzerFactories();
227 list<StreamSaxAnalyzerFactory*>::iterator i;
228 for (i = plugins.begin(); i != plugins.end(); ++i) {
229 addFactory(*i);
230 }
231 addFactory(new HtmlSaxAnalyzerFactory());
232}
233void
234StreamAnalyzerPrivate::initializeLineFactories() {
235 list<StreamLineAnalyzerFactory*> plugins
236 = moduleLoader->streamLineAnalyzerFactories();
237 list<StreamLineAnalyzerFactory*>::iterator i;
238 for (i = plugins.begin(); i != plugins.end(); ++i) {
239 addFactory(*i);
240 }
241// addFactory(new OdfMimeTypeLineAnalyzerFactory());
242 addFactory(new M3uLineAnalyzerFactory());
243}
244void
245StreamAnalyzerPrivate::initializeEventFactories() {
246 list<StreamEventAnalyzerFactory*> plugins
247 = moduleLoader->streamEventAnalyzerFactories();
248 list<StreamEventAnalyzerFactory*>::iterator i;
249 addFactory(new MimeEventAnalyzerFactory());
250 for (i = plugins.begin(); i != plugins.end(); ++i) {
251 addFactory(*i);
252 }
253}
254void
255StreamAnalyzerPrivate::initializeThroughFactories() {
256 list<StreamThroughAnalyzerFactory*> plugins
257 = moduleLoader->streamThroughAnalyzerFactories();
258 list<StreamThroughAnalyzerFactory*>::iterator i;
259 for (i = plugins.begin(); i != plugins.end(); ++i) {
260 addFactory(*i);
261 }
262 addFactory(new OggThroughAnalyzerFactory());
263 addFactory(new EventThroughAnalyzerFactory(saxfactories, linefactories,
264 eventfactories));
265}
266void
267StreamAnalyzerPrivate::addFactory(StreamEventAnalyzerFactory* f) {
268 f->registerFields(conf.fieldRegister());
269 if (conf.useFactory(f)) {
270 eventfactories.push_back(f);
271 } else {
272 delete f;
273 }
274}
275void
276StreamAnalyzerPrivate::addFactory(StreamLineAnalyzerFactory* f) {
277 f->registerFields(conf.fieldRegister());
278 if (conf.useFactory(f)) {
279 linefactories.push_back(f);
280 } else {
281 delete f;
282 }
283}
284void
285StreamAnalyzerPrivate::addFactory(StreamSaxAnalyzerFactory* f) {
286 f->registerFields(conf.fieldRegister());
287 if (conf.useFactory(f)) {
288 saxfactories.push_back(f);
289 } else {
290 delete f;
291 }
292}
293void
294StreamAnalyzerPrivate::addFactory(StreamEndAnalyzerFactory* f) {
295 f->registerFields(conf.fieldRegister());
296 if (conf.useFactory(f)) {
297 endfactories.push_back(f);
298 } else {
299 delete f;
300 }
301}
302/**
303 * Instantiate factories for all analyzers.
304 **/
305void
306StreamAnalyzerPrivate::initializeEndFactories() {
307 list<StreamEndAnalyzerFactory*> plugins
308 = moduleLoader->streamEndAnalyzerFactories();
309 list<StreamEndAnalyzerFactory*>::iterator i;
310 for (i = plugins.begin(); i != plugins.end(); ++i) {
311 addFactory(*i);
312 }
313 addFactory(new Bz2EndAnalyzerFactory());
314 addFactory(new GZipEndAnalyzerFactory());
315 addFactory(new OleEndAnalyzerFactory());
316 addFactory(new TarEndAnalyzerFactory());
317 addFactory(new ArEndAnalyzerFactory());
318 addFactory(new MailEndAnalyzerFactory());
319// addFactory(new MpegEndAnalyzerFactory()); //Xine fallback works so much better now
320 addFactory(new OdfEndAnalyzerFactory());
321 addFactory(new ZipEndAnalyzerFactory());
322 addFactory(new ZipExeEndAnalyzerFactory());
323 addFactory(new RpmEndAnalyzerFactory());
324 addFactory(new CpioEndAnalyzerFactory());
325 addFactory(new PngEndAnalyzerFactory());
326 addFactory(new BmpEndAnalyzerFactory());
327 addFactory(new FlacEndAnalyzerFactory());
328 addFactory(new ID3EndAnalyzerFactory());
329 addFactory(new PdfEndAnalyzerFactory());
330 addFactory(new SdfEndAnalyzerFactory());
331 addFactory(new LzmaEndAnalyzerFactory());
332#ifndef _MSC_VER
333 addFactory(new HelperEndAnalyzerFactory());
334#endif
335 addFactory(new TextEndAnalyzerFactory());
336}
337void
338StreamAnalyzerPrivate::addThroughAnalyzers() {
339 through.resize(through.size()+1);
340 vector<vector<StreamThroughAnalyzer*> >::reverse_iterator tIter;
341 tIter = through.rbegin();
342 vector<StreamThroughAnalyzerFactory*>::iterator ta;
343 for (ta = throughfactories.begin(); ta != throughfactories.end(); ++ta) {
344 tIter->push_back((*ta)->newInstance());
345 }
346}
347void
348StreamAnalyzerPrivate::addEndAnalyzers() {
349 end.resize(end.size()+1);
350 vector<vector<StreamEndAnalyzer*> >::reverse_iterator eIter;
351 eIter = end.rbegin();
352 vector<StreamEndAnalyzerFactory*>::iterator ea;
353 for (ea = endfactories.begin(); ea != endfactories.end(); ++ea) {
354 eIter->push_back((*ea)->newInstance());
355 }
356}
357signed char
358StreamAnalyzer::analyze(AnalysisResult& idx, StreamBase<char>* input) {
359 return p->analyze(idx, input);
360}
361signed char
362StreamAnalyzerPrivate::analyze(AnalysisResult& idx, StreamBase<char>* input) {
363 //cerr << "analyze " << idx.path().c_str() << endl;
364
365 // retrieve or construct the through analyzers and end analyzers
366 vector<vector<StreamThroughAnalyzer*> >::iterator tIter;
367 vector<vector<StreamEndAnalyzer*> >::iterator eIter;
368 while ((int)through.size() <= idx.depth()) {
369 addThroughAnalyzers();
370 addEndAnalyzers();
371 }
372 tIter = through.begin() + idx.depth();
373 eIter = end.begin() + idx.depth();
374
375 // read the headersize size before connecting the throughanalyzers
376 // This ensures that the first read is at least this size, even if the
377 // throughanalyzers read smaller chunks.
378 bool finished = false;
379 const char* header = 0;
380 int32_t headersize = 1024;
381 if (input) {
382 headersize = input->read(header, headersize, headersize);
383 input->reset(0);
384 if (headersize < 0) finished = true;
385 }
386
387 // insert the through analyzers
388 vector<StreamThroughAnalyzer*>::iterator ts;
389 for (ts = tIter->begin(); (input == 0 || input->status() == Ok)
390 && ts != tIter->end(); ++ts) {
391 (*ts)->setIndexable(&idx);
392 input = (*ts)->connectInputStream(input);
393 if (input && input->position() != 0) {
394 cerr << "Analyzer " << (*ts)->name() << " has left the stream in a bad state." << endl;
395 }
396 }
397
398 // reread the header so we can use it for the endanalyzers
399 if (input && headersize > 0) {
400 headersize = input->read(header, headersize, headersize);
401 if (headersize <= 0) {
402 finished = true;
403 } else if (input->reset(0) != 0) {
404 cerr << "resetting is impossible!! pos: " << input->position()
405 << " status: " << input->status() << endl;
406 }
407 } else {
408 // indicate that we have no data in the stream
409 headersize = -1;
410 finished = true;
411 }
412 size_t es = 0;
413 size_t itersize = eIter->size();
414 while (!finished && es != itersize) {
415 StreamEndAnalyzer* sea = (*eIter)[es];
416 if (sea->checkHeader(header, headersize)) {
417 idx.setEndAnalyzer(sea);
418 char ar = sea->analyze(idx, input);
419 if (ar) {
420// FIXME: find either a NIE-compliant way to report errors or use some API for this
421// idx.addValue(errorfield, sea->name() + string(": ")
422// + sea->error());
423 if (!idx.config().indexMore()) {
424 removeIndexable(idx.depth());
425 return -1;
426 }
427 int64_t pos = input->reset(0);
428 if (pos != 0) { // could not reset
429 cerr << "could not reset stream of " << idx.path().c_str()
430 << " from pos " << input->position()
431 << " to 0 after reading with " << sea->name()
432 << ": " << sea->error().c_str() << endl;
433 finished = true;
434 } else {
435 // refresh the pointer to the start of the data
436 headersize = input->read(header, headersize, headersize);
437 if (input->reset(0) != 0) {
438 cerr << "resetting again is impossible!! pos: "
439 << input->position() << " status: "
440 << input->status() << endl;
441 }
442 if (headersize < 0) finished = true;
443 }
444 } else {
445 finished = true;
446 }
447 eIter = end.begin() + idx.depth();
448 }
449 if (!finished) {
450 finished = !conf.indexMore();
451 }
452 es++;
453 }
454 idx.setEndAnalyzer(0);
455 if (input) {
456 // make sure the entire stream is read if the size is not known
457 bool ready;
458 tIter = through.begin() + idx.depth();
459 uint32_t skipsize = 4096;
460 do {
461 // ask the analyzerconfiguration if we should continue
462 int64_t max = idx.config().maximalStreamReadLength(idx);
463 if (!idx.config().indexMore()
464 || (max != -1 && input->position() >= max)) {
465 // we are done
466 return 0;
467 }
468 ready = input->size() != -1;
469 vector<StreamThroughAnalyzer*>::iterator ts;
470 for (ts = tIter->begin(); ready && ts != tIter->end(); ++ts) {
471 ready = (*ts)->isReadyWithStream();
472 }
473 if (!ready) {
474 input->skip(skipsize);
475 if (skipsize < 131072) {
476 skipsize *= 4;
477 }
478 }
479 } while (!ready && input->status() == Ok);
480 if (input->status() == Error) {
481 fprintf(stderr, "Error: %s\n", input->error());
482 removeIndexable(idx.depth());
483 return -2;
484 }
485 }
486
487 // store the size of the stream
488 if (input && input->status() != Error && input->size() >= 0) {
489 // TODO remove cast
490 idx.addValue(sizefield, (uint32_t)input->size());
491 }
492
493 // remove references to the analysisresult before it goes out of scope
494 removeIndexable(idx.depth());
495 return 0;
496}
497/**
498 * Remove references to the analysisresult before it goes out of scope.
499 **/
500void
501StreamAnalyzerPrivate::removeIndexable(uint depth) {
502 vector<vector<StreamThroughAnalyzer*> >::iterator tIter;
503 vector<StreamThroughAnalyzer*>::iterator ts;
504 tIter = through.begin() + depth;
505 for (ts = tIter->begin(); ts != tIter->end(); ++ts) {
506 // remove references to the analysisresult before it goes out of scope
507 (*ts)->setIndexable(0);
508 }
509}
510AnalyzerConfiguration&
511StreamAnalyzer::configuration() const {
512 return p->conf;
513}