PageRenderTime 39ms CodeModel.GetById 13ms RepoModel.GetById 0ms app.codeStats 0ms

/strigi-0.7.7/libstreamanalyzer/lib/streamanalyzer.cpp

#
C++ | 513 lines | 447 code | 15 blank | 51 comment | 109 complexity | 15841476a48b36a0c27d5f3419158447 MD5 | raw file
Possible License(s): LGPL-2.0
  1. /* This file is part of Strigi Desktop Search
  2. *
  3. * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
  4. *
  5. * This library is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU Library General Public
  7. * License as published by the Free Software Foundation; either
  8. * version 2 of the License, or (at your option) any later version.
  9. *
  10. * This library is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Library General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Library General Public License
  16. * along with this library; see the file COPYING.LIB. If not, write to
  17. * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  18. * Boston, MA 02110-1301, USA.
  19. */
  20. #include <strigi/streamanalyzer.h>
  21. #include <strigi/fileinputstream.h>
  22. #include <strigi/streamendanalyzer.h>
  23. #include <strigi/streamthroughanalyzer.h>
  24. #include <strigi/streamlineanalyzer.h>
  25. #include <strigi/streameventanalyzer.h>
  26. #include <strigi/streamsaxanalyzer.h>
  27. #include "endanalyzers/bz2endanalyzer.h"
  28. #include "endanalyzers/lzmaendanalyzer.h"
  29. #include "eventanalyzers/mimeeventanalyzer.h"
  30. #include "endanalyzers/bmpendanalyzer.h"
  31. #include "endanalyzers/textendanalyzer.h"
  32. #include "endanalyzers/tarendanalyzer.h"
  33. #include "endanalyzers/arendanalyzer.h"
  34. #include "endanalyzers/zipexeendanalyzer.h"
  35. #include "endanalyzers/odfendanalyzer.h"
  36. #include "endanalyzers/oleendanalyzer.h"
  37. #include "endanalyzers/rpmendanalyzer.h"
  38. #include "endanalyzers/cpioendanalyzer.h"
  39. #include "endanalyzers/pdfendanalyzer.h"
  40. #include "endanalyzers/sdfendanalyzer.h"
  41. #include "endanalyzers/pngendanalyzer.h"
  42. #include "endanalyzers/gzipendanalyzer.h"
  43. #include "lineanalyzers/m3ustreamanalyzer.h"
  44. #include "endanalyzers/mailendanalyzer.h"
  45. #include "endanalyzers/mpegendanalyzer.h"
  46. #include "endanalyzers/helperendanalyzer.h"
  47. #include <strigi/dataeventinputstream.h>
  48. #include "endanalyzers/id3endanalyzer.h"
  49. #include "throughanalyzers/oggthroughanalyzer.h"
  50. #include "endanalyzers/flacendanalyzer.h"
  51. #include <strigi/analysisresult.h>
  52. #include <strigi/indexwriter.h>
  53. #include <strigi/analyzerconfiguration.h>
  54. #include <strigi/textutils.h>
  55. #include "analyzerloader.h"
  56. #include "eventthroughanalyzer.h"
  57. #include "saxanalyzers/htmlsaxanalyzer.h"
  58. #include <strigi/indexpluginloader.h>
  59. #include <sys/stat.h>
  60. #ifdef WIN32
  61. //#include "ifilterendanalyzer.h"
  62. #endif
  63. #include <iostream>
  64. #include <config.h>
  65. using namespace std;
  66. using namespace Strigi;
  67. namespace Strigi {
  68. class StreamAnalyzerPrivate {
  69. public:
  70. AnalyzerConfiguration& conf;
  71. vector<StreamThroughAnalyzerFactory*> throughfactories;
  72. vector<StreamEndAnalyzerFactory*> endfactories;
  73. vector<StreamSaxAnalyzerFactory*> saxfactories;
  74. vector<StreamLineAnalyzerFactory*> linefactories;
  75. vector<StreamEventAnalyzerFactory*> eventfactories;
  76. vector<vector<StreamEndAnalyzer*> > end;
  77. vector<vector<StreamThroughAnalyzer*> > through;
  78. IndexWriter* writer;
  79. AnalyzerLoader* moduleLoader;
  80. const RegisteredField* sizefield;
  81. const RegisteredField* errorfield;
  82. void initializeThroughFactories();
  83. void initializeEndFactories();
  84. void initializeSaxFactories();
  85. void initializeLineFactories();
  86. void initializeEventFactories();
  87. void addFactory(StreamThroughAnalyzerFactory* f);
  88. void addFactory(StreamEndAnalyzerFactory* f);
  89. void addFactory(StreamSaxAnalyzerFactory* f);
  90. void addFactory(StreamLineAnalyzerFactory* f);
  91. void addFactory(StreamEventAnalyzerFactory* f);
  92. void addThroughAnalyzers();
  93. void addEndAnalyzers();
  94. void removeIndexable(unsigned depth);
  95. signed char analyze(AnalysisResult& idx, StreamBase<char>* input);
  96. StreamAnalyzerPrivate(AnalyzerConfiguration& c);
  97. ~StreamAnalyzerPrivate();
  98. };
  99. } // namespace Strigi
  100. StreamAnalyzerPrivate::StreamAnalyzerPrivate(AnalyzerConfiguration& c)
  101. :conf(c), writer(0) {
  102. moduleLoader = new AnalyzerLoader();
  103. sizefield = c.fieldRegister().sizeField;
  104. errorfield = c.fieldRegister().parseErrorField;
  105. // load the plugins from the environment setting
  106. const char* strigipluginpath(getenv("STRIGI_PLUGIN_PATH"));
  107. if (strigipluginpath) {
  108. vector<string> strigipluginpaths = getdirs(strigipluginpath);
  109. for (uint i=0; i<strigipluginpaths.size(); ++i) {
  110. moduleLoader->loadPlugins(strigipluginpaths[i].c_str());
  111. }
  112. } else {
  113. moduleLoader->loadPlugins( LIBINSTALLDIR "/strigi");
  114. }
  115. initializeSaxFactories();
  116. initializeLineFactories();
  117. initializeEventFactories();
  118. initializeThroughFactories();
  119. initializeEndFactories();
  120. }
  121. StreamAnalyzerPrivate::~StreamAnalyzerPrivate() {
  122. // delete all factories
  123. vector<StreamThroughAnalyzerFactory*>::iterator ta;
  124. for (ta = throughfactories.begin(); ta != throughfactories.end(); ++ta) {
  125. delete *ta;
  126. }
  127. vector<StreamEndAnalyzerFactory*>::iterator ea;
  128. for (ea = endfactories.begin(); ea != endfactories.end(); ++ea) {
  129. delete *ea;
  130. }
  131. vector<StreamSaxAnalyzerFactory*>::iterator sa;
  132. for (sa = saxfactories.begin(); sa != saxfactories.end(); ++sa) {
  133. delete *sa;
  134. }
  135. vector<StreamLineAnalyzerFactory*>::iterator la;
  136. for (la = linefactories.begin(); la != linefactories.end(); ++la) {
  137. delete *la;
  138. }
  139. vector<StreamEventAnalyzerFactory*>::iterator da;
  140. for (da = eventfactories.begin(); da != eventfactories.end(); ++da) {
  141. delete *da;
  142. }
  143. // delete the through analyzers and end analyzers
  144. vector<vector<StreamThroughAnalyzer*> >::iterator tIter;
  145. for (tIter = through.begin(); tIter != through.end(); ++tIter) {
  146. vector<StreamThroughAnalyzer*>::iterator t;
  147. for (t = tIter->begin(); t != tIter->end(); ++t) {
  148. delete *t;
  149. }
  150. }
  151. vector<vector<StreamEndAnalyzer*> >::iterator eIter;
  152. for (eIter = end.begin(); eIter != end.end(); ++eIter) {
  153. vector<StreamEndAnalyzer*>::iterator e;
  154. for (e = eIter->begin(); e != eIter->end(); ++e) {
  155. delete *e;
  156. }
  157. }
  158. delete moduleLoader;
  159. if (writer) {
  160. writer->releaseWriterData(conf.fieldRegister());
  161. }
  162. }
  163. StreamAnalyzer::StreamAnalyzer(AnalyzerConfiguration& c)
  164. :p(new StreamAnalyzerPrivate(c)) {
  165. }
  166. StreamAnalyzer::~StreamAnalyzer() {
  167. delete p;
  168. }
  169. void
  170. StreamAnalyzer::setIndexWriter(IndexWriter& w) {
  171. if (p->writer != 0) {
  172. p->writer->releaseWriterData(p->conf.fieldRegister());
  173. }
  174. p->writer = &w;
  175. p->writer->initWriterData(p->conf.fieldRegister());
  176. }
  177. signed char
  178. StreamAnalyzer::indexFile(const char *filepath) {
  179. string path(filepath);
  180. return indexFile(path);
  181. }
  182. signed char
  183. StreamAnalyzer::indexFile(const string& filepath) {
  184. if (!checkUtf8(filepath.c_str())) {
  185. return 1;
  186. }
  187. if (p->writer == 0) {
  188. return 1;
  189. }
  190. struct stat s;
  191. stat(filepath.c_str(), &s);
  192. // ensure a decent buffer size
  193. string name;
  194. AnalysisResult analysisresult(filepath, s.st_mtime, *p->writer, *this);
  195. InputStream* file = FileInputStream::open(filepath.c_str());
  196. signed char r;
  197. if (file->status() == Ok) {
  198. r = analysisresult.index(file);
  199. } else {
  200. r = analysisresult.index(0);
  201. }
  202. delete file;
  203. return r;
  204. }
  205. void
  206. StreamAnalyzerPrivate::addFactory(StreamThroughAnalyzerFactory* f) {
  207. f->registerFields(conf.fieldRegister());
  208. if (conf.useFactory(f)) {
  209. throughfactories.push_back(f);
  210. } else {
  211. delete f;
  212. }
  213. }
  214. void
  215. StreamAnalyzerPrivate::initializeSaxFactories() {
  216. list<StreamSaxAnalyzerFactory*> plugins
  217. = moduleLoader->streamSaxAnalyzerFactories();
  218. list<StreamSaxAnalyzerFactory*>::iterator i;
  219. for (i = plugins.begin(); i != plugins.end(); ++i) {
  220. addFactory(*i);
  221. }
  222. addFactory(new HtmlSaxAnalyzerFactory());
  223. }
  224. void
  225. StreamAnalyzerPrivate::initializeLineFactories() {
  226. list<StreamLineAnalyzerFactory*> plugins
  227. = moduleLoader->streamLineAnalyzerFactories();
  228. list<StreamLineAnalyzerFactory*>::iterator i;
  229. for (i = plugins.begin(); i != plugins.end(); ++i) {
  230. addFactory(*i);
  231. }
  232. // addFactory(new OdfMimeTypeLineAnalyzerFactory());
  233. addFactory(new M3uLineAnalyzerFactory());
  234. }
  235. void
  236. StreamAnalyzerPrivate::initializeEventFactories() {
  237. list<StreamEventAnalyzerFactory*> plugins
  238. = moduleLoader->streamEventAnalyzerFactories();
  239. list<StreamEventAnalyzerFactory*>::iterator i;
  240. addFactory(new MimeEventAnalyzerFactory());
  241. for (i = plugins.begin(); i != plugins.end(); ++i) {
  242. addFactory(*i);
  243. }
  244. }
  245. void
  246. StreamAnalyzerPrivate::initializeThroughFactories() {
  247. list<StreamThroughAnalyzerFactory*> plugins
  248. = moduleLoader->streamThroughAnalyzerFactories();
  249. list<StreamThroughAnalyzerFactory*>::iterator i;
  250. for (i = plugins.begin(); i != plugins.end(); ++i) {
  251. addFactory(*i);
  252. }
  253. addFactory(new OggThroughAnalyzerFactory());
  254. addFactory(new EventThroughAnalyzerFactory(saxfactories, linefactories,
  255. eventfactories));
  256. }
  257. void
  258. StreamAnalyzerPrivate::addFactory(StreamEventAnalyzerFactory* f) {
  259. f->registerFields(conf.fieldRegister());
  260. if (conf.useFactory(f)) {
  261. eventfactories.push_back(f);
  262. } else {
  263. delete f;
  264. }
  265. }
  266. void
  267. StreamAnalyzerPrivate::addFactory(StreamLineAnalyzerFactory* f) {
  268. f->registerFields(conf.fieldRegister());
  269. if (conf.useFactory(f)) {
  270. linefactories.push_back(f);
  271. } else {
  272. delete f;
  273. }
  274. }
  275. void
  276. StreamAnalyzerPrivate::addFactory(StreamSaxAnalyzerFactory* f) {
  277. f->registerFields(conf.fieldRegister());
  278. if (conf.useFactory(f)) {
  279. saxfactories.push_back(f);
  280. } else {
  281. delete f;
  282. }
  283. }
  284. void
  285. StreamAnalyzerPrivate::addFactory(StreamEndAnalyzerFactory* f) {
  286. f->registerFields(conf.fieldRegister());
  287. if (conf.useFactory(f)) {
  288. endfactories.push_back(f);
  289. } else {
  290. delete f;
  291. }
  292. }
  293. /**
  294. * Instantiate factories for all analyzers.
  295. **/
  296. void
  297. StreamAnalyzerPrivate::initializeEndFactories() {
  298. list<StreamEndAnalyzerFactory*> plugins
  299. = moduleLoader->streamEndAnalyzerFactories();
  300. list<StreamEndAnalyzerFactory*>::iterator i;
  301. for (i = plugins.begin(); i != plugins.end(); ++i) {
  302. addFactory(*i);
  303. }
  304. addFactory(new Bz2EndAnalyzerFactory());
  305. addFactory(new GZipEndAnalyzerFactory());
  306. addFactory(new OleEndAnalyzerFactory());
  307. addFactory(new TarEndAnalyzerFactory());
  308. addFactory(new ArEndAnalyzerFactory());
  309. addFactory(new MailEndAnalyzerFactory());
  310. // addFactory(new MpegEndAnalyzerFactory()); //Xine fallback works so much better now
  311. addFactory(new OdfEndAnalyzerFactory());
  312. addFactory(new ZipEndAnalyzerFactory());
  313. addFactory(new ZipExeEndAnalyzerFactory());
  314. addFactory(new RpmEndAnalyzerFactory());
  315. addFactory(new CpioEndAnalyzerFactory());
  316. addFactory(new PngEndAnalyzerFactory());
  317. addFactory(new BmpEndAnalyzerFactory());
  318. addFactory(new FlacEndAnalyzerFactory());
  319. addFactory(new ID3EndAnalyzerFactory());
  320. addFactory(new PdfEndAnalyzerFactory());
  321. addFactory(new SdfEndAnalyzerFactory());
  322. addFactory(new LzmaEndAnalyzerFactory());
  323. #ifndef _MSC_VER
  324. addFactory(new HelperEndAnalyzerFactory());
  325. #endif
  326. addFactory(new TextEndAnalyzerFactory());
  327. }
  328. void
  329. StreamAnalyzerPrivate::addThroughAnalyzers() {
  330. through.resize(through.size()+1);
  331. vector<vector<StreamThroughAnalyzer*> >::reverse_iterator tIter;
  332. tIter = through.rbegin();
  333. vector<StreamThroughAnalyzerFactory*>::iterator ta;
  334. for (ta = throughfactories.begin(); ta != throughfactories.end(); ++ta) {
  335. tIter->push_back((*ta)->newInstance());
  336. }
  337. }
  338. void
  339. StreamAnalyzerPrivate::addEndAnalyzers() {
  340. end.resize(end.size()+1);
  341. vector<vector<StreamEndAnalyzer*> >::reverse_iterator eIter;
  342. eIter = end.rbegin();
  343. vector<StreamEndAnalyzerFactory*>::iterator ea;
  344. for (ea = endfactories.begin(); ea != endfactories.end(); ++ea) {
  345. eIter->push_back((*ea)->newInstance());
  346. }
  347. }
  348. signed char
  349. StreamAnalyzer::analyze(AnalysisResult& idx, StreamBase<char>* input) {
  350. return p->analyze(idx, input);
  351. }
  352. signed char
  353. StreamAnalyzerPrivate::analyze(AnalysisResult& idx, StreamBase<char>* input) {
  354. //cerr << "analyze " << idx.path().c_str() << endl;
  355. // retrieve or construct the through analyzers and end analyzers
  356. vector<vector<StreamThroughAnalyzer*> >::iterator tIter;
  357. vector<vector<StreamEndAnalyzer*> >::iterator eIter;
  358. while ((int)through.size() <= idx.depth()) {
  359. addThroughAnalyzers();
  360. addEndAnalyzers();
  361. }
  362. tIter = through.begin() + idx.depth();
  363. eIter = end.begin() + idx.depth();
  364. // read the headersize size before connecting the throughanalyzers
  365. // This ensures that the first read is at least this size, even if the
  366. // throughanalyzers read smaller chunks.
  367. bool finished = false;
  368. const char* header = 0;
  369. int32_t headersize = 1024;
  370. if (input) {
  371. headersize = input->read(header, headersize, headersize);
  372. input->reset(0);
  373. if (headersize < 0) finished = true;
  374. }
  375. // insert the through analyzers
  376. vector<StreamThroughAnalyzer*>::iterator ts;
  377. for (ts = tIter->begin(); (input == 0 || input->status() == Ok)
  378. && ts != tIter->end(); ++ts) {
  379. (*ts)->setIndexable(&idx);
  380. input = (*ts)->connectInputStream(input);
  381. if (input && input->position() != 0) {
  382. cerr << "Analyzer " << (*ts)->name() << " has left the stream in a bad state." << endl;
  383. }
  384. }
  385. // reread the header so we can use it for the endanalyzers
  386. if (input && headersize > 0) {
  387. headersize = input->read(header, headersize, headersize);
  388. if (headersize <= 0) {
  389. finished = true;
  390. } else if (input->reset(0) != 0) {
  391. cerr << "resetting is impossible!! pos: " << input->position()
  392. << " status: " << input->status() << endl;
  393. }
  394. } else {
  395. // indicate that we have no data in the stream
  396. headersize = -1;
  397. finished = true;
  398. }
  399. size_t es = 0;
  400. size_t itersize = eIter->size();
  401. while (!finished && es != itersize) {
  402. StreamEndAnalyzer* sea = (*eIter)[es];
  403. if (sea->checkHeader(header, headersize)) {
  404. idx.setEndAnalyzer(sea);
  405. char ar = sea->analyze(idx, input);
  406. if (ar) {
  407. // FIXME: find either a NIE-compliant way to report errors or use some API for this
  408. // idx.addValue(errorfield, sea->name() + string(": ")
  409. // + sea->error());
  410. if (!idx.config().indexMore()) {
  411. removeIndexable(idx.depth());
  412. return -1;
  413. }
  414. int64_t pos = input->reset(0);
  415. if (pos != 0) { // could not reset
  416. cerr << "could not reset stream of " << idx.path().c_str()
  417. << " from pos " << input->position()
  418. << " to 0 after reading with " << sea->name()
  419. << ": " << sea->error().c_str() << endl;
  420. finished = true;
  421. } else {
  422. // refresh the pointer to the start of the data
  423. headersize = input->read(header, headersize, headersize);
  424. if (input->reset(0) != 0) {
  425. cerr << "resetting again is impossible!! pos: "
  426. << input->position() << " status: "
  427. << input->status() << endl;
  428. }
  429. if (headersize < 0) finished = true;
  430. }
  431. } else {
  432. finished = true;
  433. }
  434. eIter = end.begin() + idx.depth();
  435. }
  436. if (!finished) {
  437. finished = !conf.indexMore();
  438. }
  439. es++;
  440. }
  441. idx.setEndAnalyzer(0);
  442. if (input) {
  443. // make sure the entire stream is read if the size is not known
  444. bool ready;
  445. tIter = through.begin() + idx.depth();
  446. uint32_t skipsize = 4096;
  447. do {
  448. // ask the analyzerconfiguration if we should continue
  449. int64_t max = idx.config().maximalStreamReadLength(idx);
  450. if (!idx.config().indexMore()
  451. || (max != -1 && input->position() >= max)) {
  452. // we are done
  453. return 0;
  454. }
  455. ready = input->size() != -1;
  456. vector<StreamThroughAnalyzer*>::iterator ts;
  457. for (ts = tIter->begin(); ready && ts != tIter->end(); ++ts) {
  458. ready = (*ts)->isReadyWithStream();
  459. }
  460. if (!ready) {
  461. input->skip(skipsize);
  462. if (skipsize < 131072) {
  463. skipsize *= 4;
  464. }
  465. }
  466. } while (!ready && input->status() == Ok);
  467. if (input->status() == Error) {
  468. fprintf(stderr, "Error: %s\n", input->error());
  469. removeIndexable(idx.depth());
  470. return -2;
  471. }
  472. }
  473. // store the size of the stream
  474. if (input && input->status() != Error && input->size() >= 0) {
  475. // TODO remove cast
  476. idx.addValue(sizefield, (uint32_t)input->size());
  477. }
  478. // remove references to the analysisresult before it goes out of scope
  479. removeIndexable(idx.depth());
  480. return 0;
  481. }
  482. /**
  483. * Remove references to the analysisresult before it goes out of scope.
  484. **/
  485. void
  486. StreamAnalyzerPrivate::removeIndexable(uint depth) {
  487. vector<vector<StreamThroughAnalyzer*> >::iterator tIter;
  488. vector<StreamThroughAnalyzer*>::iterator ts;
  489. tIter = through.begin() + depth;
  490. for (ts = tIter->begin(); ts != tIter->end(); ++ts) {
  491. // remove references to the analysisresult before it goes out of scope
  492. (*ts)->setIndexable(0);
  493. }
  494. }
  495. AnalyzerConfiguration&
  496. StreamAnalyzer::configuration() const {
  497. return p->conf;
  498. }