PageRenderTime 39ms CodeModel.GetById 12ms RepoModel.GetById 1ms app.codeStats 0ms

/strigi-0.7.7/libstreamanalyzer/lib/lineeventanalyzer.cpp

#
C++ | 322 lines | 269 code | 12 blank | 41 comment | 93 complexity | 57c708bd2749adadf135be9d7da9934a MD5 | raw file
Possible License(s): LGPL-2.0
  1. /* This file is part of Strigi Desktop Search
  2. *
  3. * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info>
  4. *
  5. * This library is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU Library General Public
  7. * License as published by the Free Software Foundation; either
  8. * version 2 of the License, or (at your option) any later version.
  9. *
  10. * This library is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Library General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Library General Public License
  16. * along with this library; see the file COPYING.LIB. If not, write to
  17. * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  18. * Boston, MA 02110-1301, USA.
  19. */
  20. #ifdef HAVE_CONFIG_H
  21. # include "config.h"
  22. #endif
  23. #include "lineeventanalyzer.h"
  24. #include <strigi/streamlineanalyzer.h>
  25. #include <strigi/analysisresult.h>
  26. #include <strigi/textutils.h>
  27. #include <cstring>
  28. #include <cassert>
  29. #include <cerrno>
  30. using namespace Strigi;
  31. using namespace std;
  32. #ifdef ICONV_SECOND_ARGUMENT_IS_CONST
  33. #define ICONV_CONST const
  34. #else
  35. #define ICONV_CONST
  36. #endif
  37. // end of line is \r, \n or \r\n
  38. #define CONVBUFSIZE 65536
  39. LineEventAnalyzer::LineEventAnalyzer(vector<StreamLineAnalyzer*>& l)
  40. :line(l), converter((iconv_t)-1), numAnalyzers((uint)l.size()),
  41. convBuffer(new char[CONVBUFSIZE]), ready(true), initialized(false) {
  42. started = new bool[l.size()];
  43. for (uint i=0; i<numAnalyzers; ++i) {
  44. started[i] = false;
  45. }
  46. }
  47. LineEventAnalyzer::~LineEventAnalyzer() {
  48. vector<StreamLineAnalyzer*>::iterator l;
  49. for (l = line.begin(); l != line.end(); ++l) {
  50. delete *l;
  51. }
  52. if (converter != (iconv_t)-1) {
  53. iconv_close(converter);
  54. }
  55. delete [] convBuffer;
  56. delete [] started;
  57. }
  58. void
  59. LineEventAnalyzer::startAnalysis(AnalysisResult* r) {
  60. result = r;
  61. ready = numAnalyzers == 0;
  62. initialized = false;
  63. sawCarriageReturn = false;
  64. missingBytes = 0;
  65. iMissingBytes = 0;
  66. lineBuffer.assign("");
  67. byteBuffer.assign("");
  68. ibyteBuffer.assign("");
  69. initEncoding(r->encoding());
  70. for (uint i=0; i < numAnalyzers; ++i) {
  71. started[i] = false;
  72. }
  73. }
  74. void
  75. LineEventAnalyzer::initEncoding(std::string enc) {
  76. if (enc.size() == 0 || enc == "UTF-8") {
  77. encoding.assign("UTF-8");
  78. if (converter != (iconv_t)-1) {
  79. iconv_close(converter);
  80. converter = (iconv_t)-1;
  81. }
  82. } else if (converter != (iconv_t)-1 && encoding == enc) {
  83. // reset the converter
  84. iconv(converter, 0, 0, 0, 0);
  85. } else {
  86. encoding = enc;
  87. if (converter != (iconv_t)-1) {
  88. iconv_close(converter);
  89. }
  90. converter = iconv_open(encoding.c_str(), "UTF-8");
  91. }
  92. }
  93. void
  94. LineEventAnalyzer::endAnalysis(bool complete) {
  95. // flush the last line if it did not end with a newline character
  96. if(complete && lineBuffer.size() > 0) {
  97. emitData(lineBuffer.c_str(), (uint32_t)lineBuffer.size());
  98. lineBuffer.assign("");
  99. }
  100. for (uint i=0; i < numAnalyzers; ++i) {
  101. if (started[i]) {
  102. line[i]->endAnalysis(complete);
  103. }
  104. }
  105. }
  106. void
  107. LineEventAnalyzer::handleData(const char* data, uint32_t length) {
  108. if (ready) return;
  109. if (converter == (iconv_t)-1) {
  110. handleUtf8Data(data, length);
  111. return;
  112. }
  113. size_t r;
  114. ICONV_CONST char *inbuf;
  115. char* outbuf;
  116. size_t inbytesleft;
  117. size_t outbytesleft;
  118. if (iMissingBytes) {
  119. if (iMissingBytes > length) {
  120. ibyteBuffer.append(data, length);
  121. iMissingBytes = (unsigned char)(iMissingBytes - length);
  122. return;
  123. } else {
  124. ibyteBuffer.append(data, iMissingBytes);
  125. data += iMissingBytes;
  126. length -= iMissingBytes;
  127. inbuf = (char*)ibyteBuffer.c_str();
  128. inbytesleft = ibyteBuffer.length();
  129. outbytesleft = CONVBUFSIZE;
  130. outbuf = convBuffer;
  131. r = iconv(converter, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
  132. if (r == (size_t)-1) { // must be an error
  133. ready = true;
  134. return;
  135. }
  136. handleUtf8Data(convBuffer, (uint32_t)(CONVBUFSIZE-outbytesleft));
  137. }
  138. }
  139. do {
  140. inbuf = (char*)data;
  141. inbytesleft = length;
  142. outbuf = convBuffer;
  143. outbytesleft = CONVBUFSIZE;
  144. r = iconv(converter, &inbuf, &inbytesleft, &outbuf,
  145. &outbytesleft);
  146. int32_t left = (uint32_t)(CONVBUFSIZE-outbytesleft);
  147. if (r == (size_t)-1) {
  148. uint32_t read;
  149. switch (errno) {
  150. case EINVAL: // last character is incomplete
  151. handleUtf8Data(convBuffer, left);
  152. ibyteBuffer.assign(inbuf, inbytesleft);
  153. iMissingBytes = (unsigned char)(length - (inbuf-data));
  154. return;
  155. case E2BIG: // output buffer is full
  156. handleUtf8Data(convBuffer, left);
  157. read = (uint32_t)(inbuf-data);
  158. data += read;
  159. length -= read;
  160. break;
  161. case EILSEQ: //invalid multibyte sequence
  162. default:
  163. ready = true;
  164. return;
  165. }
  166. } else { //input sequence was completely converted
  167. handleUtf8Data(convBuffer, left);
  168. return;
  169. }
  170. } while (true);
  171. }
  172. void
  173. LineEventAnalyzer::handleUtf8Data(const char* data, uint32_t length) {
  174. assert(!(sawCarriageReturn && missingBytes > 0));
  175. // if the last block ended with '\r', the next '\n' can be skipped
  176. if (sawCarriageReturn) {
  177. if (length > 0 && data[0] == '\n') {
  178. data++;
  179. length--;
  180. }
  181. sawCarriageReturn = false;
  182. }
  183. // if we have incomplete characters left over from the last call,
  184. // complete them and validate them
  185. if (missingBytes > 0) {
  186. if (length > (unsigned char)missingBytes) {
  187. // we have enough data to finish the character
  188. byteBuffer.append(data, missingBytes);
  189. if (!checkUtf8(byteBuffer)) {
  190. // invalid utf8, nothing more to see here
  191. ready = true;
  192. return;
  193. }
  194. lineBuffer.append(byteBuffer);
  195. data += missingBytes;
  196. length -= missingBytes;
  197. // clean up the byte buffer
  198. byteBuffer.assign("");
  199. missingBytes = 0;
  200. } else {
  201. // not enough data, store it and wait for the next round
  202. byteBuffer.append(data, length);
  203. missingBytes = (unsigned char)(missingBytes - length);
  204. return;
  205. }
  206. }
  207. // validate the utf8
  208. const char* p = checkUtf8(data, length, missingBytes);
  209. if (p) {
  210. // the data ends in an incomplete character
  211. if (missingBytes > 0) {
  212. string::size_type charStartSize = length - (p - data);
  213. // store the start of the character
  214. byteBuffer.assign(p, charStartSize);
  215. // do not consider this incomplete character in the rest of this
  216. // function
  217. length = (uint32_t)(length - charStartSize);
  218. } else {
  219. // not valid
  220. ready = true;
  221. return;
  222. }
  223. }
  224. // find the first \n
  225. p = data;
  226. const char* end = data + length;
  227. while (p < end) {
  228. if (*p == '\n' || *p == '\r') break;
  229. p++;
  230. }
  231. if (p == end) { // no '\n' was found, we put this in the buffer
  232. lineBuffer.append(data, length);
  233. return;
  234. }
  235. const char* lineend = p;
  236. if (*p == '\r') {
  237. // if \r is followed by \n, we can ignore \n
  238. if (p + 1 != end) {
  239. if (p[1] == '\n') {
  240. p++;
  241. }
  242. } else {
  243. sawCarriageReturn = true;
  244. }
  245. }
  246. // handle the first line from this call
  247. if (lineBuffer.size()) {
  248. lineBuffer.append(data, lineend-data);
  249. emitData(lineBuffer.c_str(), (uint32_t)lineBuffer.size());
  250. lineBuffer.assign("");
  251. } else {
  252. emitData(data, (uint32_t)(p-data));
  253. }
  254. if (ready) return;
  255. // handle the other lines
  256. while (++p != end) {
  257. data = p;
  258. do {
  259. if (*p == '\n' || *p == '\r') break;
  260. } while (++p != end);
  261. if (p == end) {
  262. lineBuffer.assign(data, end-data);
  263. break;
  264. }
  265. lineend = p;
  266. if (*p == '\r') {
  267. // if \r is followed by \n, we can ignore \n
  268. if (p + 1 != end) {
  269. if (p[1] == '\n') {
  270. p++;
  271. }
  272. } else {
  273. sawCarriageReturn = true;
  274. }
  275. }
  276. emitData(data, (uint32_t)(lineend-data));
  277. if (ready) return;
  278. }
  279. }
  280. void
  281. LineEventAnalyzer::emitData(const char*data, uint32_t length) {
  282. // fprintf(stderr, "%.*s\n", length, data);
  283. bool more = false;
  284. vector<StreamLineAnalyzer*>::iterator i;
  285. if (!initialized) {
  286. for (uint j = 0; j < numAnalyzers; ++j) {
  287. StreamLineAnalyzer* s = line[j];
  288. s->startAnalysis(result);
  289. started[j] = true;
  290. more = more || !s->isReadyWithStream();
  291. }
  292. initialized = true;
  293. ready = !more;
  294. if (ready) {
  295. return;
  296. }
  297. more = false;
  298. }
  299. for (i = line.begin(); i != line.end(); ++i) {
  300. if (!(*i)->isReadyWithStream()) {
  301. (*i)->handleLine(data, length);
  302. }
  303. more = more || !(*i)->isReadyWithStream();
  304. }
  305. ready = !more;
  306. }
  307. bool
  308. LineEventAnalyzer::isReadyWithStream() {
  309. return ready;
  310. }