/strigi-0.7.7/libstreamanalyzer/lib/lineeventanalyzer.cpp
C++ | 322 lines | 269 code | 12 blank | 41 comment | 93 complexity | 57c708bd2749adadf135be9d7da9934a MD5 | raw file
Possible License(s): LGPL-2.0
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20
21#ifdef HAVE_CONFIG_H
22# include "config.h"
23#endif
24
25#include "lineeventanalyzer.h"
26#include <strigi/streamlineanalyzer.h>
27#include <strigi/analysisresult.h>
28#include <strigi/textutils.h>
29#include <cstring>
30#include <cassert>
31#include <cerrno>
32using namespace Strigi;
33using namespace std;
34
35#ifdef ICONV_SECOND_ARGUMENT_IS_CONST
36 #define ICONV_CONST const
37#else
38 #define ICONV_CONST
39#endif
40
41// end of line is \r, \n or \r\n
42#define CONVBUFSIZE 65536
43
44LineEventAnalyzer::LineEventAnalyzer(vector<StreamLineAnalyzer*>& l)
45 :line(l), converter((iconv_t)-1), numAnalyzers((uint)l.size()),
46 convBuffer(new char[CONVBUFSIZE]), ready(true), initialized(false) {
47 started = new bool[l.size()];
48 for (uint i=0; i<numAnalyzers; ++i) {
49 started[i] = false;
50 }
51}
52LineEventAnalyzer::~LineEventAnalyzer() {
53 vector<StreamLineAnalyzer*>::iterator l;
54 for (l = line.begin(); l != line.end(); ++l) {
55 delete *l;
56 }
57 if (converter != (iconv_t)-1) {
58 iconv_close(converter);
59 }
60 delete [] convBuffer;
61 delete [] started;
62}
63void
64LineEventAnalyzer::startAnalysis(AnalysisResult* r) {
65 result = r;
66 ready = numAnalyzers == 0;
67 initialized = false;
68 sawCarriageReturn = false;
69 missingBytes = 0;
70 iMissingBytes = 0;
71 lineBuffer.assign("");
72 byteBuffer.assign("");
73 ibyteBuffer.assign("");
74 initEncoding(r->encoding());
75 for (uint i=0; i < numAnalyzers; ++i) {
76 started[i] = false;
77 }
78}
79void
80LineEventAnalyzer::initEncoding(std::string enc) {
81 if (enc.size() == 0 || enc == "UTF-8") {
82 encoding.assign("UTF-8");
83 if (converter != (iconv_t)-1) {
84 iconv_close(converter);
85 converter = (iconv_t)-1;
86 }
87 } else if (converter != (iconv_t)-1 && encoding == enc) {
88 // reset the converter
89 iconv(converter, 0, 0, 0, 0);
90 } else {
91 encoding = enc;
92 if (converter != (iconv_t)-1) {
93 iconv_close(converter);
94 }
95 converter = iconv_open(encoding.c_str(), "UTF-8");
96 }
97}
98void
99LineEventAnalyzer::endAnalysis(bool complete) {
100 // flush the last line if it did not end with a newline character
101 if(complete && lineBuffer.size() > 0) {
102 emitData(lineBuffer.c_str(), (uint32_t)lineBuffer.size());
103 lineBuffer.assign("");
104 }
105
106 for (uint i=0; i < numAnalyzers; ++i) {
107 if (started[i]) {
108 line[i]->endAnalysis(complete);
109 }
110 }
111}
112void
113LineEventAnalyzer::handleData(const char* data, uint32_t length) {
114 if (ready) return;
115 if (converter == (iconv_t)-1) {
116 handleUtf8Data(data, length);
117 return;
118 }
119 size_t r;
120 ICONV_CONST char *inbuf;
121 char* outbuf;
122 size_t inbytesleft;
123 size_t outbytesleft;
124 if (iMissingBytes) {
125 if (iMissingBytes > length) {
126 ibyteBuffer.append(data, length);
127 iMissingBytes = (unsigned char)(iMissingBytes - length);
128 return;
129 } else {
130 ibyteBuffer.append(data, iMissingBytes);
131 data += iMissingBytes;
132 length -= iMissingBytes;
133 inbuf = (char*)ibyteBuffer.c_str();
134 inbytesleft = ibyteBuffer.length();
135 outbytesleft = CONVBUFSIZE;
136 outbuf = convBuffer;
137 r = iconv(converter, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
138 if (r == (size_t)-1) { // must be an error
139 ready = true;
140 return;
141 }
142 handleUtf8Data(convBuffer, (uint32_t)(CONVBUFSIZE-outbytesleft));
143 }
144 }
145 do {
146 inbuf = (char*)data;
147 inbytesleft = length;
148 outbuf = convBuffer;
149 outbytesleft = CONVBUFSIZE;
150 r = iconv(converter, &inbuf, &inbytesleft, &outbuf,
151 &outbytesleft);
152 int32_t left = (uint32_t)(CONVBUFSIZE-outbytesleft);
153 if (r == (size_t)-1) {
154 uint32_t read;
155 switch (errno) {
156 case EINVAL: // last character is incomplete
157 handleUtf8Data(convBuffer, left);
158 ibyteBuffer.assign(inbuf, inbytesleft);
159 iMissingBytes = (unsigned char)(length - (inbuf-data));
160 return;
161 case E2BIG: // output buffer is full
162 handleUtf8Data(convBuffer, left);
163 read = (uint32_t)(inbuf-data);
164 data += read;
165 length -= read;
166 break;
167 case EILSEQ: //invalid multibyte sequence
168 default:
169 ready = true;
170 return;
171 }
172 } else { //input sequence was completely converted
173 handleUtf8Data(convBuffer, left);
174 return;
175 }
176 } while (true);
177}
178void
179LineEventAnalyzer::handleUtf8Data(const char* data, uint32_t length) {
180 assert(!(sawCarriageReturn && missingBytes > 0));
181
182 // if the last block ended with '\r', the next '\n' can be skipped
183 if (sawCarriageReturn) {
184 if (length > 0 && data[0] == '\n') {
185 data++;
186 length--;
187 }
188 sawCarriageReturn = false;
189 }
190
191 // if we have incomplete characters left over from the last call,
192 // complete them and validate them
193 if (missingBytes > 0) {
194 if (length > (unsigned char)missingBytes) {
195 // we have enough data to finish the character
196 byteBuffer.append(data, missingBytes);
197 if (!checkUtf8(byteBuffer)) {
198 // invalid utf8, nothing more to see here
199 ready = true;
200 return;
201 }
202 lineBuffer.append(byteBuffer);
203 data += missingBytes;
204 length -= missingBytes;
205 // clean up the byte buffer
206 byteBuffer.assign("");
207 missingBytes = 0;
208 } else {
209 // not enough data, store it and wait for the next round
210 byteBuffer.append(data, length);
211 missingBytes = (unsigned char)(missingBytes - length);
212 return;
213 }
214 }
215
216 // validate the utf8
217 const char* p = checkUtf8(data, length, missingBytes);
218 if (p) {
219 // the data ends in an incomplete character
220 if (missingBytes > 0) {
221 string::size_type charStartSize = length - (p - data);
222 // store the start of the character
223 byteBuffer.assign(p, charStartSize);
224 // do not consider this incomplete character in the rest of this
225 // function
226 length = (uint32_t)(length - charStartSize);
227 } else {
228 // not valid
229 ready = true;
230 return;
231 }
232 }
233
234 // find the first \n
235 p = data;
236 const char* end = data + length;
237 while (p < end) {
238 if (*p == '\n' || *p == '\r') break;
239 p++;
240 }
241 if (p == end) { // no '\n' was found, we put this in the buffer
242 lineBuffer.append(data, length);
243 return;
244 }
245 const char* lineend = p;
246 if (*p == '\r') {
247 // if \r is followed by \n, we can ignore \n
248 if (p + 1 != end) {
249 if (p[1] == '\n') {
250 p++;
251 }
252 } else {
253 sawCarriageReturn = true;
254 }
255 }
256
257 // handle the first line from this call
258 if (lineBuffer.size()) {
259 lineBuffer.append(data, lineend-data);
260 emitData(lineBuffer.c_str(), (uint32_t)lineBuffer.size());
261 lineBuffer.assign("");
262 } else {
263 emitData(data, (uint32_t)(p-data));
264 }
265 if (ready) return;
266
267 // handle the other lines
268 while (++p != end) {
269 data = p;
270 do {
271 if (*p == '\n' || *p == '\r') break;
272 } while (++p != end);
273 if (p == end) {
274 lineBuffer.assign(data, end-data);
275 break;
276 }
277 lineend = p;
278 if (*p == '\r') {
279 // if \r is followed by \n, we can ignore \n
280 if (p + 1 != end) {
281 if (p[1] == '\n') {
282 p++;
283 }
284 } else {
285 sawCarriageReturn = true;
286 }
287 }
288 emitData(data, (uint32_t)(lineend-data));
289 if (ready) return;
290 }
291}
292void
293LineEventAnalyzer::emitData(const char*data, uint32_t length) {
294// fprintf(stderr, "%.*s\n", length, data);
295 bool more = false;
296 vector<StreamLineAnalyzer*>::iterator i;
297 if (!initialized) {
298 for (uint j = 0; j < numAnalyzers; ++j) {
299 StreamLineAnalyzer* s = line[j];
300 s->startAnalysis(result);
301 started[j] = true;
302 more = more || !s->isReadyWithStream();
303 }
304 initialized = true;
305 ready = !more;
306 if (ready) {
307 return;
308 }
309 more = false;
310 }
311 for (i = line.begin(); i != line.end(); ++i) {
312 if (!(*i)->isReadyWithStream()) {
313 (*i)->handleLine(data, length);
314 }
315 more = more || !(*i)->isReadyWithStream();
316 }
317 ready = !more;
318}
319bool
320LineEventAnalyzer::isReadyWithStream() {
321 return ready;
322}