PageRenderTime 30ms CodeModel.GetById 9ms app.highlight 17ms RepoModel.GetById 1ms app.codeStats 0ms

/strigi-0.7.7/libstreamanalyzer/lib/lineeventanalyzer.cpp

#
C++ | 322 lines | 269 code | 12 blank | 41 comment | 93 complexity | 57c708bd2749adadf135be9d7da9934a MD5 | raw file
Possible License(s): LGPL-2.0
  1/* This file is part of Strigi Desktop Search
  2 *
  3 * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info>
  4 *
  5 * This library is free software; you can redistribute it and/or
  6 * modify it under the terms of the GNU Library General Public
  7 * License as published by the Free Software Foundation; either
  8 * version 2 of the License, or (at your option) any later version.
  9 *
 10 * This library is distributed in the hope that it will be useful,
 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 13 * Library General Public License for more details.
 14 *
 15 * You should have received a copy of the GNU Library General Public License
 16 * along with this library; see the file COPYING.LIB.  If not, write to
 17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 18 * Boston, MA 02110-1301, USA.
 19 */
 20
 21#ifdef HAVE_CONFIG_H
 22# include "config.h"
 23#endif
 24
 25#include "lineeventanalyzer.h"
 26#include <strigi/streamlineanalyzer.h>
 27#include <strigi/analysisresult.h>
 28#include <strigi/textutils.h>
 29#include <cstring>
 30#include <cassert>
 31#include <cerrno>
 32using namespace Strigi;
 33using namespace std;
 34
 35#ifdef ICONV_SECOND_ARGUMENT_IS_CONST
 36     #define ICONV_CONST const
 37#else
 38     #define ICONV_CONST
 39#endif
 40
 41// end of line is \r, \n or \r\n
 42#define CONVBUFSIZE 65536
 43
 44LineEventAnalyzer::LineEventAnalyzer(vector<StreamLineAnalyzer*>& l)
 45        :line(l), converter((iconv_t)-1), numAnalyzers((uint)l.size()),
 46         convBuffer(new char[CONVBUFSIZE]), ready(true), initialized(false) {
 47    started = new bool[l.size()];
 48    for (uint i=0; i<numAnalyzers; ++i) {
 49        started[i] = false;
 50    }
 51}
 52LineEventAnalyzer::~LineEventAnalyzer() {
 53    vector<StreamLineAnalyzer*>::iterator l;
 54    for (l = line.begin(); l != line.end(); ++l) {
 55        delete *l;
 56    }
 57    if (converter != (iconv_t)-1) {
 58        iconv_close(converter);
 59    }
 60    delete [] convBuffer;
 61    delete [] started;
 62}
 63void
 64LineEventAnalyzer::startAnalysis(AnalysisResult* r) {
 65    result = r;
 66    ready = numAnalyzers == 0;
 67    initialized = false;
 68    sawCarriageReturn = false;
 69    missingBytes = 0;
 70    iMissingBytes = 0;
 71    lineBuffer.assign("");
 72    byteBuffer.assign("");
 73    ibyteBuffer.assign("");
 74    initEncoding(r->encoding());
 75    for (uint i=0; i < numAnalyzers; ++i) {
 76        started[i] = false;
 77    }
 78}
 79void
 80LineEventAnalyzer::initEncoding(std::string enc) {
 81    if (enc.size() == 0 || enc == "UTF-8") {
 82        encoding.assign("UTF-8");
 83        if (converter != (iconv_t)-1) {
 84            iconv_close(converter);
 85            converter = (iconv_t)-1;
 86        }
 87    } else if (converter != (iconv_t)-1 && encoding == enc) {
 88        // reset the converter
 89        iconv(converter, 0, 0, 0, 0);
 90    } else {
 91        encoding = enc;
 92        if (converter != (iconv_t)-1) {
 93            iconv_close(converter);
 94        }
 95        converter = iconv_open(encoding.c_str(), "UTF-8");
 96    }
 97}
 98void
 99LineEventAnalyzer::endAnalysis(bool complete) {
100    // flush the last line if it did not end with a newline character
101    if(complete && lineBuffer.size() > 0) {
102        emitData(lineBuffer.c_str(), (uint32_t)lineBuffer.size());
103        lineBuffer.assign("");
104    }
105
106    for (uint i=0; i < numAnalyzers; ++i) {
107        if (started[i]) {
108            line[i]->endAnalysis(complete);
109        }
110    }
111}
112void
113LineEventAnalyzer::handleData(const char* data, uint32_t length) {
114    if (ready) return;
115    if (converter == (iconv_t)-1) {
116        handleUtf8Data(data, length);
117        return;
118    }
119    size_t r;
120    ICONV_CONST char *inbuf;
121    char* outbuf;
122    size_t inbytesleft;
123    size_t outbytesleft;
124    if (iMissingBytes) {
125        if (iMissingBytes > length) {
126            ibyteBuffer.append(data, length);
127            iMissingBytes = (unsigned char)(iMissingBytes - length);
128            return;
129        } else {
130            ibyteBuffer.append(data, iMissingBytes);
131            data += iMissingBytes;
132            length -= iMissingBytes;
133            inbuf = (char*)ibyteBuffer.c_str();
134            inbytesleft = ibyteBuffer.length();
135            outbytesleft = CONVBUFSIZE;
136            outbuf = convBuffer;
137            r = iconv(converter, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
138            if (r == (size_t)-1) { // must be an error
139                ready = true;
140                return;
141            }
142            handleUtf8Data(convBuffer, (uint32_t)(CONVBUFSIZE-outbytesleft));
143        }
144    }
145    do {
146        inbuf = (char*)data;
147        inbytesleft = length;
148        outbuf = convBuffer;
149        outbytesleft = CONVBUFSIZE;
150        r = iconv(converter, &inbuf, &inbytesleft, &outbuf,
151            &outbytesleft);
152        int32_t left = (uint32_t)(CONVBUFSIZE-outbytesleft);
153        if (r == (size_t)-1) {
154            uint32_t read;
155            switch (errno) {
156            case EINVAL: // last character is incomplete
157                handleUtf8Data(convBuffer, left);
158                ibyteBuffer.assign(inbuf, inbytesleft);
159                iMissingBytes = (unsigned char)(length - (inbuf-data));
160                return;
161            case E2BIG: // output buffer is full
162                handleUtf8Data(convBuffer, left);
163                read = (uint32_t)(inbuf-data);
164                data += read;
165                length -= read;
166                break;
167            case EILSEQ: //invalid multibyte sequence
168            default:
169                ready = true;
170                return;
171            }
172        } else { //input sequence was completely converted
173            handleUtf8Data(convBuffer, left);
174            return;
175        }
176    } while (true);
177}
178void
179LineEventAnalyzer::handleUtf8Data(const char* data, uint32_t length) {
180    assert(!(sawCarriageReturn && missingBytes > 0));
181
182    // if the last block ended with '\r', the next '\n' can be skipped
183    if (sawCarriageReturn) {
184        if (length > 0 && data[0] == '\n') {
185            data++;
186            length--;
187        }
188        sawCarriageReturn = false;
189    }
190
191    // if we have incomplete characters left over from the last call,
192    // complete them and validate them
193    if (missingBytes > 0) {
194        if (length > (unsigned char)missingBytes) {
195            // we have enough data to finish the character
196            byteBuffer.append(data, missingBytes);
197            if (!checkUtf8(byteBuffer)) {
198                // invalid utf8, nothing more to see here
199                ready = true;
200                return;
201            }
202            lineBuffer.append(byteBuffer);
203            data += missingBytes;
204            length -= missingBytes;
205            // clean up the byte buffer
206            byteBuffer.assign("");
207            missingBytes = 0;
208        } else {
209            // not enough data, store it and wait for the next round
210            byteBuffer.append(data, length);
211            missingBytes = (unsigned char)(missingBytes - length);
212            return;
213        }
214    }
215
216    // validate the utf8
217    const char* p = checkUtf8(data, length, missingBytes);
218    if (p) {
219        // the data ends in an incomplete character
220        if (missingBytes > 0) {
221            string::size_type charStartSize = length - (p - data);
222            // store the start of the character
223            byteBuffer.assign(p, charStartSize);
224            // do not consider this incomplete character in the rest of this
225            // function
226            length = (uint32_t)(length - charStartSize);
227        } else {
228            // not valid
229            ready = true;
230            return;
231        }
232    }
233
234    // find the first \n
235    p = data;
236    const char* end = data + length;
237    while (p < end) {
238        if (*p == '\n' || *p == '\r') break;
239        p++; 
240    }
241    if (p == end) { // no '\n' was found, we put this in the buffer
242        lineBuffer.append(data, length);
243        return;
244    }
245    const char* lineend = p;
246    if (*p == '\r') {
247        // if \r is followed by \n, we can ignore \n
248        if (p + 1 != end) {
249            if (p[1] == '\n') {
250                p++;
251            }
252        } else {
253            sawCarriageReturn = true;
254        }
255    }
256
257    // handle the first line from this call
258    if (lineBuffer.size()) {
259        lineBuffer.append(data, lineend-data);
260        emitData(lineBuffer.c_str(), (uint32_t)lineBuffer.size());
261        lineBuffer.assign("");
262    } else {
263        emitData(data, (uint32_t)(p-data));
264    }
265    if (ready) return;
266
267    // handle the other lines
268    while (++p != end) {
269        data = p;
270        do {
271            if (*p == '\n' || *p == '\r') break; 
272        } while (++p != end);
273        if (p == end) {
274            lineBuffer.assign(data, end-data);
275            break;
276        }
277        lineend = p;
278        if (*p == '\r') {
279            // if \r is followed by \n, we can ignore \n
280            if (p + 1 != end) {
281                if (p[1] == '\n') {
282                    p++;
283                }
284            } else {
285                sawCarriageReturn = true;
286            }
287        }
288        emitData(data, (uint32_t)(lineend-data));
289        if (ready) return;
290    }
291}
292void
293LineEventAnalyzer::emitData(const char*data, uint32_t length) {
294//    fprintf(stderr, "%.*s\n", length, data);
295    bool more = false;
296    vector<StreamLineAnalyzer*>::iterator i;
297    if (!initialized) {
298        for (uint j = 0; j < numAnalyzers; ++j) {
299            StreamLineAnalyzer* s = line[j];
300            s->startAnalysis(result);
301            started[j] = true;
302            more = more || !s->isReadyWithStream();
303        }
304        initialized = true;
305        ready = !more;
306        if (ready) {
307            return;
308        }
309        more = false;
310    }
311    for (i = line.begin(); i != line.end(); ++i) {
312        if (!(*i)->isReadyWithStream()) {
313            (*i)->handleLine(data, length);
314        }
315        more = more || !(*i)->isReadyWithStream();
316    }
317    ready = !more;
318}
319bool
320LineEventAnalyzer::isReadyWithStream() {
321    return ready;
322}