PageRenderTime 44ms CodeModel.GetById 20ms app.highlight 21ms RepoModel.GetById 1ms app.codeStats 0ms

/strigi-0.7.7/strigiutils/bin/xmlindexer/peranalyzerxml.cpp

#
C++ | 316 lines | 263 code | 21 blank | 32 comment | 91 complexity | b1b007237cbbf3943e1f6c9e64efad12 MD5 | raw file
Possible License(s): LGPL-2.0
  1/* This file is part of Strigi Desktop Search
  2 *
  3 * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
  4 *
  5 * This library is free software; you can redistribute it and/or
  6 * modify it under the terms of the GNU Library General Public
  7 * License as published by the Free Software Foundation; either
  8 * version 2 of the License, or (at your option) any later version.
  9 *
 10 * This library is distributed in the hope that it will be useful,
 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 13 * Library General Public License for more details.
 14 *
 15 * You should have received a copy of the GNU Library General Public License
 16 * along with this library; see the file COPYING.LIB.  If not, write to
 17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 18 * Boston, MA 02110-1301, USA.
 19 */
 20#ifdef HAVE_CONFIG_H
 21 #include <config.h>
 22#endif
 23
 24#include <strigi/strigiconfig.h>
 25//#include "compat.h"
 26#include <strigi/fileinputstream.h>
 27#include <strigi/bz2inputstream.h>
 28#include <strigi/diranalyzer.h>
 29#include <strigi/analyzerconfiguration.h>
 30#include <strigi/streamendanalyzer.h>
 31#include <strigi/streamthroughanalyzer.h>
 32#include <strigi/streamlineanalyzer.h>
 33#include <strigi/streamsaxanalyzer.h>
 34#include <strigi/streameventanalyzer.h>
 35#include "xmlindexwriter.h"
 36
 37#include <cstdio>
 38#include <cstring>
 39#include <cerrno>
 40#include <algorithm>
 41#ifdef HAVE_UNISTD_H
 42 #include <unistd.h>
 43#endif
 44#include <stdlib.h>
 45#ifdef HAVE_DIRECT_H
 46 #include <direct.h>
 47#endif
 48#include <iostream>
 49#include <sstream>
 50#include <fstream>
 51#include <set>
 52using namespace Strigi;
 53using namespace std;
 54
 55class SelectedAnalyzerConfiguration : public Strigi::AnalyzerConfiguration {
 56public:
 57    const set<string> requiredAnalyzers;
 58    set<string> obligatoryAnalyzers;
 59    mutable set<string> usedAnalyzers;
 60    mutable set<string> availableAnalyzers;
 61
 62    explicit SelectedAnalyzerConfiguration(const set<string> an)
 63            : requiredAnalyzers(an) {
 64        obligatoryAnalyzers.insert("EventThroughAnalyzer");
 65    }
 66
 67    bool valid() const {
 68        return requiredAnalyzers.size() + 1 == usedAnalyzers.size()
 69            || requiredAnalyzers.size() == 0;
 70    }
 71    bool useFactory(const string& name) const {
 72        bool use = requiredAnalyzers.find(name) != requiredAnalyzers.end()
 73            || obligatoryAnalyzers.find(name) != obligatoryAnalyzers.end()
 74            || requiredAnalyzers.size() == 0;
 75        if (use) {
 76            usedAnalyzers.insert(name);
 77        }
 78        availableAnalyzers.insert(name);
 79        return use;
 80    }
 81    bool useFactory(StreamEndAnalyzerFactory* f) const {
 82        return useFactory(f->name());
 83    }
 84    bool useFactory(StreamThroughAnalyzerFactory* f) const {
 85        return useFactory(f->name());
 86    }
 87    bool useFactory(StreamSaxAnalyzerFactory* f) const {
 88        return useFactory(f->name());
 89    }
 90    bool useFactory(StreamEventAnalyzerFactory* f) const {
 91        return useFactory(f->name());
 92    }
 93    bool useFactory(StreamLineAnalyzerFactory* f) const {
 94        return useFactory(f->name());
 95    }
 96};
 97
 98void
 99printUsage(char** argv) {
100    fprintf(stderr, "Usage: %s [OPTIONS] SOURCE\n"
101        "Analyze the given file and output the result as XML.\n"
102        " -c   configuration file\n"
103        " -a   comma-separated list of analyzers\n"
104        " -r   reference output, when specified, the reference output is \n"
105        "      compared to the given output and the first difference is \n"
106        "      reported.\n",
107        argv[0]);
108}
/**
 * Check whether the user asked for help on the command line.
 *
 * @param argc number of entries in argv
 * @param argv program arguments; argv[0] (the program name) is not examined
 * @return true when any argument after the first equals "-h" or "--help"
 **/
bool
containsHelp(int argc, char **argv) {
    int index = 1;
    while (index < argc) {
        const char* const argument = argv[index++];
        const bool isShort = std::strcmp(argument, "-h") == 0;
        const bool isLong = std::strcmp(argument, "--help") == 0;
        if (isShort || isLong) {
            return true;
        }
    }
    return false;
}
/**
 * Split a comma-separated list of analyzer names into a set.
 *
 * Empty entries (caused by leading, trailing or doubled commas, or by an
 * empty argument) are dropped: an empty string is not an analyzer name and
 * would otherwise end up in the selection.
 *
 * @param names comma-separated analyzer names; a null pointer is treated
 *              like an empty list
 * @return the set of non-empty names found in the list
 **/
std::set<std::string>
parseAnalyzerNames(const char* names) {
    std::set<std::string> result;
    if (names == 0) {
        return result;
    }
    const std::string list(names);
    std::string::size_type start = 0;
    for (;;) {
        const std::string::size_type comma = list.find(',', start);
        // the last token runs up to the end of the string
        const std::string::size_type end
            = (comma == std::string::npos) ? list.size() : comma;
        if (end > start) {
            result.insert(list.substr(start, end - start));
        }
        if (comma == std::string::npos) {
            break;
        }
        start = comma + 1;
    }
    return result;
}
/**
 * Read analyzer names from a configuration file.
 *
 * Every line of the form "analyzer=NAME" contributes NAME to the result;
 * all other lines are ignored. A trailing carriage return (from files with
 * CRLF line endings) is removed and lines with an empty NAME are skipped.
 *
 * @param config path of the configuration file; a null pointer or a file
 *               that cannot be opened yields an empty set
 * @return the set of analyzer names listed in the file
 **/
std::set<std::string>
parseConfig(const char* config) {
    static const char prefix[] = "analyzer=";
    static const std::string::size_type prefixLength = sizeof(prefix) - 1;

    std::set<std::string> names;
    if (config == 0) {
        return names;
    }
    std::ifstream in(config);
    std::string line;
    // testing the result of getline ensures only lines that were actually
    // read are looked at
    while (std::getline(in, line)) {
        if (!line.empty() && line[line.size() - 1] == '\r') {
            line.erase(line.size() - 1);
        }
        if (line.compare(0, prefixLength, prefix) != 0) {
            continue;
        }
        if (line.size() > prefixLength) {
            names.insert(line.substr(prefixLength));
        }
    }
    return names;
}
144/**
145 * Usage: $0 [OPTIONS] SOURCE
146 **/
147int
148main(int argc, char** argv) {
149    setenv("XDG_DATA_HOME", SOURCEDIR"/src/streamanalyzer/fieldproperties", 1);
150    setenv("XDG_DATA_DIRS", SOURCEDIR"/src/streamanalyzer/fieldproperties", 1);
151    setenv("STRIGI_PLUGIN_PATH", BINARYDIR"/src/streamanalyzer/throughplugins"
152        PATH_SEPARATOR BINARYDIR"/src/streamanalyzer/lineplugins"
153        PATH_SEPARATOR BINARYDIR"/src/streamanalyzer/saxplugins", 1);
154    // there are 2 optional options that both require an argument.
155    // one can specify 1 source, so the number of arguments must be
156    // 2, 4 or 6
157    if (containsHelp(argc, argv) || (argc != 2 && argc != 4 && argc != 6)) {
158        printUsage(argv);
159        return -1;
160    }
161
162    set<string> analyzers;
163    const char* targetFile;
164    const char* referenceFile = 0;
165    if (argc == 4) {
166        if (strcmp(argv[1],"-a") == 0) {
167            analyzers = parseAnalyzerNames(argv[2]);
168        } else if (strcmp(argv[1], "-r") == 0) {
169            referenceFile = argv[2];
170        } else if (strcmp(argv[1], "-c") == 0) {
171            analyzers = parseConfig(argv[2]);
172        } else {
173            printUsage(argv);
174            return -1;
175        }
176        targetFile = argv[3];
177    } else if (argc == 6) {
178        if (strcmp(argv[1], "-a") == 0) {
179            analyzers = parseAnalyzerNames(argv[2]);
180            if (strcmp(argv[3], "-r") == 0) {
181                referenceFile = argv[4];
182            }
183        } else if (strcmp(argv[1], "-c") == 0) {
184            analyzers = parseConfig(argv[2]);
185            if (strcmp(argv[3], "-r") == 0) {
186                referenceFile = argv[4];
187            }
188        } else if (strcmp(argv[1], "-r") == 0) {
189            referenceFile = argv[2];
190            if (strcmp(argv[3], "-a") == 0) {
191                analyzers = parseAnalyzerNames(argv[4]);
192            } else if (strcmp(argv[3], "-c") == 0) {
193                analyzers = parseConfig(argv[4]);
194            }
195        } else {
196            printUsage(argv);
197            return -1;
198        }
199        targetFile = argv[5];
200    } else {
201        targetFile = argv[1];
202    }
203
204    const char* mappingFile = 0;
205
206    // check that the target file exists
207    {
208        ifstream filetest(targetFile);
209        if (!filetest.good()) {
210            cerr << "The file '" << targetFile << "' cannot be read." << endl;
211            return 1;
212        }
213    }
214    // check that the result file is ok
215    FileInputStream f(referenceFile);
216    if (referenceFile != 0 && f.status() != Ok) {
217        cerr << "The file '" << referenceFile << "' cannot be read." << endl;
218        return 1;
219    }
220
221    const TagMapping mapping(mappingFile);
222    ostringstream out;
223    out << "<?xml version='1.0' encoding='UTF-8'?>\n<"
224        << mapping.map("metadata");
225    map<string, string>::const_iterator i = mapping.namespaces().begin();
226    while (i != mapping.namespaces().end()) {
227        out << " xmlns:" << i->first << "='" << i->second << "'";
228        i++;
229    }
230    out << ">\n";
231
232    SelectedAnalyzerConfiguration ic(analyzers);
233
234    XmlIndexManager manager(out, mapping);
235    DirAnalyzer analyzer(manager, ic);
236    if (!ic.valid()) {
237        set<string>::const_iterator i;
238        set<string> missing;
239        set_difference(analyzers.begin(), analyzers.end(),
240            ic.availableAnalyzers.begin(), ic.availableAnalyzers.end(),
241            insert_iterator<set<string> >(missing, missing.begin()));
242        if (missing.size() == 1) {
243            fprintf(stderr, "No analyzer with name %s was found.\n",
244               missing.begin()->c_str());
245        } else {
246            cerr << "The analyzers";
247            for (i = missing.begin(); i != missing.end(); ++i) {
248                cerr << ", " << *i; 
249            }
250            cerr << " were not found." << endl;
251        }
252        fprintf(stderr, "Choose from:\n");
253        for (i = ic.availableAnalyzers.begin();
254                i != ic.availableAnalyzers.end(); ++i) {
255            cerr << " " << *i << endl;
256        }
257        return 1;
258    }
259
260    // change to the directory of the file to analyze
261    // this ensures a consistent naming of the file uris, regardless of cwd
262    string targetPath(targetFile);
263    string::size_type slashpos = targetPath.rfind('/');
264    if (slashpos == string::npos) {
265         analyzer.analyzeDir(targetFile);
266    } else {
267        if (chdir(targetPath.substr(0,slashpos).c_str()) == -1) {
268            fprintf(stderr, "%s\n", strerror(errno));
269            return -1;
270        }
271        analyzer.analyzeDir(targetPath.substr(slashpos+1).c_str());
272    }
273    string str = out.str();
274    int32_t n = 2*(int32_t)str.length();
275
276    // if no reference file was specified, we output the analysis
277    if (referenceFile == 0) {
278        cout << str;
279        return 0;
280    }
281
282    // load the file to compare with
283    const char* c;
284    n = f.read(c, n, n);
285    if (n < 0) {
286        fprintf(stderr, "Error: %s\n", f.error());
287        return -1;
288    }
289    if (n != (int32_t)out.str().length()) {
290        cout << "output length differs " << out.str().length() << " instead of "
291            << n << endl;
292        return -1;
293    }
294
295    const char* p1 = c;
296    const char* p2 = str.c_str();
297    int32_t n1 = n;
298    string::size_type n2 = str.length();
299    while (n1-- && n2-- && *p1 == *p2) {
300        p1++;
301        p2++;
302    }
303    if (n1 ==0 && (*p1 || *p2)) {
304         cout << "difference at position " << p1-c << endl;
305
306         int32_t m = (80 > str.length())?(int32_t)str.length():80;
307         printf("%i %.*s\n", m, m, str.c_str());
308
309         m = (80 > n)?n:80;
310         printf("%i %.*s\n", m, m, c);
311
312         return -1;
313    }
314
315    return 0;
316}