/strigi-0.7.7/strigiutils/bin/xmlindexer/peranalyzerxml.cpp
C++ | 316 lines | 263 code | 21 blank | 32 comment | 91 complexity | b1b007237cbbf3943e1f6c9e64efad12 MD5 | raw file
Possible License(s): LGPL-2.0
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20#ifdef HAVE_CONFIG_H
21 #include <config.h>
22#endif
23
24#include <strigi/strigiconfig.h>
25//#include "compat.h"
26#include <strigi/fileinputstream.h>
27#include <strigi/bz2inputstream.h>
28#include <strigi/diranalyzer.h>
29#include <strigi/analyzerconfiguration.h>
30#include <strigi/streamendanalyzer.h>
31#include <strigi/streamthroughanalyzer.h>
32#include <strigi/streamlineanalyzer.h>
33#include <strigi/streamsaxanalyzer.h>
34#include <strigi/streameventanalyzer.h>
35#include "xmlindexwriter.h"
36
37#include <cstdio>
38#include <cstring>
39#include <cerrno>
40#include <algorithm>
41#ifdef HAVE_UNISTD_H
42 #include <unistd.h>
43#endif
44#include <stdlib.h>
45#ifdef HAVE_DIRECT_H
46 #include <direct.h>
47#endif
48#include <iostream>
49#include <sstream>
50#include <fstream>
51#include <set>
52using namespace Strigi;
53using namespace std;
54
55class SelectedAnalyzerConfiguration : public Strigi::AnalyzerConfiguration {
56public:
57 const set<string> requiredAnalyzers;
58 set<string> obligatoryAnalyzers;
59 mutable set<string> usedAnalyzers;
60 mutable set<string> availableAnalyzers;
61
62 explicit SelectedAnalyzerConfiguration(const set<string> an)
63 : requiredAnalyzers(an) {
64 obligatoryAnalyzers.insert("EventThroughAnalyzer");
65 }
66
67 bool valid() const {
68 return requiredAnalyzers.size() + 1 == usedAnalyzers.size()
69 || requiredAnalyzers.size() == 0;
70 }
71 bool useFactory(const string& name) const {
72 bool use = requiredAnalyzers.find(name) != requiredAnalyzers.end()
73 || obligatoryAnalyzers.find(name) != obligatoryAnalyzers.end()
74 || requiredAnalyzers.size() == 0;
75 if (use) {
76 usedAnalyzers.insert(name);
77 }
78 availableAnalyzers.insert(name);
79 return use;
80 }
81 bool useFactory(StreamEndAnalyzerFactory* f) const {
82 return useFactory(f->name());
83 }
84 bool useFactory(StreamThroughAnalyzerFactory* f) const {
85 return useFactory(f->name());
86 }
87 bool useFactory(StreamSaxAnalyzerFactory* f) const {
88 return useFactory(f->name());
89 }
90 bool useFactory(StreamEventAnalyzerFactory* f) const {
91 return useFactory(f->name());
92 }
93 bool useFactory(StreamLineAnalyzerFactory* f) const {
94 return useFactory(f->name());
95 }
96};
97
/**
 * Write the command line help text to stderr.
 *
 * @param argv the argument vector of the program; only argv[0] (the program
 *             name) is read
 **/
void
printUsage(char** argv) {
    static const char* const usageFormat =
        "Usage: %s [OPTIONS] SOURCE\n"
        "Analyze the given file and output the result as XML.\n"
        " -c configuration file\n"
        " -a comma-separated list of analyzers\n"
        " -r reference output, when specified, the reference output is \n"
        " compared to the given output and the first difference is \n"
        " reported.\n";
    const char* const programName = argv[0];
    fprintf(stderr, usageFormat, programName);
}
/**
 * Check whether the user asked for help.
 *
 * @param argc number of entries in argv
 * @param argv argument vector; argv[0] is skipped
 * @return true if any argument after argv[0] equals "--help" or "-h"
 **/
bool
containsHelp(int argc, char **argv) {
    int index = 1;
    while (index < argc) {
        const char* const argument = argv[index];
        if (std::strcmp(argument, "--help") == 0) {
            return true;
        }
        if (std::strcmp(argument, "-h") == 0) {
            return true;
        }
        ++index;
    }
    return false;
}
/**
 * Split a comma-separated list into the set of its elements.
 *
 * Every piece between two commas is inserted as is, so empty pieces (from an
 * empty string, consecutive commas, or a leading/trailing comma) yield an
 * empty name in the result.
 *
 * @param names zero-terminated, comma-separated list
 * @return the distinct pieces of the list
 **/
std::set<std::string>
parseAnalyzerNames(const char* names) {
    const std::string list(names);
    std::set<std::string> result;
    std::string::size_type begin = 0;
    for (;;) {
        const std::string::size_type comma = list.find(',', begin);
        if (comma == std::string::npos) {
            // last (or only) piece runs to the end of the string
            result.insert(list.substr(begin));
            break;
        }
        result.insert(list.substr(begin, comma - begin));
        begin = comma + 1;
    }
    return result;
}
/**
 * Read analyzer names from a configuration file.
 *
 * Each line that starts with "analyzer=" contributes the remainder of that
 * line as one name; all other lines are ignored. A file that cannot be
 * opened yields an empty set.
 *
 * @param config path of the configuration file
 * @return the distinct names found in the file
 **/
std::set<std::string>
parseConfig(const char* config) {
    static const char prefix[] = "analyzer=";
    const std::string::size_type prefixLength = sizeof(prefix) - 1;

    std::set<std::string> names;
    std::ifstream input(config);
    std::string line;
    while (std::getline(input, line)) {
        const bool isAnalyzerLine = line.compare(0, prefixLength, prefix) == 0;
        if (isAnalyzerLine) {
            names.insert(line.substr(prefixLength));
        }
    }
    return names;
}
144/**
145 * Usage: $0 [OPTIONS] SOURCE
146 **/
147int
148main(int argc, char** argv) {
149 setenv("XDG_DATA_HOME", SOURCEDIR"/src/streamanalyzer/fieldproperties", 1);
150 setenv("XDG_DATA_DIRS", SOURCEDIR"/src/streamanalyzer/fieldproperties", 1);
151 setenv("STRIGI_PLUGIN_PATH", BINARYDIR"/src/streamanalyzer/throughplugins"
152 PATH_SEPARATOR BINARYDIR"/src/streamanalyzer/lineplugins"
153 PATH_SEPARATOR BINARYDIR"/src/streamanalyzer/saxplugins", 1);
154 // there are 2 optional options that both require an argument.
155 // one can specify 1 source, so the number of arguments must be
156 // 2, 4 or 6
157 if (containsHelp(argc, argv) || (argc != 2 && argc != 4 && argc != 6)) {
158 printUsage(argv);
159 return -1;
160 }
161
162 set<string> analyzers;
163 const char* targetFile;
164 const char* referenceFile = 0;
165 if (argc == 4) {
166 if (strcmp(argv[1],"-a") == 0) {
167 analyzers = parseAnalyzerNames(argv[2]);
168 } else if (strcmp(argv[1], "-r") == 0) {
169 referenceFile = argv[2];
170 } else if (strcmp(argv[1], "-c") == 0) {
171 analyzers = parseConfig(argv[2]);
172 } else {
173 printUsage(argv);
174 return -1;
175 }
176 targetFile = argv[3];
177 } else if (argc == 6) {
178 if (strcmp(argv[1], "-a") == 0) {
179 analyzers = parseAnalyzerNames(argv[2]);
180 if (strcmp(argv[3], "-r") == 0) {
181 referenceFile = argv[4];
182 }
183 } else if (strcmp(argv[1], "-c") == 0) {
184 analyzers = parseConfig(argv[2]);
185 if (strcmp(argv[3], "-r") == 0) {
186 referenceFile = argv[4];
187 }
188 } else if (strcmp(argv[1], "-r") == 0) {
189 referenceFile = argv[2];
190 if (strcmp(argv[3], "-a") == 0) {
191 analyzers = parseAnalyzerNames(argv[4]);
192 } else if (strcmp(argv[3], "-c") == 0) {
193 analyzers = parseConfig(argv[4]);
194 }
195 } else {
196 printUsage(argv);
197 return -1;
198 }
199 targetFile = argv[5];
200 } else {
201 targetFile = argv[1];
202 }
203
204 const char* mappingFile = 0;
205
206 // check that the target file exists
207 {
208 ifstream filetest(targetFile);
209 if (!filetest.good()) {
210 cerr << "The file '" << targetFile << "' cannot be read." << endl;
211 return 1;
212 }
213 }
214 // check that the result file is ok
215 FileInputStream f(referenceFile);
216 if (referenceFile != 0 && f.status() != Ok) {
217 cerr << "The file '" << referenceFile << "' cannot be read." << endl;
218 return 1;
219 }
220
221 const TagMapping mapping(mappingFile);
222 ostringstream out;
223 out << "<?xml version='1.0' encoding='UTF-8'?>\n<"
224 << mapping.map("metadata");
225 map<string, string>::const_iterator i = mapping.namespaces().begin();
226 while (i != mapping.namespaces().end()) {
227 out << " xmlns:" << i->first << "='" << i->second << "'";
228 i++;
229 }
230 out << ">\n";
231
232 SelectedAnalyzerConfiguration ic(analyzers);
233
234 XmlIndexManager manager(out, mapping);
235 DirAnalyzer analyzer(manager, ic);
236 if (!ic.valid()) {
237 set<string>::const_iterator i;
238 set<string> missing;
239 set_difference(analyzers.begin(), analyzers.end(),
240 ic.availableAnalyzers.begin(), ic.availableAnalyzers.end(),
241 insert_iterator<set<string> >(missing, missing.begin()));
242 if (missing.size() == 1) {
243 fprintf(stderr, "No analyzer with name %s was found.\n",
244 missing.begin()->c_str());
245 } else {
246 cerr << "The analyzers";
247 for (i = missing.begin(); i != missing.end(); ++i) {
248 cerr << ", " << *i;
249 }
250 cerr << " were not found." << endl;
251 }
252 fprintf(stderr, "Choose from:\n");
253 for (i = ic.availableAnalyzers.begin();
254 i != ic.availableAnalyzers.end(); ++i) {
255 cerr << " " << *i << endl;
256 }
257 return 1;
258 }
259
260 // change to the directory of the file to analyze
261 // this ensures a consistent naming of the file uris, regardless of cwd
262 string targetPath(targetFile);
263 string::size_type slashpos = targetPath.rfind('/');
264 if (slashpos == string::npos) {
265 analyzer.analyzeDir(targetFile);
266 } else {
267 if (chdir(targetPath.substr(0,slashpos).c_str()) == -1) {
268 fprintf(stderr, "%s\n", strerror(errno));
269 return -1;
270 }
271 analyzer.analyzeDir(targetPath.substr(slashpos+1).c_str());
272 }
273 string str = out.str();
274 int32_t n = 2*(int32_t)str.length();
275
276 // if no reference file was specified, we output the analysis
277 if (referenceFile == 0) {
278 cout << str;
279 return 0;
280 }
281
282 // load the file to compare with
283 const char* c;
284 n = f.read(c, n, n);
285 if (n < 0) {
286 fprintf(stderr, "Error: %s\n", f.error());
287 return -1;
288 }
289 if (n != (int32_t)out.str().length()) {
290 cout << "output length differs " << out.str().length() << " instead of "
291 << n << endl;
292 return -1;
293 }
294
295 const char* p1 = c;
296 const char* p2 = str.c_str();
297 int32_t n1 = n;
298 string::size_type n2 = str.length();
299 while (n1-- && n2-- && *p1 == *p2) {
300 p1++;
301 p2++;
302 }
303 if (n1 ==0 && (*p1 || *p2)) {
304 cout << "difference at position " << p1-c << endl;
305
306 int32_t m = (80 > str.length())?(int32_t)str.length():80;
307 printf("%i %.*s\n", m, m, str.c_str());
308
309 m = (80 > n)?n:80;
310 printf("%i %.*s\n", m, m, c);
311
312 return -1;
313 }
314
315 return 0;
316}