PageRenderTime 49ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/strigi-0.7.7/strigiutils/bin/xmlindexer/peranalyzerxml.cpp

#
C++ | 316 lines | 263 code | 21 blank | 32 comment | 91 complexity | b1b007237cbbf3943e1f6c9e64efad12 MD5 | raw file
Possible License(s): LGPL-2.0
  1. /* This file is part of Strigi Desktop Search
  2. *
  3. * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
  4. *
  5. * This library is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU Library General Public
  7. * License as published by the Free Software Foundation; either
  8. * version 2 of the License, or (at your option) any later version.
  9. *
  10. * This library is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Library General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Library General Public License
  16. * along with this library; see the file COPYING.LIB. If not, write to
  17. * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  18. * Boston, MA 02110-1301, USA.
  19. */
  20. #ifdef HAVE_CONFIG_H
  21. #include <config.h>
  22. #endif
  23. #include <strigi/strigiconfig.h>
  24. //#include "compat.h"
  25. #include <strigi/fileinputstream.h>
  26. #include <strigi/bz2inputstream.h>
  27. #include <strigi/diranalyzer.h>
  28. #include <strigi/analyzerconfiguration.h>
  29. #include <strigi/streamendanalyzer.h>
  30. #include <strigi/streamthroughanalyzer.h>
  31. #include <strigi/streamlineanalyzer.h>
  32. #include <strigi/streamsaxanalyzer.h>
  33. #include <strigi/streameventanalyzer.h>
  34. #include "xmlindexwriter.h"
  35. #include <cstdio>
  36. #include <cstring>
  37. #include <cerrno>
  38. #include <algorithm>
  39. #ifdef HAVE_UNISTD_H
  40. #include <unistd.h>
  41. #endif
  42. #include <stdlib.h>
  43. #ifdef HAVE_DIRECT_H
  44. #include <direct.h>
  45. #endif
  46. #include <iostream>
  47. #include <sstream>
  48. #include <fstream>
  49. #include <set>
  50. using namespace Strigi;
  51. using namespace std;
  52. class SelectedAnalyzerConfiguration : public Strigi::AnalyzerConfiguration {
  53. public:
  54. const set<string> requiredAnalyzers;
  55. set<string> obligatoryAnalyzers;
  56. mutable set<string> usedAnalyzers;
  57. mutable set<string> availableAnalyzers;
  58. explicit SelectedAnalyzerConfiguration(const set<string> an)
  59. : requiredAnalyzers(an) {
  60. obligatoryAnalyzers.insert("EventThroughAnalyzer");
  61. }
  62. bool valid() const {
  63. return requiredAnalyzers.size() + 1 == usedAnalyzers.size()
  64. || requiredAnalyzers.size() == 0;
  65. }
  66. bool useFactory(const string& name) const {
  67. bool use = requiredAnalyzers.find(name) != requiredAnalyzers.end()
  68. || obligatoryAnalyzers.find(name) != obligatoryAnalyzers.end()
  69. || requiredAnalyzers.size() == 0;
  70. if (use) {
  71. usedAnalyzers.insert(name);
  72. }
  73. availableAnalyzers.insert(name);
  74. return use;
  75. }
  76. bool useFactory(StreamEndAnalyzerFactory* f) const {
  77. return useFactory(f->name());
  78. }
  79. bool useFactory(StreamThroughAnalyzerFactory* f) const {
  80. return useFactory(f->name());
  81. }
  82. bool useFactory(StreamSaxAnalyzerFactory* f) const {
  83. return useFactory(f->name());
  84. }
  85. bool useFactory(StreamEventAnalyzerFactory* f) const {
  86. return useFactory(f->name());
  87. }
  88. bool useFactory(StreamLineAnalyzerFactory* f) const {
  89. return useFactory(f->name());
  90. }
  91. };
  /**
   * Write the command line help to stderr.
   *
   * @param argv argument vector of the program; only argv[0] is used, as the
   *             program name in the first line of the help text
   **/
  void
  printUsage(char** argv) {
      const char* const program = argv[0];
      // the only line that needs formatting
      fprintf(stderr, "Usage: %s [OPTIONS] SOURCE\n", program);
      // the remainder is fixed text
      fputs("Analyze the given file and output the result as XML.\n"
            " -c configuration file\n"
            " -a comma-separated list of analyzers\n"
            " -r reference output, when specified, the reference output is \n"
            " compared to the given output and the first difference is \n"
            " reported.\n",
            stderr);
  }
  /**
   * Check whether the user asked for help.
   *
   * @param argc number of entries in @p argv
   * @param argv argument vector; argv[0] (the program name) is not inspected
   * @return true when any argument equals "--help" or "-h"
   **/
  bool
  containsHelp(int argc, char **argv) {
      int index = 1;
      while (index < argc) {
          const char* const argument = argv[index];
          ++index;
          if (!strcmp(argument, "-h") || !strcmp(argument, "--help")) {
              return true;
          }
      }
      return false;
  }
  /**
   * Split a comma-separated list of analyzer names into a set.
   *
   * Empty entries (as produced by "a,,b", a trailing comma or an empty
   * string) are skipped, so that they do not end up as an analyzer with an
   * empty name. Names are taken verbatim; surrounding whitespace is kept.
   *
   * @param names zero terminated list such as "A,B,C"; a null pointer is
   *              treated like an empty list
   * @return the set of non-empty names
   **/
  std::set<std::string>
  parseAnalyzerNames(const char* names) {
      std::set<std::string> n;
      if (names == 0) {
          return n;
      }
      const std::string ns(names);
      std::string::size_type start = 0;
      while (start <= ns.length()) {
          std::string::size_type end = ns.find(',', start);
          if (end == std::string::npos) {
              // last entry runs up to the end of the string
              end = ns.length();
          }
          if (end > start) {
              n.insert(ns.substr(start, end - start));
          }
          start = end + 1;
      }
      return n;
  }
  /**
   * Read analyzer names from a configuration file.
   *
   * Every line that starts with "analyzer=" contributes the remainder of the
   * line as one analyzer name; all other lines are ignored. A trailing
   * carriage return (file with CRLF line ends) is removed and entries with an
   * empty name are skipped.
   *
   * @param config path of the configuration file; a null pointer or a file
   *               that cannot be opened yields an empty set
   * @return the set of analyzer names found in the file
   **/
  std::set<std::string>
  parseConfig(const char* config) {
      static const char prefix[] = "analyzer=";
      const std::string::size_type prefixLength = sizeof(prefix) - 1;

      std::set<std::string> n;
      if (config == 0) {
          return n;
      }
      std::ifstream f(config);
      std::string line;
      // loop on the result of getline so that a failed read is never processed
      while (std::getline(f, line)) {
          if (!line.empty() && line[line.size() - 1] == '\r') {
              line.erase(line.size() - 1);
          }
          if (line.size() > prefixLength
                  && line.compare(0, prefixLength, prefix) == 0) {
              n.insert(line.substr(prefixLength));
          }
      }
      return n;
  }
  137. /**
  138. * Usage: $0 [OPTIONS] SOURCE
  139. **/
  140. int
  141. main(int argc, char** argv) {
  142. setenv("XDG_DATA_HOME", SOURCEDIR"/src/streamanalyzer/fieldproperties", 1);
  143. setenv("XDG_DATA_DIRS", SOURCEDIR"/src/streamanalyzer/fieldproperties", 1);
  144. setenv("STRIGI_PLUGIN_PATH", BINARYDIR"/src/streamanalyzer/throughplugins"
  145. PATH_SEPARATOR BINARYDIR"/src/streamanalyzer/lineplugins"
  146. PATH_SEPARATOR BINARYDIR"/src/streamanalyzer/saxplugins", 1);
  147. // there are 2 optional options that both require an argument.
  148. // one can specify 1 source, so the number of arguments must be
  149. // 2, 4 or 6
  150. if (containsHelp(argc, argv) || (argc != 2 && argc != 4 && argc != 6)) {
  151. printUsage(argv);
  152. return -1;
  153. }
  154. set<string> analyzers;
  155. const char* targetFile;
  156. const char* referenceFile = 0;
  157. if (argc == 4) {
  158. if (strcmp(argv[1],"-a") == 0) {
  159. analyzers = parseAnalyzerNames(argv[2]);
  160. } else if (strcmp(argv[1], "-r") == 0) {
  161. referenceFile = argv[2];
  162. } else if (strcmp(argv[1], "-c") == 0) {
  163. analyzers = parseConfig(argv[2]);
  164. } else {
  165. printUsage(argv);
  166. return -1;
  167. }
  168. targetFile = argv[3];
  169. } else if (argc == 6) {
  170. if (strcmp(argv[1], "-a") == 0) {
  171. analyzers = parseAnalyzerNames(argv[2]);
  172. if (strcmp(argv[3], "-r") == 0) {
  173. referenceFile = argv[4];
  174. }
  175. } else if (strcmp(argv[1], "-c") == 0) {
  176. analyzers = parseConfig(argv[2]);
  177. if (strcmp(argv[3], "-r") == 0) {
  178. referenceFile = argv[4];
  179. }
  180. } else if (strcmp(argv[1], "-r") == 0) {
  181. referenceFile = argv[2];
  182. if (strcmp(argv[3], "-a") == 0) {
  183. analyzers = parseAnalyzerNames(argv[4]);
  184. } else if (strcmp(argv[3], "-c") == 0) {
  185. analyzers = parseConfig(argv[4]);
  186. }
  187. } else {
  188. printUsage(argv);
  189. return -1;
  190. }
  191. targetFile = argv[5];
  192. } else {
  193. targetFile = argv[1];
  194. }
  195. const char* mappingFile = 0;
  196. // check that the target file exists
  197. {
  198. ifstream filetest(targetFile);
  199. if (!filetest.good()) {
  200. cerr << "The file '" << targetFile << "' cannot be read." << endl;
  201. return 1;
  202. }
  203. }
  204. // check that the result file is ok
  205. FileInputStream f(referenceFile);
  206. if (referenceFile != 0 && f.status() != Ok) {
  207. cerr << "The file '" << referenceFile << "' cannot be read." << endl;
  208. return 1;
  209. }
  210. const TagMapping mapping(mappingFile);
  211. ostringstream out;
  212. out << "<?xml version='1.0' encoding='UTF-8'?>\n<"
  213. << mapping.map("metadata");
  214. map<string, string>::const_iterator i = mapping.namespaces().begin();
  215. while (i != mapping.namespaces().end()) {
  216. out << " xmlns:" << i->first << "='" << i->second << "'";
  217. i++;
  218. }
  219. out << ">\n";
  220. SelectedAnalyzerConfiguration ic(analyzers);
  221. XmlIndexManager manager(out, mapping);
  222. DirAnalyzer analyzer(manager, ic);
  223. if (!ic.valid()) {
  224. set<string>::const_iterator i;
  225. set<string> missing;
  226. set_difference(analyzers.begin(), analyzers.end(),
  227. ic.availableAnalyzers.begin(), ic.availableAnalyzers.end(),
  228. insert_iterator<set<string> >(missing, missing.begin()));
  229. if (missing.size() == 1) {
  230. fprintf(stderr, "No analyzer with name %s was found.\n",
  231. missing.begin()->c_str());
  232. } else {
  233. cerr << "The analyzers";
  234. for (i = missing.begin(); i != missing.end(); ++i) {
  235. cerr << ", " << *i;
  236. }
  237. cerr << " were not found." << endl;
  238. }
  239. fprintf(stderr, "Choose from:\n");
  240. for (i = ic.availableAnalyzers.begin();
  241. i != ic.availableAnalyzers.end(); ++i) {
  242. cerr << " " << *i << endl;
  243. }
  244. return 1;
  245. }
  246. // change to the directory of the file to analyze
  247. // this ensures a consistent naming of the file uris, regardless of cwd
  248. string targetPath(targetFile);
  249. string::size_type slashpos = targetPath.rfind('/');
  250. if (slashpos == string::npos) {
  251. analyzer.analyzeDir(targetFile);
  252. } else {
  253. if (chdir(targetPath.substr(0,slashpos).c_str()) == -1) {
  254. fprintf(stderr, "%s\n", strerror(errno));
  255. return -1;
  256. }
  257. analyzer.analyzeDir(targetPath.substr(slashpos+1).c_str());
  258. }
  259. string str = out.str();
  260. int32_t n = 2*(int32_t)str.length();
  261. // if no reference file was specified, we output the analysis
  262. if (referenceFile == 0) {
  263. cout << str;
  264. return 0;
  265. }
  266. // load the file to compare with
  267. const char* c;
  268. n = f.read(c, n, n);
  269. if (n < 0) {
  270. fprintf(stderr, "Error: %s\n", f.error());
  271. return -1;
  272. }
  273. if (n != (int32_t)out.str().length()) {
  274. cout << "output length differs " << out.str().length() << " instead of "
  275. << n << endl;
  276. return -1;
  277. }
  278. const char* p1 = c;
  279. const char* p2 = str.c_str();
  280. int32_t n1 = n;
  281. string::size_type n2 = str.length();
  282. while (n1-- && n2-- && *p1 == *p2) {
  283. p1++;
  284. p2++;
  285. }
  286. if (n1 ==0 && (*p1 || *p2)) {
  287. cout << "difference at position " << p1-c << endl;
  288. int32_t m = (80 > str.length())?(int32_t)str.length():80;
  289. printf("%i %.*s\n", m, m, str.c_str());
  290. m = (80 > n)?n:80;
  291. printf("%i %.*s\n", m, m, c);
  292. return -1;
  293. }
  294. return 0;
  295. }