PageRenderTime 57ms CodeModel.GetById 27ms RepoModel.GetById 1ms app.codeStats 0ms

/src/edu/stanford/nlp/trees/Treebanks.java

https://github.com/agibsonccc/CoreNLP
Java | 455 lines | 389 code | 40 blank | 26 comment | 115 complexity | 8a23547038f8cf06de32b932a1af8aae MD5 | raw file
Possible License(s): GPL-2.0, BSD-3-Clause
  1. package edu.stanford.nlp.trees;
  2. import java.io.*;
  3. import java.util.*;
  4. import java.text.NumberFormat;
  5. import java.text.DecimalFormat;
  6. import edu.stanford.nlp.io.IOUtils;
  7. import edu.stanford.nlp.io.NumberRangesFileFilter;
  8. import edu.stanford.nlp.util.Filter;
  9. import edu.stanford.nlp.util.Timing;
  10. import edu.stanford.nlp.ling.Sentence;
  11. import edu.stanford.nlp.ling.TaggedWord;
  12. import edu.stanford.nlp.stats.TwoDimensionalCounter;
  13. import edu.stanford.nlp.stats.Counter;
  14. import edu.stanford.nlp.util.ReflectionLoading;
  15. /** This is just a main method and other static methods for
  16. * command-line manipulation, statistics, and testing of
  17. * Treebank objects. It has been separated out into its
  18. * own class so that users of Treebank classes don't have
  19. * to inherit all this class' dependencies.
  20. *
  21. * @author Christopher Manning
  22. */
  23. public class Treebanks {
  /** Not instantiable: this class only hosts static methods. */
  private Treebanks() {
  }
  25. private static void printUsage() {
  26. System.err.println("This main method will let you variously manipulate and view a treebank.");
  27. System.err.println("Usage: java Treebanks [-flags]* treebankPath [fileRanges]");
  28. System.err.println("Useful flags include:");
  29. System.err.println("\t-maxLength n\t-suffix ext\t-treeReaderFactory class");
  30. System.err.println("\t-pennPrint\t-encoding enc\t-tlp class\t-sentenceLengths");
  31. System.err.println("\t-summary\t-decimate\t-yield\t-correct\t-punct");
  32. System.err.println("\t-oneLine\t-words\t-taggedWords\t-annotate options");
  33. }
  34. /**
  35. * Loads treebank and prints it.
  36. * All files below the designated <code>filePath</code> within the given
  37. * number range if any are loaded. You can normalize the trees or not
  38. * (English-specific) and print trees one per line up to a certain length
  39. * (for EVALB).
  40. * <p>
  41. * Usage: <code>
  42. * java edu.stanford.nlp.trees.Treebanks [-maxLength n|-normalize|-treeReaderFactory class] filePath [numberRanges]
  43. * </code>
  44. *
  45. * @param args Array of command-line arguments
  46. * @throws java.io.IOException If there is a treebank file access problem
  47. */
  48. public static void main(String[] args) throws IOException {
  49. if (args.length == 0) {
  50. printUsage();
  51. return;
  52. }
  53. int i = 0;
  54. final int maxLength;
  55. final int minLength;
  56. int maxL = Integer.MAX_VALUE;
  57. int minL = -1;
  58. boolean normalized = false;
  59. boolean decimate = false;
  60. boolean pennPrintTrees = false;
  61. boolean oneLinePrint = false;
  62. boolean printTaggedWords = false;
  63. boolean printWords = false;
  64. boolean correct = false;
  65. String annotationOptions = null;
  66. boolean summary = false;
  67. boolean timing = false;
  68. boolean yield = false;
  69. boolean punct = false;
  70. boolean sentenceLengths = false;
  71. boolean countTaggings = false;
  72. boolean removeCodeTrees = false;
  73. String decimatePrefix = null;
  74. String encoding = TreebankLanguagePack.DEFAULT_ENCODING;
  75. String suffix = Treebank.DEFAULT_TREE_FILE_SUFFIX;
  76. TreeReaderFactory trf = null;
  77. TreebankLanguagePack tlp = null;
  78. List<Filter<Tree>> filters = new ArrayList<Filter<Tree>>();
  79. while (i < args.length && args[i].startsWith("-")) {
  80. if (args[i].equals("-maxLength") && i + 1 < args.length) {
  81. maxL = Integer.parseInt(args[i+1]);
  82. i += 2;
  83. } else if (args[i].equals("-minLength") && i + 1 < args.length) {
  84. minL = Integer.parseInt(args[i+1]);
  85. i += 2;
  86. } else if (args[i].equals("-h") || args[i].equals("-help")) {
  87. printUsage();
  88. i++;
  89. } else if (args[i].equals("-normalized")) {
  90. normalized = true;
  91. i += 1;
  92. } else if (args[i].equalsIgnoreCase("-tlp")) {
  93. try {
  94. final Object o = Class.forName(args[i+1]).newInstance();
  95. tlp = (TreebankLanguagePack) o;
  96. trf = tlp.treeReaderFactory();
  97. } catch (Exception e) {
  98. System.err.println("Couldn't instantiate as TreebankLanguagePack: " + args[i+1]);
  99. return;
  100. }
  101. i += 2;
  102. } else if (args[i].equals("-treeReaderFactory") || args[i].equals("-trf")) {
  103. try {
  104. final Object o = Class.forName(args[i+1]).newInstance();
  105. trf = (TreeReaderFactory) o;
  106. } catch (Exception e) {
  107. System.err.println("Couldn't instantiate as TreeReaderFactory: " + args[i+1]);
  108. return;
  109. }
  110. i += 2;
  111. } else if (args[i].equals("-suffix")) {
  112. suffix = args[i+1];
  113. i += 2;
  114. } else if (args[i].equals("-decimate")) {
  115. decimate = true;
  116. decimatePrefix = args[i+1];
  117. i += 2;
  118. } else if (args[i].equals("-encoding")) {
  119. encoding = args[i+1];
  120. i += 2;
  121. } else if (args[i].equals("-correct")) {
  122. correct = true;
  123. i += 1;
  124. } else if (args[i].equals("-summary")) {
  125. summary = true;
  126. i += 1;
  127. } else if (args[i].equals("-yield")) {
  128. yield = true;
  129. i += 1;
  130. } else if (args[i].equals("-punct")) {
  131. punct = true;
  132. i += 1;
  133. } else if (args[i].equals("-pennPrint")) {
  134. pennPrintTrees = true;
  135. i++;
  136. } else if (args[i].equals("-oneLine")) {
  137. oneLinePrint = true;
  138. i++;
  139. } else if (args[i].equals("-taggedWords")) {
  140. printTaggedWords = true;
  141. i++;
  142. } else if (args[i].equals("-words")) {
  143. printWords = true;
  144. i++;
  145. } else if (args[i].equals("-annotate")) {
  146. annotationOptions = args[i+1];
  147. i += 2;
  148. } else if (args[i].equals("-timing")) {
  149. timing = true;
  150. i++;
  151. } else if (args[i].equals("-countTaggings")) {
  152. countTaggings = true;
  153. i++;
  154. } else if (args[i].equals("-sentenceLengths")) {
  155. sentenceLengths = true;
  156. i++;
  157. } else if (args[i].equals("-removeCodeTrees")) {
  158. removeCodeTrees = true;
  159. i++;
  160. } else if (args[i].equals("-filter")) {
  161. Filter<Tree> filter = ReflectionLoading.loadByReflection(args[i+1]);
  162. filters.add(filter);
  163. i += 2;
  164. } else {
  165. System.err.println("Unknown option: " + args[i]);
  166. i++;
  167. }
  168. }
  169. maxLength = maxL;
  170. minLength = minL;
  171. Treebank treebank;
  172. if (trf == null) {
  173. trf = new TreeReaderFactory() {
  174. @Override
  175. public TreeReader newTreeReader(Reader in) {
  176. return new PennTreeReader(in, new LabeledScoredTreeFactory());
  177. }
  178. };
  179. }
  180. if (normalized) {
  181. treebank = new DiskTreebank();
  182. } else {
  183. treebank = new DiskTreebank(trf, encoding);
  184. }
  185. for (Filter<Tree> filter : filters) {
  186. treebank = new FilteringTreebank(treebank, filter);
  187. }
  188. final PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, encoding), true);
  189. if (i + 1 < args.length ) {
  190. treebank.loadPath(args[i], new NumberRangesFileFilter(args[i+1], true));
  191. } else if (i < args.length) {
  192. treebank.loadPath(args[i], suffix, true);
  193. } else {
  194. printUsage();
  195. return;
  196. }
  197. // System.err.println("Loaded " + treebank.size() + " trees from " + args[i]);
  198. if (annotationOptions != null) {
  199. // todo Not yet implemented
  200. System.err.println("annotationOptions not yet implemented");
  201. }
  202. if (summary) {
  203. System.out.println(treebank.textualSummary());
  204. }
  205. if (sentenceLengths) {
  206. sentenceLengths(treebank, args[i], ((i+1)<args.length ? args[i+1]: null), pw);
  207. }
  208. if (punct) {
  209. printPunct(treebank, tlp, pw);
  210. }
  211. if (correct) {
  212. treebank = new EnglishPTBTreebankCorrector().transformTrees(treebank);
  213. }
  214. if (pennPrintTrees) {
  215. treebank.apply(new TreeVisitor() {
  216. @Override
  217. public void visitTree(Tree tree) {
  218. int length = tree.yield().size();
  219. if (length >= minLength && length <= maxLength) {
  220. tree.pennPrint(pw);
  221. pw.println();
  222. }
  223. }
  224. });
  225. }
  226. if (oneLinePrint) {
  227. treebank.apply(new TreeVisitor() {
  228. @Override
  229. public void visitTree(Tree tree) {
  230. int length = tree.yield().size();
  231. if (length >= minLength && length <= maxLength) {
  232. pw.println(tree);
  233. }
  234. }
  235. });
  236. }
  237. if (printWords) {
  238. final TreeNormalizer tn = new BobChrisTreeNormalizer();
  239. treebank.apply(new TreeVisitor() {
  240. @Override
  241. public void visitTree(Tree tree) {
  242. Tree tPrime = tn.normalizeWholeTree(tree, tree.treeFactory());
  243. int length = tPrime.yield().size();
  244. if (length >= minLength && length <= maxLength) {
  245. pw.println(Sentence.listToString(tPrime.taggedYield()));
  246. }
  247. }
  248. });
  249. }
  250. if (printTaggedWords) {
  251. final TreeNormalizer tn = new BobChrisTreeNormalizer();
  252. treebank.apply(new TreeVisitor() {
  253. @Override
  254. public void visitTree(Tree tree) {
  255. Tree tPrime = tn.normalizeWholeTree(tree, tree.treeFactory());
  256. pw.println(Sentence.listToString(tPrime.taggedYield(), false, "_"));
  257. }
  258. });
  259. }
  260. if (countTaggings) {
  261. countTaggings(treebank, pw);
  262. }
  263. if (yield) {
  264. treebank.apply(new TreeVisitor() {
  265. @Override
  266. public void visitTree(Tree tree) {
  267. int length = tree.yield().size();
  268. if (length >= minLength && length <= maxLength) {
  269. pw.println(Sentence.listToString(tree.yield()));
  270. }
  271. }
  272. });
  273. }
  274. if (decimate) {
  275. Writer w1 = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(decimatePrefix + "-train.txt"), encoding));
  276. Writer w2 = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(decimatePrefix + "-dev.txt"), encoding));
  277. Writer w3 = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(decimatePrefix + "-test.txt"), encoding));
  278. treebank.decimate(w1, w2, w3);
  279. }
  280. if (timing) {
  281. runTiming(treebank);
  282. }
  283. if (removeCodeTrees) {
  284. // this is a bit of a hack. It only works on an individual file
  285. if (new File(args[i]).isDirectory()) {
  286. throw new RuntimeException("-removeCodeTrees only works on a single file");
  287. }
  288. String treebankStr = IOUtils.slurpFile(args[i]);
  289. treebankStr = treebankStr.replaceAll("\\( \\(CODE <[^>]+>\\)\\)", "");
  290. Writer w = new OutputStreamWriter(new FileOutputStream(args[i]), encoding);
  291. w.write(treebankStr);
  292. w.close();
  293. }
  294. } // end main()
  295. private static void printPunct(Treebank treebank, TreebankLanguagePack tlp, PrintWriter pw) {
  296. if (tlp == null) {
  297. System.err.println("The -punct option requires you to specify -tlp");
  298. } else {
  299. Filter<String> punctTagFilter = tlp.punctuationTagAcceptFilter();
  300. for (Tree t : treebank) {
  301. List<TaggedWord> tws = t.taggedYield();
  302. for (TaggedWord tw : tws) {
  303. if (punctTagFilter.accept(tw.tag())) {
  304. pw.println(tw);
  305. }
  306. }
  307. }
  308. }
  309. }
  310. private static void countTaggings(Treebank tb, final PrintWriter pw) {
  311. final TwoDimensionalCounter<String,String> wtc = new TwoDimensionalCounter<String,String>();
  312. tb.apply(new TreeVisitor() {
  313. @Override
  314. public void visitTree(Tree tree) {
  315. List<TaggedWord> tags = tree.taggedYield();
  316. for (TaggedWord tag : tags)
  317. wtc.incrementCount(tag.word(), tag.tag());
  318. }
  319. });
  320. for (String key : wtc.firstKeySet()) {
  321. pw.print(key);
  322. pw.print('\t');
  323. Counter<String> ctr = wtc.getCounter(key);
  324. for (String k2 : ctr.keySet()) {
  325. pw.print(k2 + '\t' + ctr.getCount(k2) + '\t');
  326. }
  327. pw.println();
  328. }
  329. }
  330. private static void runTiming(Treebank treebank) {
  331. System.out.println();
  332. Timing.startTime();
  333. int num = 0;
  334. for (Tree t : treebank) {
  335. num += t.yield().size();
  336. }
  337. Timing.endTime("traversing corpus, counting words with iterator");
  338. System.err.println("There were " + num + " words in the treebank.");
  339. treebank.apply(new TreeVisitor() {
  340. int num = 0;
  341. @Override
  342. public void visitTree(final Tree t) {
  343. num += t.yield().size();
  344. }
  345. });
  346. System.err.println();
  347. Timing.endTime("traversing corpus, counting words with TreeVisitor");
  348. System.err.println("There were " + num + " words in the treebank.");
  349. System.err.println();
  350. Timing.startTime();
  351. System.err.println("This treebank contains " + treebank.size() + " trees.");
  352. Timing.endTime("size of corpus");
  353. }
  354. private static void sentenceLengths(Treebank treebank, String name, String range,
  355. PrintWriter pw) {
  356. final int maxleng = 150;
  357. int[] lengthCounts = new int[maxleng+2];
  358. int numSents = 0;
  359. int longestSeen = 0;
  360. int totalWords = 0;
  361. String longSent = "";
  362. double median = 0.0;
  363. NumberFormat nf = new DecimalFormat("0.0");
  364. boolean foundMedian = false;
  365. for (Tree t : treebank) {
  366. numSents++;
  367. int len = t.yield().size();
  368. if (len <= maxleng) {
  369. lengthCounts[len]++;
  370. } else {
  371. lengthCounts[maxleng+1]++;
  372. }
  373. totalWords += len;
  374. if (len > longestSeen) {
  375. longestSeen = len;
  376. longSent = t.toString();
  377. }
  378. }
  379. System.out.print("Files " + name + ' ');
  380. if (range != null) {
  381. System.out.print(range + ' ');
  382. }
  383. System.out.println("consists of " + numSents + " sentences");
  384. int runningTotal = 0;
  385. for (int i = 0; i <= maxleng; i++) {
  386. runningTotal += lengthCounts[i];
  387. System.out.println(" " + lengthCounts[i] + " of length " + i +
  388. " (running total: " + runningTotal + ')');
  389. if ( ! foundMedian && runningTotal > numSents / 2) {
  390. if (numSents % 2 == 0 && runningTotal == numSents / 2 + 1) {
  391. // right on the boundary
  392. int j = i - 1;
  393. while (j > 0 && lengthCounts[j] == 0) {
  394. j--;
  395. }
  396. median = ((double) i + j) / 2;
  397. } else {
  398. median = i;
  399. }
  400. foundMedian = true;
  401. }
  402. }
  403. if (lengthCounts[maxleng+1] > 0) {
  404. runningTotal += lengthCounts[maxleng+1];
  405. System.out.println(" " + lengthCounts[maxleng+1] +
  406. " of length " + (maxleng+1) + " to " + longestSeen +
  407. " (running total: " + runningTotal + ')');
  408. }
  409. System.out.println("Average length: " +
  410. nf.format(((double) totalWords) / numSents) + "; median length: " +
  411. nf.format(median));
  412. System.out.println("Longest sentence is of length: " + longestSeen);
  413. pw.println(longSent);
  414. }
  415. }