
/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TopCommonTokenCounter.java

https://github.com/apache/tika
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.app.tools;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;

import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
//assumed import: SlowCompositeReaderWrapper is used below but its import was
//missing from the listing; it is assumed to come from Lucene's misc module
//(or a local copy shipped alongside tika-eval)
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;

import org.apache.tika.eval.core.tokens.AnalyzerManager;
import org.apache.tika.eval.core.tokens.URLEmailNormalizingFilterFactory;
import org.apache.tika.utils.ProcessUtils;
/**
 * Utility class that reads in a UTF-8 input file with one document per row
 * and outputs the TOP_N (currently 30,000) tokens with the highest document
 * frequencies.
 * <p>
 * The CommonTokensAnalyzer intentionally drops tokens shorter than 4 characters,
 * but it includes bigrams for CJK.
 * <p>
 * It also has an include list for __email__ and __url__ and a skip list
 * for common html markup terms.
 */
public class TopCommonTokenCounter {

    private static final String FIELD = "f";

    //these should exist in every list
    static Set<String> INCLUDE_LIST = new HashSet<>(
            Arrays.asList(URLEmailNormalizingFilterFactory.URL,
                    URLEmailNormalizingFilterFactory.EMAIL));

    //words to ignore
    //these are common 4-letter html markup words that we do
    //not want to count in case of failed markup processing.
    //see: https://issues.apache.org/jira/browse/TIKA-2267?focusedCommentId=15872055&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-15872055
    static Set<String> SKIP_LIST = new HashSet<>(
            Arrays.asList("span", "table", "href", "head", "title", "body", "html", "tagname",
                    "lang", "style", "script", "strong", "blockquote", "form", "iframe", "section",
                    "colspan", "rowspan"));

    private static final String LICENSE =
            "# Licensed to the Apache Software Foundation (ASF) under one or more\n" +
            "# contributor license agreements. See the NOTICE file distributed with\n" +
            "# this work for additional information regarding copyright ownership.\n" +
            "# The ASF licenses this file to You under the Apache License, Version 2.0\n" +
            "# (the \"License\"); you may not use this file except in compliance with\n" +
            "# the License. You may obtain a copy of the License at\n" +
            "#\n" +
            "# http://www.apache.org/licenses/LICENSE-2.0\n" +
            "#\n" +
            "# Unless required by applicable law or agreed to in writing, software\n" +
            "# distributed under the License is distributed on an \"AS IS\" BASIS,\n" +
            "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" +
            "# See the License for the specific language governing permissions and\n" +
            "# limitations under the License.\n" +
            "#\n";

    private static final int TOP_N = 30000;
    private static final int MIN_DOC_FREQ = 10;
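
    /*
     * Typical invocation (a sketch; the jar and file names below are
     * illustrative, not part of the source):
     *
     *   java -cp tika-eval-app.jar org.apache.tika.eval.app.tools.TopCommonTokenCounter \
     *       common_tokens_en input1.txt input2.txt.gz
     *
     * The first argument is the output common-tokens file; each remaining
     * argument is a UTF-8 input file (optionally gzipped, per getReader()
     * below) with one document or sentence per line.
     */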
    public static void main(String[] args) throws Exception {
        Path commonTokensFile = Paths.get(args[0]);
        List<Path> inputFiles = new ArrayList<>();
        for (int i = 1; i < args.length; i++) {
            inputFiles.add(Paths.get(ProcessUtils.unescapeCommandLine(args[i])));
        }
        TopCommonTokenCounter counter = new TopCommonTokenCounter();
        if (Files.exists(commonTokensFile)) {
            System.err.println(
                    commonTokensFile.getFileName().toString() + " exists. I'm skipping this.");
            return;
        }
        counter.execute(commonTokensFile, inputFiles);
    }
    private static void writeTopN(Path path, long totalDocs, long sumDocFreqs,
                                  long sumTotalTermFreqs, long uniqueTerms,
                                  AbstractTokenTFDFPriorityQueue queue) throws IOException {
        if (Files.isRegularFile(path)) {
            System.err.println("File " + path.getFileName() + " already exists. Skipping.");
            return;
        }
        Files.createDirectories(path.getParent());
        try (BufferedWriter writer = Files.newBufferedWriter(path, StandardCharsets.UTF_8)) {
            StringBuilder sb = new StringBuilder();
            writer.write(LICENSE);
            writer.write("#DOC_COUNT\t" + totalDocs + "\n");
            writer.write("#SUM_DOC_FREQS\t" + sumDocFreqs + "\n");
            writer.write("#SUM_TERM_FREQS\t" + sumTotalTermFreqs + "\n");
            writer.write("#UNIQUE_TERMS\t" + uniqueTerms + "\n");
            writer.write("#TOKEN\tDOCFREQ\tTERMFREQ\n");
            //add these tokens no matter what
            for (String t : INCLUDE_LIST) {
                writer.write(t);
                writer.newLine();
            }
            for (TokenDFTF tp : queue.getArray()) {
                writer.write(getRow(sb, tp) + "\n");
            }
            writer.flush();
        }
    }
    private static String getRow(StringBuilder sb, TokenDFTF tp) {
        sb.setLength(0);
        sb.append(clean(tp.token));
        sb.append("\t").append(tp.df);
        sb.append("\t").append(tp.tf);
        return sb.toString();
    }

    private static String clean(String s) {
        if (s == null) {
            return "";
        }
        return s.replaceAll("\\s+", " ").trim();
    }
    private void execute(Path commonTokensFile, List<Path> inputFiles) throws Exception {
        Path luceneDir = Files.createTempDirectory("tika-eval-lucene-");
        AbstractTokenTFDFPriorityQueue queue = new TokenDFPriorityQueue(TOP_N);
        long totalDocs = -1;
        long sumDocFreqs = -1;
        long sumTotalTermFreqs = -1;
        long uniqueTerms = -1;
        try (Directory directory = FSDirectory.open(luceneDir)) {
            AnalyzerManager analyzerManager = AnalyzerManager.newInstance(-1);
            Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
            int maxLen = 1000000;
            int len = 0;
            try (IndexWriter writer = new IndexWriter(directory, indexWriterConfig)) {
                List<Document> docs = new ArrayList<>();
                for (Path inputFile : inputFiles) {
                    //total hack
                    boolean isLeipzig = inputFile.getFileName().toString().contains("-sentences.txt");
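                    //Leipzig corpora "-sentences.txt" files are formatted as
                    //"<sentence id>\t<sentence>"; the branch below strips
                    //everything through the first tab so only the sentence
                    //text is indexed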
                    int lines = 0;
                    try (BufferedReader reader = getReader(inputFile)) {
                        String line = reader.readLine();
                        while (line != null) {
                            if (isLeipzig) {
                                int tab = line.indexOf("\t");
                                if (tab > -1) {
                                    line = line.substring(tab + 1);
                                }
                            }
                            len += line.length();
                            Document document = new Document();
                            document.add(new TextField(FIELD, line, Field.Store.NO));
                            docs.add(document);
                            if (len > maxLen) {
                                writer.addDocuments(docs);
                                docs.clear();
                                len = 0;
                            }
                            line = reader.readLine();
                            if (++lines % 100000 == 0) {
                                System.out.println("processed " + lines + " for " +
                                        inputFile.getFileName() + " :: " +
                                        commonTokensFile.toAbsolutePath());
                            }
                        }
                    }
                }
                if (docs.size() > 0) {
                    writer.addDocuments(docs);
                }
                writer.commit();
                writer.flush();
            }
            try (IndexReader reader = DirectoryReader.open(directory)) {
                LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader);
                totalDocs = wrappedReader.getDocCount(FIELD);
                sumDocFreqs = wrappedReader.getSumDocFreq(FIELD);
                sumTotalTermFreqs = wrappedReader.getSumTotalTermFreq(FIELD);
                Terms terms = wrappedReader.terms(FIELD);
                TermsEnum termsEnum = terms.iterator();
                BytesRef bytesRef = termsEnum.next();
                //count from 0 so the first term is included;
                //the -1 above is only a sentinel for "index never read"
                uniqueTerms = 0;
                while (bytesRef != null) {
                    uniqueTerms++;
                    int df = termsEnum.docFreq();
                    long tf = termsEnum.totalTermFreq();
                    if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) {
                        bytesRef = termsEnum.next();
                        continue;
                    }
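                    //only attempt an insert when the queue is not yet full or
                    //this term's doc freq is at least the queue's current
                    //minimum; SKIP_LIST terms (common html markup) are never
                    //added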
                    if (queue.top() == null || queue.size() < TOP_N || df >= queue.top().df) {
                        String t = bytesRef.utf8ToString();
                        if (!SKIP_LIST.contains(t)) {
                            queue.insertWithOverflow(new TokenDFTF(t, df, tf));
                        }
                    }
                    bytesRef = termsEnum.next();
                }
            }
        } finally {
            FileUtils.deleteDirectory(luceneDir.toFile());
        }
        writeTopN(commonTokensFile, totalDocs, sumDocFreqs, sumTotalTermFreqs, uniqueTerms, queue);
    }
    private BufferedReader getReader(Path inputFile) throws IOException {
        InputStream is = Files.newInputStream(inputFile);
        if (inputFile.toString().endsWith(".gz")) {
            is = new GzipCompressorInputStream(is);
        }
        return new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
    }
    private abstract static class AbstractTokenTFDFPriorityQueue extends PriorityQueue<TokenDFTF> {

        AbstractTokenTFDFPriorityQueue(int maxSize) {
            super(maxSize);
        }

        public TokenDFTF[] getArray() {
            TokenDFTF[] topN = new TokenDFTF[size()];
            //now we reverse the queue
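            //Lucene's PriorityQueue pops the least element first, so filling
            //the array from the back yields tokens in descending order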
            TokenDFTF term = pop();
            int i = topN.length - 1;
            while (term != null && i > -1) {
                topN[i--] = term;
                term = pop();
            }
            return topN;
        }
    }
    private static class TokenDFTF {

        final String token;
        final int df;
        final long tf;

        public TokenDFTF(String token, int df, long tf) {
            this.token = token;
            this.df = df;
            this.tf = tf;
        }

        public long getTF() {
            return tf;
        }

        public int getDF() {
            return df;
        }

        public String getToken() {
            return token;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (o == null || getClass() != o.getClass()) {
                return false;
            }
            TokenDFTF tokenDFTF = (TokenDFTF) o;
            if (df != tokenDFTF.df) {
                return false;
            }
            if (tf != tokenDFTF.tf) {
                return false;
            }
            return Objects.equals(token, tokenDFTF.token);
        }

        @Override
        public int hashCode() {
            int result = token != null ? token.hashCode() : 0;
            result = 31 * result + df;
            result = 31 * result + (int) (tf ^ (tf >>> 32));
            return result;
        }

        @Override
        public String toString() {
            return "TokenDFTF{" + "token='" + token + '\'' + ", df=" + df + ", tf=" + tf + '}';
        }
    }
    private static class TokenDFPriorityQueue extends AbstractTokenTFDFPriorityQueue {

        TokenDFPriorityQueue(int maxSize) {
            super(maxSize);
        }

        @Override
        protected boolean lessThan(TokenDFTF arg0, TokenDFTF arg1) {
            if (arg0.df < arg1.df) {
                return true;
            } else if (arg0.df > arg1.df) {
                return false;
            }
            return arg1.token.compareTo(arg0.token) < 0;
        }
    }
}
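
For reference, here is a minimal sketch of reading a generated common-tokens file back in. It assumes only the format written by writeTopN() above (a "#"-prefixed license header and stats rows, then tab-separated token rows, with the forced __url__/__email__ entries written as bare tokens). The file name and class name are hypothetical; this class is not part of Tika.

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;

public class CommonTokensReaderSketch {
    public static void main(String[] args) throws IOException {
        Path commonTokensFile = Paths.get("common_tokens_en"); //hypothetical path
        Map<String, Long> docFreqs = new HashMap<>();
        for (String line : Files.readAllLines(commonTokensFile, StandardCharsets.UTF_8)) {
            //skip the license header and the #DOC_COUNT/#SUM_*/#UNIQUE_TERMS stats rows
            if (line.isEmpty() || line.startsWith("#")) {
                continue;
            }
            String[] cols = line.split("\t");
            //INCLUDE_LIST tokens have no frequency columns; treat their df as 0
            long df = cols.length > 1 ? Long.parseLong(cols[1]) : 0L;
            docFreqs.put(cols[0], df);
        }
        System.out.println("loaded " + docFreqs.size() + " common tokens");
    }
}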