/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TopCommonTokenCounter.java
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.app.tools;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;

import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;

import org.apache.tika.eval.core.tokens.AnalyzerManager;
import org.apache.tika.eval.core.tokens.URLEmailNormalizingFilterFactory;
import org.apache.tika.utils.ProcessUtils;

/**
 * Utility class that reads a UTF-8 input file with one document per row
 * and outputs the TOP_N (currently 30,000) tokens with the highest document frequencies.
 * <p>
 * The CommonTokensAnalyzer intentionally drops tokens shorter than 4 characters,
 * but includes bigrams for CJK.
 * <p>
 * It also has an include list for __email__ and __url__ and a skip list
 * for common HTML markup terms.
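 * <p>
 * Example invocation (the file names here are hypothetical; the first argument is
 * the output file, the remaining arguments are input corpora, optionally gzipped):
 * <pre>
 * java -cp tika-eval-app.jar org.apache.tika.eval.app.tools.TopCommonTokenCounter \
 *     en_common_tokens.txt eng_news-sentences.txt.gz
 * </pre>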
 */
public class TopCommonTokenCounter {

    private static final String FIELD = "f";

    //these should exist in every list
    static Set<String> INCLUDE_LIST = new HashSet<>(
            Arrays.asList(URLEmailNormalizingFilterFactory.URL,
                    URLEmailNormalizingFilterFactory.EMAIL));

    //words to ignore
    //these are common html markup terms (many of them four letters long) that we
    //do not want to count in case of failed markup processing.
    //see: https://issues.apache.org/jira/browse/TIKA-2267?focusedCommentId=15872055&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-15872055
    static Set<String> SKIP_LIST = new HashSet<>(
            Arrays.asList("span", "table", "href", "head", "title", "body", "html", "tagname",
                    "lang", "style", "script", "strong", "blockquote", "form", "iframe", "section",
                    "colspan", "rowspan"));

    private static final String LICENSE =
            "# Licensed to the Apache Software Foundation (ASF) under one or more\n" +
            "# contributor license agreements.  See the NOTICE file distributed with\n" +
            "# this work for additional information regarding copyright ownership.\n" +
            "# The ASF licenses this file to You under the Apache License, Version 2.0\n" +
            "# (the \"License\"); you may not use this file except in compliance with\n" +
            "# the License.  You may obtain a copy of the License at\n" +
            "#\n" +
            "#     http://www.apache.org/licenses/LICENSE-2.0\n" +
            "#\n" +
            "# Unless required by applicable law or agreed to in writing, software\n" +
            "# distributed under the License is distributed on an \"AS IS\" BASIS,\n" +
            "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" +
            "# See the License for the specific language governing permissions and\n" +
            "# limitations under the License.\n" +
            "#\n";

    //emit at most this many tokens
    private static final int TOP_N = 30000;
    //skip tokens that appear in fewer than this many documents
    private static final int MIN_DOC_FREQ = 10;

    public static void main(String[] args) throws Exception {
        Path commonTokensFile = Paths.get(args[0]);
        List<Path> inputFiles = new ArrayList<>();
        for (int i = 1; i < args.length; i++) {
            inputFiles.add(Paths.get(ProcessUtils.unescapeCommandLine(args[i])));
        }
        if (Files.exists(commonTokensFile)) {
            System.err.println(commonTokensFile.getFileName().toString() +
                    " already exists; skipping.");
            return;
        }
        TopCommonTokenCounter counter = new TopCommonTokenCounter();
        counter.execute(commonTokensFile, inputFiles);
    }
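
    /**
     * Writes the results file: the license header, four commented summary lines
     * (doc count, sum of doc freqs, sum of term freqs, unique term count), a
     * column header, the always-included tokens from INCLUDE_LIST, and then one
     * token&lt;tab&gt;df&lt;tab&gt;tf row per token in descending df order.
     */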
    private static void writeTopN(Path path, long totalDocs, long sumDocFreqs,
                                  long sumTotalTermFreqs, long uniqueTerms,
                                  AbstractTokenTFDFPriorityQueue queue) throws IOException {
        if (Files.isRegularFile(path)) {
            System.err.println("File " + path.getFileName() + " already exists. Skipping.");
            return;
        }
        Files.createDirectories(path.getParent());
        try (BufferedWriter writer = Files.newBufferedWriter(path, StandardCharsets.UTF_8)) {
            StringBuilder sb = new StringBuilder();
            writer.write(LICENSE);
            writer.write("#DOC_COUNT\t" + totalDocs + "\n");
            writer.write("#SUM_DOC_FREQS\t" + sumDocFreqs + "\n");
            writer.write("#SUM_TERM_FREQS\t" + sumTotalTermFreqs + "\n");
            writer.write("#UNIQUE_TERMS\t" + uniqueTerms + "\n");
            writer.write("#TOKEN\tDOCFREQ\tTERMFREQ\n");
            //add these tokens no matter what
            for (String t : INCLUDE_LIST) {
                writer.write(t);
                writer.newLine();
            }
            for (TokenDFTF tp : queue.getArray()) {
                writer.write(getRow(sb, tp) + "\n");
            }
            writer.flush();
        }
    }

    private static String getRow(StringBuilder sb, TokenDFTF tp) {
        sb.setLength(0);
        sb.append(clean(tp.token));
        sb.append("\t").append(tp.df);
        sb.append("\t").append(tp.tf);
        return sb.toString();
    }

    //collapse runs of whitespace to a single space so a token can't break the
    //tab-separated output format
    private static String clean(String s) {
        if (s == null) {
            return "";
        }
        return s.replaceAll("\\s+", " ").trim();
    }
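
    /**
     * Two passes over a temporary Lucene index: first stream every input line in
     * as a document (batched to bound memory), then iterate the index's term
     * dictionary, keeping the TOP_N tokens by document frequency in a bounded
     * priority queue. The temporary index is deleted in the finally block.
     */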
    private void execute(Path commonTokensFile, List<Path> inputFiles) throws Exception {
        Path luceneDir = Files.createTempDirectory("tika-eval-lucene-");
        AbstractTokenTFDFPriorityQueue queue = new TokenDFPriorityQueue(TOP_N);
        long totalDocs = -1;
        long sumDocFreqs = -1;
        long sumTotalTermFreqs = -1;
        long uniqueTerms = 0;
        try (Directory directory = FSDirectory.open(luceneDir)) {
            AnalyzerManager analyzerManager = AnalyzerManager.newInstance(-1);
            Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
            //flush buffered documents to the index whenever the accumulated
            //text length exceeds this many characters
            int maxLen = 1000000;
            int len = 0;
            try (IndexWriter writer = new IndexWriter(directory, indexWriterConfig)) {
                List<Document> docs = new ArrayList<>();
                for (Path inputFile : inputFiles) {
                    //Leipzig corpora sentence files ("*-sentences.txt") prefix
                    //each line with a numeric id and a tab; strip that prefix
                    boolean isLeipzig =
                            inputFile.getFileName().toString().contains("-sentences.txt");
                    int lines = 0;
                    try (BufferedReader reader = getReader(inputFile)) {
                        String line = reader.readLine();
                        while (line != null) {
                            if (isLeipzig) {
                                int tab = line.indexOf("\t");
                                if (tab > -1) {
                                    line = line.substring(tab + 1);
                                }
                            }
                            len += line.length();
                            Document document = new Document();
                            document.add(new TextField(FIELD, line, Field.Store.NO));
                            docs.add(document);
                            if (len > maxLen) {
                                writer.addDocuments(docs);
                                docs.clear();
                                len = 0;
                            }
                            line = reader.readLine();
                            if (++lines % 100000 == 0) {
                                System.out.println("processed " + lines + " for " +
                                        inputFile.getFileName() + " :: " +
                                        commonTokensFile.toAbsolutePath());
                            }
                        }
                    }
                }
                if (!docs.isEmpty()) {
                    writer.addDocuments(docs);
                }
                writer.commit();
                writer.flush();
            }
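
            //second pass: walk the term dictionary of the finished index,
            //collecting document frequency (df) and total term frequency (tf)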
            try (IndexReader reader = DirectoryReader.open(directory)) {
                //SlowCompositeReaderWrapper is assumed to be the copy that ships
                //alongside this class; recent Lucene cores no longer provide it
                LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader);
                totalDocs = wrappedReader.getDocCount(FIELD);
                sumDocFreqs = wrappedReader.getSumDocFreq(FIELD);
                sumTotalTermFreqs = wrappedReader.getSumTotalTermFreq(FIELD);
                Terms terms = wrappedReader.terms(FIELD);
                TermsEnum termsEnum = terms.iterator();
                BytesRef bytesRef = termsEnum.next();
                while (bytesRef != null) {
                    uniqueTerms++;
                    int df = termsEnum.docFreq();
                    long tf = termsEnum.totalTermFreq();
                    if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) {
                        bytesRef = termsEnum.next();
                        continue;
                    }
                    //only attempt the insert if the queue isn't full yet or this
                    //term's df is at least the queue's current minimum
                    if (queue.top() == null || queue.size() < TOP_N || df >= queue.top().df) {
                        String t = bytesRef.utf8ToString();
                        if (!SKIP_LIST.contains(t)) {
                            queue.insertWithOverflow(new TokenDFTF(t, df, tf));
                        }
                    }
                    bytesRef = termsEnum.next();
                }
            }
        } finally {
            FileUtils.deleteDirectory(luceneDir.toFile());
        }
        writeTopN(commonTokensFile, totalDocs, sumDocFreqs, sumTotalTermFreqs, uniqueTerms,
                queue);
    }

    //open a UTF-8 reader over the file, transparently decompressing .gz inputs
    private BufferedReader getReader(Path inputFile) throws IOException {
        InputStream is = Files.newInputStream(inputFile);
        if (inputFile.toString().endsWith(".gz")) {
            is = new GzipCompressorInputStream(is);
        }
        return new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
    }
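
    //Lucene's PriorityQueue is a bounded min-heap: top() returns the *least*
    //element, and insertWithOverflow() evicts the least element once the queue
    //is full, so a full queue retains the maxSize largest elements.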
    private abstract static class AbstractTokenTFDFPriorityQueue extends PriorityQueue<TokenDFTF> {

        AbstractTokenTFDFPriorityQueue(int maxSize) {
            super(maxSize);
        }

        public TokenDFTF[] getArray() {
            TokenDFTF[] topN = new TokenDFTF[size()];
            //pop() returns elements least-first, so fill the array from the
            //end to yield the tokens in descending order
            TokenDFTF term = pop();
            int i = topN.length - 1;
            while (term != null && i > -1) {
                topN[i--] = term;
                term = pop();
            }
            return topN;
        }
    }
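
    //immutable value type pairing a token with its document frequency (df)
    //and total term frequency (tf)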
    private static class TokenDFTF {

        final String token;
        final int df;
        final long tf;

        public TokenDFTF(String token, int df, long tf) {
            this.token = token;
            this.df = df;
            this.tf = tf;
        }

        public long getTF() {
            return tf;
        }

        public int getDF() {
            return df;
        }

        public String getToken() {
            return token;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (o == null || getClass() != o.getClass()) {
                return false;
            }
            TokenDFTF tokenDFTF = (TokenDFTF) o;
            if (df != tokenDFTF.df) {
                return false;
            }
            if (tf != tokenDFTF.tf) {
                return false;
            }
            return Objects.equals(token, tokenDFTF.token);
        }

        @Override
        public int hashCode() {
            int result = token != null ? token.hashCode() : 0;
            result = 31 * result + df;
            result = 31 * result + (int) (tf ^ (tf >>> 32));
            return result;
        }

        @Override
        public String toString() {
            return "TokenDFTF{" + "token='" + token + '\'' + ", df=" + df + ", tf=" + tf + '}';
        }
    }
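
    //orders primarily by document frequency so that the queue retains the
    //TOP_N tokens with the highest df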
    private static class TokenDFPriorityQueue extends AbstractTokenTFDFPriorityQueue {

        TokenDFPriorityQueue(int maxSize) {
            super(maxSize);
        }

        @Override
        protected boolean lessThan(TokenDFTF arg0, TokenDFTF arg1) {
            if (arg0.df < arg1.df) {
                return true;
            } else if (arg0.df > arg1.df) {
                return false;
            }
            //on a df tie, the lexicographically larger token ranks lower and is
            //evicted first
            return arg1.token.compareTo(arg0.token) < 0;
        }
    }
}