/src/main/java/de/jungblut/nlp/TokenizerUtils.java
package de.jungblut.nlp;

import com.google.common.base.Preconditions;

import de.jungblut.datastructure.ArrayUtils;
import de.jungblut.datastructure.StringPool;

import java.util.*;
import java.util.regex.Pattern;

/**
 * Nifty text utility, mainly for tokenization tasks.
 *
 * @author thomas.jungblut
 */
public final class TokenizerUtils {

  public static final String END_TAG = "<END>";
  public static final String START_TAG = "<START>";
  public static final String SEPARATORS = " \r\n\t.,;:'\"()?!\\-/|“„";
  private static final Pattern SEPARATORS_PATTERN = Pattern
      .compile("[ \r\n\t\\.,;:'\"()?!\\-/|“„]");
  private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

  private static final char[] CHARACTER_REPLACE_MAPPING = new char[256];
  static {
    int lowerDifference = 'a' - 'A';
    // map upper case A-Z to their lower case counterparts
    for (char i = 'A'; i <= 'Z'; i++) {
      CHARACTER_REPLACE_MAPPING[i] = (char) (i + lowerDifference);
    }
    // keep spaces, German umlauts and ß; upper case umlauts are lower cased
    CHARACTER_REPLACE_MAPPING[' '] = ' ';
    CHARACTER_REPLACE_MAPPING['ä'] = 'ä';
    CHARACTER_REPLACE_MAPPING['ö'] = 'ö';
    CHARACTER_REPLACE_MAPPING['ü'] = 'ü';
    CHARACTER_REPLACE_MAPPING['Ä'] = 'ä';
    CHARACTER_REPLACE_MAPPING['Ö'] = 'ö';
    CHARACTER_REPLACE_MAPPING['Ü'] = 'ü';
    CHARACTER_REPLACE_MAPPING['ß'] = 'ß';
    // keep digits and lower case a-z as they are
    for (char i = '0'; i <= '9'; i++) {
      CHARACTER_REPLACE_MAPPING[i] = i;
    }
    for (char i = 'a'; i <= 'z'; i++) {
      CHARACTER_REPLACE_MAPPING[i] = i;
    }
  }
  private static final Pattern NUMERIC_PATTERN = Pattern.compile("[0-9]");
  private static final CharSequence NON_BREAKING_WHITESPACE = ((char) 160) + "";

  private TokenizerUtils() {
    throw new IllegalAccessError();
  }
  /**
   * Applies the given regex replacement to every token and optionally removes
   * tokens that become empty.
   */
  public static String[] removeMatchingRegex(String regex, String replacement,
      String[] tokens, boolean removeEmpty) {
    String[] tk = new String[tokens.length];
    for (int i = 0; i < tokens.length; i++) {
      tk[i] = tokens[i].replaceAll(regex, replacement);
    }
    if (removeEmpty) {
      tk = removeEmpty(tk);
    }
    return tk;
  }
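
  /*
   * Usage sketch: strip digits from every token and drop tokens that end up
   * empty.
   *
   *   String[] cleaned = removeMatchingRegex("[0-9]", "",
   *       new String[] { "a1", "42", "b" }, true);
   *   // cleaned == ["a", "b"]
   */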
  /**
   * q-gram tokenizer, which is basically a proxy to
   * {@link #nShinglesTokenize(String, int)}. These are nGrams based on
   * characters. If you want to use normal word tokenizers, then use
   * {@link #wordTokenize(String)} for unigrams. To generate bigrams out of it
   * you need to call {@link #buildNGrams(String[], int)}.
   *
   * @param key the string to tokenize.
   * @param size the size of a single q-gram (number of characters).
   * @return the character q-grams of the given string.
   */
  public static String[] qGramTokenize(String key, int size) {
    return nShinglesTokenize(key, size);
  }
  /**
   * N-shingles tokenizer. N-Shingles are nGrams based on characters. If you
   * want to use normal word tokenizers, then use {@link #wordTokenize(String)}
   * for unigrams. To generate bigrams out of it you need to call
   * {@link #buildNGrams(String[], int)}.
   */
  public static String[] nShinglesTokenize(String key, int size) {
    if (key.length() < size) {
      return new String[] { key };
    }
    final int listSize = key.length() - size + 1;
    List<String> list = new ArrayList<>(listSize);
    for (int i = 0; i < listSize; i++) {
      int upperBound = i + size;
      list.add(new String(key.substring(i, upperBound)));
    }
    return list.toArray(new String[list.size()]);
  }
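
  /*
   * Usage sketch: character shingles of size two.
   *
   *   String[] shingles = nShinglesTokenize("hello", 2);
   *   // shingles == ["he", "el", "ll", "lo"]
   */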
  /**
   * Tokenizes on normal whitespaces ("\\s+" in java regex).
   */
  public static String[] whiteSpaceTokenize(String text) {
    return WHITESPACE_PATTERN.split(text);
  }
  /**
   * Deduplicates the given tokens, but maintains the order.
   */
  public static String[] deduplicateTokens(String[] tokens) {
    LinkedHashSet<String> set = new LinkedHashSet<>();
    Collections.addAll(set, tokens);
    return set.toArray(new String[set.size()]);
  }
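
  /*
   * Usage sketch: duplicates are dropped, first-occurrence order is kept.
   *
   *   String[] unique = deduplicateTokens(new String[] { "a", "b", "a", "c" });
   *   // unique == ["a", "b", "c"]
   */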
  /**
   * Tokenizes on several word separators (whitespace, punctuation and quotes),
   * see {@link #SEPARATORS} for the full character set.
   */
  public static String[] wordTokenize(String text) {
    return wordTokenize(text, false);
  }

  /**
   * Tokenizes like {@link #wordTokenize(String)} does, but keeps the
   * separators as their own tokens if the argument is true.
   */
  public static String[] wordTokenize(String text, boolean keepSeparators) {
    if (keepSeparators) {
      StringTokenizer tkns = new StringTokenizer(text, SEPARATORS, true);
      int countTokens = tkns.countTokens();
      String[] toReturn = new String[countTokens];
      int i = 0;
      while (countTokens-- > 0) {
        toReturn[i] = tkns.nextToken();
        // only advance for non-whitespace tokens, so whitespace separators are
        // overwritten by the next token
        if (toReturn[i].charAt(0) > ' ') {
          i++;
        }
      }
      return Arrays.copyOf(toReturn, i);
    } else {
      return SEPARATORS_PATTERN.split(text);
    }
  }
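
  /*
   * Usage sketch: default tokenization vs. keeping separators.
   *
   *   wordTokenize("foo-bar baz")       // -> ["foo", "bar", "baz"]
   *   wordTokenize("foo-bar baz", true) // -> ["foo", "-", "bar", "baz"]
   */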
  /**
   * Tokenizes on the word separators matched by the given regex.
   */
  public static String[] wordTokenize(String text, String regex) {
    return text.split(regex);
  }
  /**
   * Normalizes the tokens:<br/>
   * - lower cases <br/>
   * - removes non-alphanumeric characters (since I'm German, äöüß are kept as
   * well).
   */
  public static String[] normalizeTokens(String[] tokens, boolean removeEmpty) {
    for (int i = 0; i < tokens.length; i++) {
      tokens[i] = normalizeString(tokens[i]);
    }
    if (removeEmpty) {
      tokens = removeEmpty(tokens);
    }
    return tokens;
  }

  /**
   * Normalizes the token:<br/>
   * - lower cases <br/>
   * - removes non-alphanumeric characters (since I'm German, äöüß are kept as
   * well).
   */
  public static String normalizeString(String token) {
    char[] charArray = token.toCharArray();
    char[] toReturn = new char[charArray.length];
    int index = 0;
    for (int i = 0; i < charArray.length; i++) {
      char x = charArray[i];
      if (x < CHARACTER_REPLACE_MAPPING.length) {
        // unmapped characters are zero in the table and thus dropped
        if (CHARACTER_REPLACE_MAPPING[x] > 0) {
          toReturn[index++] = CHARACTER_REPLACE_MAPPING[x];
        }
      }
    }
    return String.valueOf(Arrays.copyOf(toReturn, index));
  }
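
  /*
   * Usage sketch: lower-casing, umlauts kept, punctuation dropped.
   *
   *   String normalized = normalizeString("Käse-Brot 42!");
   *   // normalized == "käsebrot 42"
   */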
  /**
   * Removes null and empty tokens from the given array; the remaining tokens
   * are compacted into a new array.
   */
  public static String[] removeEmpty(String[] arr) {
    ArrayList<String> list = new ArrayList<>();
    for (String s : arr) {
      if (s != null && !s.isEmpty())
        list.add(s);
    }
    return list.toArray(new String[list.size()]);
  }
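
  /*
   * Usage sketch:
   *
   *   String[] compact = removeEmpty(new String[] { "a", "", null, "b" });
   *   // compact == ["a", "b"]
   */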
  /**
   * This tokenizer first splits on whitespace and then builds n-grams of the
   * given size out of the resulting words.
   */
  public static String[] whiteSpaceTokenizeNGrams(String text, int size) {
    String[] whiteSpaceTokenize = whiteSpaceTokenize(text);
    return buildNGrams(whiteSpaceTokenize, size);
  }

  /**
   * Builds n-grams of the given size by joining consecutive tokens with a
   * single space.
   */
  public static String[] buildNGrams(String[] tokens, int size) {
    if (tokens.length < size) {
      return tokens;
    }
    List<String> list = new ArrayList<>();
    final int endIndex = tokens.length - size + 1;
    for (int i = 0; i < endIndex; i++) {
      StringBuilder tkn = new StringBuilder(tokens[i]);
      final int tokenEndIndex = (i + size);
      for (int j = i + 1; j < tokenEndIndex; j++) {
        tkn.append(' ');
        tkn.append(tokens[j]);
      }
      list.add(tkn.toString());
    }
    return list.toArray(new String[list.size()]);
  }
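
  /*
   * Usage sketch: word bigrams.
   *
   *   String[] bigrams = buildNGrams(new String[] { "to", "be", "or", "not" }, 2);
   *   // bigrams == ["to be", "be or", "or not"]
   */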
  /**
   * Builds ngrams from a range of tokens, basically a concat of all the
   * {@link #buildNGrams(String[], int)} calls within the range. Both start and
   * end are inclusive.
   */
  public static String[] buildNGramsRange(String[] tokens, int startSize,
      int endSize) {
    String[] tkn = buildNGrams(tokens, startSize);
    for (int i = startSize + 1; i <= endSize; i++) {
      tkn = ArrayUtils.concat(tkn, buildNGrams(tokens, i));
    }
    return tkn;
  }
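
  /*
   * Usage sketch: unigrams and bigrams in one pass.
   *
   *   String[] grams = buildNGramsRange(new String[] { "a", "b", "c" }, 1, 2);
   *   // grams == ["a", "b", "c", "a b", "b c"]
   */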
  /**
   * Interns the given strings in place.
   *
   * @param strings the strings to intern.
   * @return the same array with interned strings.
   */
  public static String[] internStrings(String[] strings) {
    for (int i = 0; i < strings.length; i++) {
      strings[i] = strings[i].intern();
    }
    return strings;
  }

  /**
   * Interns the given strings in place with the given pool.
   *
   * @param strings the strings to intern.
   * @param pool the string pool to use.
   * @return the same array with pooled strings.
   */
  public static String[] internStrings(String[] strings, StringPool pool) {
    Preconditions.checkNotNull(pool, "Pool shouldn't be null!");
    for (int i = 0; i < strings.length; i++) {
      strings[i] = pool.pool(strings[i]);
    }
    return strings;
  }
  /**
   * Adds the {@link #START_TAG} to the beginning of the array and the
   * {@link #END_TAG} to its end.
   */
  public static String[] addStartAndEndTags(String[] unigram) {
    String[] tmp = new String[unigram.length + 2];
    System.arraycopy(unigram, 0, tmp, 1, unigram.length);
    tmp[0] = START_TAG;
    tmp[tmp.length - 1] = END_TAG;
    return tmp;
  }
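
  /*
   * Usage sketch:
   *
   *   String[] tagged = addStartAndEndTags(new String[] { "a", "b" });
   *   // tagged == ["<START>", "a", "b", "<END>"]
   */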
  /**
   * Concats the given tokens with the given delimiter.
   */
  public static String concat(String[] tokens, String delimiter) {
    final int finalIndex = tokens.length - 1;
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < tokens.length; i++) {
      sb.append(tokens[i]);
      if (i != finalIndex) {
        sb.append(delimiter);
      }
    }
    return sb.toString();
  }
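
  /*
   * Usage sketch:
   *
   *   String joined = concat(new String[] { "a", "b", "c" }, ", ");
   *   // joined == "a, b, c"
   */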
  /**
   * Replaces every digit (0-9) with "#".
   */
  public static String[] numericsToHash(String[] tokens) {
    String[] toReturn = new String[tokens.length];
    for (int i = 0; i < tokens.length; i++) {
      toReturn[i] = NUMERIC_PATTERN.matcher(tokens[i]).replaceAll("#");
    }
    return toReturn;
  }
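
  /*
   * Usage sketch: every digit is replaced individually.
   *
   *   String[] hashed = numericsToHash(new String[] { "user42", "2018" });
   *   // hashed == ["user##", "####"]
   */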
  /**
   * Trims the tokens using {@link String#trim()} and additionally removes
   * non-breaking spaces.
   */
  public static String[] trim(String[] tokens) {
    String[] toReturn = new String[tokens.length];
    for (int i = 0; i < tokens.length; i++) {
      // removes leading/trailing whitespace and all non-breaking spaces
      toReturn[i] = tokens[i].trim().replace(NON_BREAKING_WHITESPACE, "");
    }
    return toReturn;
  }

}