
/src/main/java/de/jungblut/nlp/TokenizerUtils.java

http://github.com/thomasjungblut/thomasjungblut-common
Java | 341 lines
Possible License(s): Apache-2.0, BSD-3-Clause
package de.jungblut.nlp;

import com.google.common.base.Preconditions;

import de.jungblut.datastructure.ArrayUtils;
import de.jungblut.datastructure.StringPool;

import java.util.*;
import java.util.regex.Pattern;

/**
 * Nifty text utility, mainly for tokenization tasks.
 * 
 * @author thomas.jungblut
 */
public final class TokenizerUtils {
  public static final String END_TAG = "<END>";
  public static final String START_TAG = "<START>";
  public static final String SEPARATORS = " \r\n\t.,;:'\"()?!\\-/|“„";

  private static final Pattern SEPARATORS_PATTERN = Pattern
      .compile("[ \r\n\t\\.,;:'\"()?!\\-/|“„]");
  private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

  // maps a character to its normalized replacement; a zero entry means the
  // character gets removed during normalization
  private static final char[] CHARACTER_REPLACE_MAPPING = new char[256];

  static {
    int lowerDifference = 'a' - 'A';

    // lowercase A-Z
    for (char i = 'A'; i <= 'Z'; i++) {
      CHARACTER_REPLACE_MAPPING[i] = (char) (i + lowerDifference);
    }

    // keep spaces, German umlauts and ß; lowercase the uppercase umlauts
    CHARACTER_REPLACE_MAPPING[' '] = ' ';
    CHARACTER_REPLACE_MAPPING['ä'] = 'ä';
    CHARACTER_REPLACE_MAPPING['ö'] = 'ö';
    CHARACTER_REPLACE_MAPPING['ü'] = 'ü';
    CHARACTER_REPLACE_MAPPING['Ä'] = 'ä';
    CHARACTER_REPLACE_MAPPING['Ö'] = 'ö';
    CHARACTER_REPLACE_MAPPING['Ü'] = 'ü';
    CHARACTER_REPLACE_MAPPING['ß'] = 'ß';

    // keep digits and lowercase letters as they are
    for (char i = '0'; i <= '9'; i++) {
      CHARACTER_REPLACE_MAPPING[i] = i;
    }
    for (char i = 'a'; i <= 'z'; i++) {
      CHARACTER_REPLACE_MAPPING[i] = i;
    }
  }

  private static final Pattern NUMERIC_PATTERN = Pattern.compile("[0-9]");
  // U+00A0, the non-breaking space
  private static final CharSequence NON_BREAKING_WHITESPACE = ((char) 160) + "";
  private TokenizerUtils() {
    throw new IllegalAccessError();
  }
  /**
   * Applies the given regex on the tokens, replacing matches with the given
   * replacement, and optionally removes tokens that become empty.
   */
  public static String[] removeMatchingRegex(String regex, String replacement,
      String[] tokens, boolean removeEmpty) {
    String[] tk = new String[tokens.length];
    for (int i = 0; i < tokens.length; i++) {
      tk[i] = tokens[i].replaceAll(regex, replacement);
    }
    if (removeEmpty) {
      tk = removeEmpty(tk);
    }
    return tk;
  }
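  // Illustrative usage (my own sketch, not part of the original source):
  // removeMatchingRegex("[0-9]", "", new String[] { "a1", "22" }, true)
  // => {"a"} ("22" collapses to the empty string and is removed)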
  /**
   * q-gram tokenizer, which is basically a proxy to
   * {@link #nShinglesTokenize(String, int)}. These are nGrams based on
   * characters. If you want to use normal word tokenizers, then use
   * {@link #wordTokenize(String)} for unigrams. To generate bigrams out of it
   * you need to call {@link #buildNGrams(String[], int)}.
   * 
   * @param key the string to tokenize.
   * @param size the size of the q-grams.
   * @return an array of q-grams (character nGrams).
   */
  public static String[] qGramTokenize(String key, int size) {
    return nShinglesTokenize(key, size);
  }
  /**
   * N-shingles tokenizer. N-Shingles are nGrams based on characters. If you
   * want to use normal word tokenizers, then use {@link #wordTokenize(String)}
   * for unigrams. To generate bigrams out of it you need to call
   * {@link #buildNGrams(String[], int)}.
   */
  public static String[] nShinglesTokenize(String key, int size) {
    if (key.length() < size) {
      return new String[] { key };
    }
    final int listSize = key.length() - size + 1;
    List<String> list = new ArrayList<>(listSize);
    for (int i = 0; i < listSize; i++) {
      // slide a window of the given size over the characters; substring
      // copies since Java 7u6, so no defensive new String(...) is needed
      int upperBound = i + size;
      list.add(key.substring(i, upperBound));
    }
    return list.toArray(new String[list.size()]);
  }
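  // Illustrative usage (my own sketch, not part of the original source):
  // nShinglesTokenize("hello", 3) => {"hel", "ell", "llo"}
  // nShinglesTokenize("hi", 3) => {"hi"} (input shorter than the shingle size)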
  /**
   * Tokenizes on normal whitespace, "\\s+" in Java regex.
   */
  public static String[] whiteSpaceTokenize(String text) {
    return WHITESPACE_PATTERN.split(text);
  }
  /**
   * Deduplicates the given tokens while maintaining their order of first
   * occurrence.
   */
  public static String[] deduplicateTokens(String[] tokens) {
    LinkedHashSet<String> set = new LinkedHashSet<>();
    Collections.addAll(set, tokens);
    return set.toArray(new String[set.size()]);
  }
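  // Illustrative usage (my own sketch, not part of the original source):
  // deduplicateTokens(new String[] { "a", "b", "a" }) => {"a", "b"}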
  /**
   * Tokenizes on several word separators, see {@link #SEPARATORS} for the
   * characters used.
   */
  public static String[] wordTokenize(String text) {
    return wordTokenize(text, false);
  }
  /**
   * Tokenizes like {@link #wordTokenize(String)} does, but keeps the
   * separators as their own tokens if the argument is true.
   */
  public static String[] wordTokenize(String text, boolean keepSeperators) {
    if (keepSeperators) {
      StringTokenizer tkns = new StringTokenizer(text, SEPARATORS, true);
      int countTokens = tkns.countTokens();
      String[] toReturn = new String[countTokens];
      int i = 0;
      while (countTokens-- > 0) {
        toReturn[i] = tkns.nextToken();
        // only keep separator tokens that are not whitespace/control chars
        if (toReturn[i].charAt(0) > ' ') {
          i++;
        }
      }
      return Arrays.copyOf(toReturn, i);
    } else {
      return SEPARATORS_PATTERN.split(text);
    }
  }
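  // Illustrative usage (my own sketch, not part of the original source):
  // wordTokenize("a,b c", true) => {"a", ",", "b", "c"}
  // (the whitespace separator is dropped, "," is kept as its own token)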
  /**
   * Tokenizes on several word separators; the regex that detects them must be
   * given.
   */
  public static String[] wordTokenize(String text, String regex) {
    return text.split(regex);
  }
  /**
   * Normalizes the tokens:<br/>
   * - lower cases <br/>
   * - removes non-alphanumeric characters (since I'm German, äöüß are included
   * as well).
   */
  public static String[] normalizeTokens(String[] tokens, boolean removeEmpty) {
    for (int i = 0; i < tokens.length; i++) {
      tokens[i] = normalizeString(tokens[i]);
    }
    if (removeEmpty) {
      tokens = removeEmpty(tokens);
    }
    return tokens;
  }
  /**
   * Normalizes the token:<br/>
   * - lower cases <br/>
   * - removes non-alphanumeric characters (since I'm German, äöüß are included
   * as well).
   */
  public static String normalizeString(String token) {
    char[] charArray = token.toCharArray();
    char[] toReturn = new char[charArray.length];
    int index = 0;
    for (int i = 0; i < charArray.length; i++) {
      char x = charArray[i];
      // only characters within the mapping table can survive; a zero mapping
      // drops the character entirely
      if (x < CHARACTER_REPLACE_MAPPING.length) {
        if (CHARACTER_REPLACE_MAPPING[x] > 0) {
          toReturn[index++] = CHARACTER_REPLACE_MAPPING[x];
        }
      }
    }
    return String.valueOf(Arrays.copyOf(toReturn, index));
  }
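  // Illustrative usage (my own sketch, not part of the original source):
  // normalizeString("Hello, World!") => "hello world"
  // normalizeString("Äpfel 123") => "äpfel 123"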
  /**
   * Removes null and empty tokens from the given array. The remaining tokens
   * close the gaps, so their order is preserved.
   */
  public static String[] removeEmpty(String[] arr) {
    ArrayList<String> list = new ArrayList<>();
    for (String s : arr) {
      if (s != null && !s.isEmpty()) {
        list.add(s);
      }
    }
    return list.toArray(new String[list.size()]);
  }
  /**
   * This tokenizer first splits on whitespace and then concatenates the words
   * based on the given size.
   */
  public static String[] whiteSpaceTokenizeNGrams(String text, int size) {
    String[] whiteSpaceTokenize = whiteSpaceTokenize(text);
    return buildNGrams(whiteSpaceTokenize, size);
  }
  /**
   * This tokenizer uses the given tokens and concatenates the words based on
   * the given size.
   */
  public static String[] buildNGrams(String[] tokens, int size) {
    if (tokens.length < size) {
      return tokens;
    }
    List<String> list = new ArrayList<>();
    final int endIndex = tokens.length - size + 1;
    for (int i = 0; i < endIndex; i++) {
      // join the next 'size' tokens with single spaces
      StringBuilder tkn = new StringBuilder(tokens[i]);
      final int tokenEndIndex = (i + size);
      for (int j = i + 1; j < tokenEndIndex; j++) {
        tkn.append(' ');
        tkn.append(tokens[j]);
      }
      list.add(tkn.toString());
    }
    return list.toArray(new String[list.size()]);
  }
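  // Illustrative usage (my own sketch, not part of the original source):
  // buildNGrams(new String[] { "the", "quick", "fox" }, 2)
  // => {"the quick", "quick fox"}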
  /**
   * Builds nGrams from a range of sizes, basically a concatenation of all the
   * {@link #buildNGrams(String[], int)} calls within the range. Both start and
   * end size are inclusive.
   */
  public static String[] buildNGramsRange(String[] tokens, int startSize,
      int endSize) {
    String[] tkn = buildNGrams(tokens, startSize);
    for (int i = startSize + 1; i <= endSize; i++) {
      tkn = ArrayUtils.concat(tkn, buildNGrams(tokens, i));
    }
    return tkn;
  }
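  // Illustrative usage (my own sketch, assuming ArrayUtils.concat appends the
  // second array after the first; not part of the original source):
  // buildNGramsRange(new String[] { "a", "b", "c" }, 1, 2)
  // => {"a", "b", "c", "a b", "b c"}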
  /**
   * Interns the given strings in place.
   * 
   * @param strings the strings to intern.
   * @return the same array, with every string interned.
   */
  public static String[] internStrings(String[] strings) {
    for (int i = 0; i < strings.length; i++) {
      strings[i] = strings[i].intern();
    }
    return strings;
  }
  /**
   * Interns the given strings in place with the given pool.
   * 
   * @param strings the strings to intern.
   * @param pool the string pool to use.
   * @return the same array, with every string pooled.
   */
  public static String[] internStrings(String[] strings, StringPool pool) {
    Preconditions.checkNotNull(pool, "Pool shouldn't be null!");
    for (int i = 0; i < strings.length; i++) {
      strings[i] = pool.pool(strings[i]);
    }
    return strings;
  }
  /**
   * Adds {@link #START_TAG} to the beginning of the array and {@link #END_TAG}
   * to the end.
   */
  public static String[] addStartAndEndTags(String[] unigram) {
    String[] tmp = new String[unigram.length + 2];
    System.arraycopy(unigram, 0, tmp, 1, unigram.length);
    tmp[0] = START_TAG;
    tmp[tmp.length - 1] = END_TAG;
    return tmp;
  }
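  // Illustrative usage (my own sketch, not part of the original source):
  // addStartAndEndTags(new String[] { "hello" })
  // => {"<START>", "hello", "<END>"}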
  /**
   * Concatenates the given tokens with the given delimiter.
   */
  public static String concat(String[] tokens, String delimiter) {
    final int finalIndex = tokens.length - 1;
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < tokens.length; i++) {
      sb.append(tokens[i]);
      // no trailing delimiter after the last token
      if (i != finalIndex) {
        sb.append(delimiter);
      }
    }
    return sb.toString();
  }
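  // Illustrative usage (my own sketch, not part of the original source):
  // concat(new String[] { "a", "b" }, "-") => "a-b"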
  /**
   * Replaces all numeric characters with "#".
   */
  public static String[] numericsToHash(String[] tokens) {
    String[] toReturn = new String[tokens.length];
    for (int i = 0; i < tokens.length; i++) {
      toReturn[i] = NUMERIC_PATTERN.matcher(tokens[i]).replaceAll("#");
    }
    return toReturn;
  }
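  // Illustrative usage (my own sketch, not part of the original source):
  // numericsToHash(new String[] { "route66" }) => {"route##"}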
  /**
   * Trims the tokens using {@link String#trim()} and additionally removes
   * non-breaking spaces.
   */
  public static String[] trim(String[] tokens) {
    String[] toReturn = new String[tokens.length];
    for (int i = 0; i < tokens.length; i++) {
      // removes surrounding spaces and embedded non-breaking spaces
      toReturn[i] = tokens[i].trim().replace(NON_BREAKING_WHITESPACE, "");
    }
    return toReturn;
  }

}