/core/carrot2-util-text/src/org/carrot2/text/preprocessing/Tokenizer.java

https://github.com/thinrope/carrot2 · Java · 225 lines · 128 code · 29 blank · 68 comment · 10 complexity · 7098229baa705ec5a27e9d91c98d738a MD5 · raw file

  1. /*
  2. * Carrot2 project.
  3. *
  4. * Copyright (C) 2002-2010, Dawid Weiss, Stanisław Osiński.
  5. * All rights reserved.
  6. *
  7. * Refer to the full license file "carrot2.LICENSE"
  8. * in the root folder of the repository checkout or at:
  9. * http://www.carrot2.org/carrot2.LICENSE
  10. */
  11. package org.carrot2.text.preprocessing;
  12. import java.io.IOException;
  13. import java.io.StringReader;
  14. import java.util.ArrayList;
  15. import java.util.Arrays;
  16. import java.util.Collection;
  17. import java.util.Iterator;
  18. import java.util.List;
  19. import org.apache.commons.lang.StringUtils;
  20. import org.carrot2.core.Document;
  21. import org.carrot2.core.ProcessingException;
  22. import org.carrot2.core.attribute.Init;
  23. import org.carrot2.text.analysis.ITokenizer;
  24. import org.carrot2.text.preprocessing.PreprocessingContext.AllFields;
  25. import org.carrot2.text.preprocessing.PreprocessingContext.AllTokens;
  26. import org.carrot2.text.util.MutableCharArray;
  27. import org.carrot2.util.CharArrayUtils;
  28. import org.carrot2.util.ExceptionUtils;
  29. import org.carrot2.util.attribute.Attribute;
  30. import org.carrot2.util.attribute.Bindable;
  31. import org.carrot2.util.attribute.Input;
  32. import com.carrotsearch.hppc.ByteArrayList;
  33. import com.carrotsearch.hppc.IntArrayList;
  34. import com.carrotsearch.hppc.ShortArrayList;
  35. import com.google.common.collect.Lists;
  36. /**
  37. * Performs tokenization of documents.
  38. * <p>
  39. * This class saves the following results to the {@link PreprocessingContext}:
  40. * <ul>
  41. * <li>{@link AllTokens#image}</li>
  42. * <li>{@link AllTokens#documentIndex}</li>
  43. * <li>{@link AllTokens#fieldIndex}</li>
  44. * <li>{@link AllTokens#type}</li>
  45. * </ul>
  46. */
  47. @Bindable(prefix = "Tokenizer")
  48. public final class Tokenizer
  49. {
  50. /**
  51. * Textual fields of documents that should be tokenized and parsed for clustering.
  52. *
  53. * @level Advanced
  54. * @group Preprocessing
  55. * @label Document fields
  56. */
  57. @Init
  58. @Input
  59. @Attribute
  60. public Collection<String> documentFields = Arrays.asList(new String []
  61. {
  62. Document.TITLE, Document.SUMMARY
  63. });
  64. /**
  65. * Token images.
  66. */
  67. private ArrayList<char []> images;
  68. /**
  69. * An array of token types.
  70. *
  71. * @see ITokenizer
  72. */
  73. private ShortArrayList tokenTypes;
  74. /**
  75. * An array of document indexes.
  76. */
  77. private IntArrayList documentIndices;
  78. /**
  79. * An array of field indexes.
  80. *
  81. * @see AllFields
  82. */
  83. private ByteArrayList fieldIndices;
  84. /**
  85. * Performs tokenization and saves the results to the <code>context</code>.
  86. */
  87. public void tokenize(PreprocessingContext context)
  88. {
  89. // Documents to tokenize
  90. final List<Document> documents = context.documents;
  91. // Fields to tokenize
  92. final String [] fieldNames = documentFields.toArray(new String [documentFields.size()]);
  93. if (fieldNames.length > 8)
  94. {
  95. throw new ProcessingException("Maximum number of tokenized fields is 8.");
  96. }
  97. // Prepare arrays
  98. images = Lists.newArrayList();
  99. tokenTypes = new ShortArrayList();
  100. documentIndices = new IntArrayList();
  101. fieldIndices = new ByteArrayList();
  102. final Iterator<Document> docIterator = documents.iterator();
  103. int documentIndex = 0;
  104. final ITokenizer ts = context.language.getTokenizer();
  105. final MutableCharArray wrapper = new MutableCharArray(CharArrayUtils.EMPTY_ARRAY);
  106. while (docIterator.hasNext())
  107. {
  108. final Document doc = docIterator.next();
  109. boolean hadTokens = false;
  110. for (int i = 0; i < fieldNames.length; i++)
  111. {
  112. final byte fieldIndex = (byte) i;
  113. final String fieldName = fieldNames[i];
  114. final String fieldValue = doc.getField(fieldName);
  115. if (!StringUtils.isEmpty(fieldValue))
  116. {
  117. try
  118. {
  119. short tokenType;
  120. ts.reset(new StringReader(fieldValue));
  121. if ((tokenType = ts.nextToken()) != ITokenizer.TT_EOF)
  122. {
  123. if (hadTokens) addFieldSeparator(documentIndex);
  124. do
  125. {
  126. ts.setTermBuffer(wrapper);
  127. add(documentIndex, fieldIndex, context.intern(wrapper), tokenType);
  128. } while ( (tokenType = ts.nextToken()) != ITokenizer.TT_EOF);
  129. hadTokens = true;
  130. }
  131. }
  132. catch (IOException e)
  133. {
  134. // Not possible (StringReader above)?
  135. throw ExceptionUtils.wrapAsRuntimeException(e);
  136. }
  137. }
  138. }
  139. if (docIterator.hasNext())
  140. {
  141. addDocumentSeparator();
  142. }
  143. documentIndex++;
  144. }
  145. addTerminator();
  146. // Save results in the PreprocessingContext
  147. context.allTokens.documentIndex = documentIndices.toArray();
  148. context.allTokens.fieldIndex = fieldIndices.toArray();
  149. context.allTokens.image = images.toArray(new char [images.size()] []);
  150. context.allTokens.type = tokenTypes.toArray();
  151. context.allFields.name = fieldNames;
  152. // Clean up
  153. images = null;
  154. fieldIndices = null;
  155. tokenTypes = null;
  156. documentIndices = null;
  157. }
  158. /**
  159. * Adds a special terminating token required at the very end of all documents.
  160. */
  161. void addTerminator()
  162. {
  163. add(-1, (byte) -1, null, ITokenizer.TF_TERMINATOR);
  164. }
  165. /**
  166. * Adds a document separator to the lists.
  167. */
  168. void addDocumentSeparator()
  169. {
  170. add(-1, (byte) -1, null, ITokenizer.TF_SEPARATOR_DOCUMENT);
  171. }
  172. /**
  173. * Adds a field separator to the lists.
  174. */
  175. void addFieldSeparator(int documentIndex)
  176. {
  177. add(documentIndex, (byte) -1, null, ITokenizer.TF_SEPARATOR_FIELD);
  178. }
  179. /**
  180. * Adds a sentence separator to the lists.
  181. */
  182. void addSentenceSeparator(int documentIndex, byte fieldIndex)
  183. {
  184. add(documentIndex, fieldIndex, null, ITokenizer.TF_SEPARATOR_FIELD);
  185. }
  186. /**
  187. * Adds custom token code to the sequence. May be used to add separator constants.
  188. */
  189. void add(int documentIndex, byte fieldIndex, char [] image, short tokenTypeCode)
  190. {
  191. documentIndices.add(documentIndex);
  192. fieldIndices.add(fieldIndex);
  193. images.add(image);
  194. tokenTypes.add(tokenTypeCode);
  195. }
  196. }