PageRenderTime 26ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/projects/weka-3-6-9/weka-src/src/main/java/weka/filters/unsupervised/attribute/StringToWordVector.java

https://gitlab.com/essere.lab.public/qualitas.class-corpus
Java | 1614 lines | 755 code | 200 blank | 659 comment | 146 complexity | 55adbef0958b1dd930ada38e11d6e655 MD5 | raw file
  1. /*
  2. * This program is free software; you can redistribute it and/or modify
  3. * it under the terms of the GNU General Public License as published by
  4. * the Free Software Foundation; either version 2 of the License, or
  5. * (at your option) any later version.
  6. *
  7. * This program is distributed in the hope that it will be useful,
  8. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. * GNU General Public License for more details.
  11. *
  12. * You should have received a copy of the GNU General Public License
  13. * along with this program; if not, write to the Free Software
  14. * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. */
  16. /*
  17. * StringToWordVector.java
  18. * Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
  19. *
  20. */
  21. package weka.filters.unsupervised.attribute;
  22. import weka.core.Attribute;
  23. import weka.core.Capabilities;
  24. import weka.core.FastVector;
  25. import weka.core.Instance;
  26. import weka.core.Instances;
  27. import weka.core.Option;
  28. import weka.core.OptionHandler;
  29. import weka.core.Range;
  30. import weka.core.RevisionHandler;
  31. import weka.core.RevisionUtils;
  32. import weka.core.SelectedTag;
  33. import weka.core.SparseInstance;
  34. import weka.core.Stopwords;
  35. import weka.core.Tag;
  36. import weka.core.Utils;
  37. import weka.core.Capabilities.Capability;
  38. import weka.core.stemmers.NullStemmer;
  39. import weka.core.stemmers.Stemmer;
  40. import weka.core.tokenizers.Tokenizer;
  41. import weka.core.tokenizers.WordTokenizer;
  42. import weka.filters.Filter;
  43. import weka.filters.UnsupervisedFilter;
  44. import java.io.File;
  45. import java.io.Serializable;
  46. import java.util.Enumeration;
  47. import java.util.Hashtable;
  48. import java.util.Iterator;
  49. import java.util.TreeMap;
  50. import java.util.Vector;
  51. /**
  52. <!-- globalinfo-start -->
  53. * Converts String attributes into a set of attributes representing word occurrence (depending on the tokenizer) information from the text contained in the strings. The set of words (attributes) is determined by the first batch filtered (typically training data).
  54. * <p/>
  55. <!-- globalinfo-end -->
  56. *
  57. <!-- options-start -->
  58. * Valid options are: <p/>
  59. *
  60. * <pre> -C
  61. * Output word counts rather than boolean word presence.
  62. * </pre>
  63. *
  64. * <pre> -R &lt;index1,index2-index4,...&gt;
  65. * Specify list of string attributes to convert to words (as weka Range).
  66. * (default: select all string attributes)</pre>
  67. *
  68. * <pre> -V
  69. * Invert matching sense of column indexes.</pre>
  70. *
  71. * <pre> -P &lt;attribute name prefix&gt;
  72. * Specify a prefix for the created attribute names.
  73. * (default: "")</pre>
  74. *
  75. * <pre> -W &lt;number of words to keep&gt;
  76. * Specify approximate number of word fields to create.
  77. * Surplus words will be discarded..
  78. * (default: 1000)</pre>
  79. *
  80. * <pre> -prune-rate &lt;rate as a percentage of dataset&gt;
  81. * Specify the rate (e.g., every 10% of the input dataset) at which to periodically prune the dictionary.
  82. * -W prunes after creating a full dictionary. You may not have enough memory for this approach.
  83. * (default: no periodic pruning)</pre>
  84. *
  85. * <pre> -T
  86. * Transform the word frequencies into log(1+fij)
  87. * where fij is the frequency of word i in jth document(instance).
  88. * </pre>
  89. *
  90. * <pre> -I
  91. * Transform each word frequency into:
  92. * fij*log(num of Documents/num of documents containing word i)
  93. * where fij if frequency of word i in jth document(instance)</pre>
  94. *
  95. * <pre> -N
  96. * Whether to 0=not normalize/1=normalize all data/2=normalize test data only
  97. * to average length of training documents (default 0=don't normalize).</pre>
  98. *
  99. * <pre> -L
  100. * Convert all tokens to lowercase before adding to the dictionary.</pre>
  101. *
  102. * <pre> -S
  103. * Ignore words that are in the stoplist.</pre>
  104. *
  105. * <pre> -stemmer &lt;spec&gt;
  106. * The stemmering algorihtm (classname plus parameters) to use.</pre>
  107. *
  108. * <pre> -M &lt;int&gt;
  109. * The minimum term frequency (default = 1).</pre>
  110. *
  111. * <pre> -O
  112. * If this is set, the maximum number of words and the
  113. * minimum term frequency is not enforced on a per-class
  114. * basis but based on the documents in all the classes
  115. * (even if a class attribute is set).</pre>
  116. *
  117. * <pre> -stopwords &lt;file&gt;
  118. * A file containing stopwords to override the default ones.
  119. * Using this option automatically sets the flag ('-S') to use the
  120. * stoplist if the file exists.
  121. * Format: one stopword per line, lines starting with '#'
  122. * are interpreted as comments and ignored.</pre>
  123. *
  124. * <pre> -tokenizer &lt;spec&gt;
  125. * The tokenizing algorihtm (classname plus parameters) to use.
  126. * (default: weka.core.tokenizers.WordTokenizer)</pre>
  127. *
  128. <!-- options-end -->
  129. *
  130. * @author Len Trigg (len@reeltwo.com)
  131. * @author Stuart Inglis (stuart@reeltwo.com)
  132. * @author Gordon Paynter (gordon.paynter@ucr.edu)
  133. * @author Asrhaf M. Kibriya (amk14@cs.waikato.ac.nz)
  134. * @version $Revision: 9004 $
  135. * @see Stopwords
  136. */
public class StringToWordVector
  extends Filter
  implements UnsupervisedFilter, OptionHandler {

  /** for serialization. */
  static final long serialVersionUID = 8249106275278565424L;

  /** Range of columns to convert to word vectors. */
  protected Range m_SelectedRange = new Range("first-last");

  /** Contains a mapping of valid words to attribute indexes. */
  private TreeMap m_Dictionary = new TreeMap();

  /** True if output instances should contain word frequency rather than
      boolean 0 or 1. */
  private boolean m_OutputCounts = false;

  /** A String prefix for the attribute names. */
  private String m_Prefix = "";

  /** Contains the number of documents (instances) a particular word appears in.
      The counts are stored with the same indexing as given by m_Dictionary. */
  private int [] m_DocsCounts;

  /** Contains the number of documents (instances) in the input format from
      which the dictionary is created. It is used in the IDF transform.
      -1 until the first batch has been processed. */
  private int m_NumInstances = -1;

  /**
   * Contains the average length of documents (among the first batch of
   * instances aka training data). This is used in length normalization of
   * documents which will be normalized to average document length.
   * -1 until computed from the first batch.
   */
  private double m_AvgDocLength = -1;

  /**
   * The default number of words (per class if there is a class attribute
   * assigned) to attempt to keep.
   */
  private int m_WordsToKeep = 1000;

  /**
   * The percentage at which to periodically prune the dictionary.
   * -1 means no periodic pruning.
   */
  private double m_PeriodicPruningRate = -1;

  /** True if word frequencies should be transformed into log(1+fi)
      where fi is the frequency of word i. */
  private boolean m_TFTransform;

  /** The normalization to apply (one of the FILTER_* constants below). */
  protected int m_filterType = FILTER_NONE;

  /** normalization: No normalization. */
  public static final int FILTER_NONE = 0;
  /** normalization: Normalize all data. */
  public static final int FILTER_NORMALIZE_ALL = 1;
  /** normalization: Normalize test data only. */
  public static final int FILTER_NORMALIZE_TEST_ONLY = 2;

  /** Specifies whether document's (instance's) word frequencies are
   * to be normalized. They are normalized to the average length of the
   * documents specified as input format. */
  public static final Tag [] TAGS_FILTER = {
    new Tag(FILTER_NONE, "No normalization"),
    new Tag(FILTER_NORMALIZE_ALL, "Normalize all data"),
    new Tag(FILTER_NORMALIZE_TEST_ONLY, "Normalize test data only"),
  };

  /** True if word frequencies should be transformed into
      fij*log(numOfDocs/numOfDocsWithWordi). */
  private boolean m_IDFTransform;

  /** True if all tokens should be downcased. */
  private boolean m_lowerCaseTokens;

  /** True if tokens that are on a stoplist are to be ignored. */
  private boolean m_useStoplist;

  /** the stemming algorithm (NullStemmer = no stemming). */
  private Stemmer m_Stemmer = new NullStemmer();

  /** the minimum (per-class) word frequency. */
  private int m_minTermFreq = 1;

  /** whether to operate on a per-class basis. */
  private boolean m_doNotOperateOnPerClassBasis = false;

  /** a file containing stopwords for using others than the default Rainbow
   * ones. Defaults to the current working directory; getOptions() treats a
   * directory here as "no custom stopword file set". */
  private File m_Stopwords = new File(System.getProperty("user.dir"));

  /** the tokenizer algorithm to use. */
  private Tokenizer m_Tokenizer = new WordTokenizer();

  /**
   * Default constructor. Targets 1000 words in the output.
   */
  public StringToWordVector() {
  }
  214. /**
  215. * Returns an enumeration describing the available options.
  216. *
  217. * @return an enumeration of all the available options
  218. */
  219. public Enumeration listOptions() {
  220. Vector result = new Vector();
  221. result.addElement(new Option(
  222. "\tOutput word counts rather than boolean word presence.\n",
  223. "C", 0, "-C"));
  224. result.addElement(new Option(
  225. "\tSpecify list of string attributes to convert to words (as weka Range).\n"
  226. + "\t(default: select all string attributes)",
  227. "R", 1, "-R <index1,index2-index4,...>"));
  228. result.addElement(new Option(
  229. "\tInvert matching sense of column indexes.",
  230. "V", 0, "-V"));
  231. result.addElement(new Option(
  232. "\tSpecify a prefix for the created attribute names.\n"
  233. + "\t(default: \"\")",
  234. "P", 1, "-P <attribute name prefix>"));
  235. result.addElement(new Option(
  236. "\tSpecify approximate number of word fields to create.\n"
  237. + "\tSurplus words will be discarded..\n"
  238. + "\t(default: 1000)",
  239. "W", 1, "-W <number of words to keep>"));
  240. result.addElement(new Option(
  241. "\tSpecify the rate (e.g., every 10% of the input dataset) at which to periodically prune the dictionary.\n"
  242. + "\t-W prunes after creating a full dictionary. You may not have enough memory for this approach.\n"
  243. + "\t(default: no periodic pruning)",
  244. "prune-rate", 1, "-prune-rate <rate as a percentage of dataset>"));
  245. result.addElement(new Option(
  246. "\tTransform the word frequencies into log(1+fij)\n"+
  247. "\twhere fij is the frequency of word i in jth document(instance).\n",
  248. "T", 0, "-T"));
  249. result.addElement(new Option(
  250. "\tTransform each word frequency into:\n"+
  251. "\tfij*log(num of Documents/num of documents containing word i)\n"+
  252. "\t where fij if frequency of word i in jth document(instance)",
  253. "I", 0, "-I"));
  254. result.addElement(new Option(
  255. "\tWhether to 0=not normalize/1=normalize all data/2=normalize test data only\n"
  256. + "\tto average length of training documents "
  257. + "(default 0=don\'t normalize).",
  258. "N", 1, "-N"));
  259. result.addElement(new Option(
  260. "\tConvert all tokens to lowercase before "+
  261. "adding to the dictionary.",
  262. "L", 0, "-L"));
  263. result.addElement(new Option(
  264. "\tIgnore words that are in the stoplist.",
  265. "S", 0, "-S"));
  266. result.addElement(new Option(
  267. "\tThe stemmering algorihtm (classname plus parameters) to use.",
  268. "stemmer", 1, "-stemmer <spec>"));
  269. result.addElement(new Option(
  270. "\tThe minimum term frequency (default = 1).",
  271. "M", 1, "-M <int>"));
  272. result.addElement(new Option(
  273. "\tIf this is set, the maximum number of words and the \n"
  274. + "\tminimum term frequency is not enforced on a per-class \n"
  275. + "\tbasis but based on the documents in all the classes \n"
  276. + "\t(even if a class attribute is set).",
  277. "O", 0, "-O"));
  278. result.addElement(new Option(
  279. "\tA file containing stopwords to override the default ones.\n"
  280. + "\tUsing this option automatically sets the flag ('-S') to use the\n"
  281. + "\tstoplist if the file exists.\n"
  282. + "\tFormat: one stopword per line, lines starting with '#'\n"
  283. + "\tare interpreted as comments and ignored.",
  284. "stopwords", 1, "-stopwords <file>"));
  285. result.addElement(new Option(
  286. "\tThe tokenizing algorihtm (classname plus parameters) to use.\n"
  287. + "\t(default: " + WordTokenizer.class.getName() + ")",
  288. "tokenizer", 1, "-tokenizer <spec>"));
  289. return result.elements();
  290. }
  291. /**
  292. * Parses a given list of options. <p/>
  293. *
  294. <!-- options-start -->
  295. * Valid options are: <p/>
  296. *
  297. * <pre> -C
  298. * Output word counts rather than boolean word presence.
  299. * </pre>
  300. *
  301. * <pre> -R &lt;index1,index2-index4,...&gt;
  302. * Specify list of string attributes to convert to words (as weka Range).
  303. * (default: select all string attributes)</pre>
  304. *
  305. * <pre> -V
  306. * Invert matching sense of column indexes.</pre>
  307. *
  308. * <pre> -P &lt;attribute name prefix&gt;
  309. * Specify a prefix for the created attribute names.
  310. * (default: "")</pre>
  311. *
  312. * <pre> -W &lt;number of words to keep&gt;
  313. * Specify approximate number of word fields to create.
  314. * Surplus words will be discarded..
  315. * (default: 1000)</pre>
  316. *
  317. * <pre> -prune-rate &lt;rate as a percentage of dataset&gt;
  318. * Specify the rate (e.g., every 10% of the input dataset) at which to periodically prune the dictionary.
  319. * -W prunes after creating a full dictionary. You may not have enough memory for this approach.
  320. * (default: no periodic pruning)</pre>
  321. *
  322. * <pre> -T
  323. * Transform the word frequencies into log(1+fij)
  324. * where fij is the frequency of word i in jth document(instance).
  325. * </pre>
  326. *
  327. * <pre> -I
  328. * Transform each word frequency into:
  329. * fij*log(num of Documents/num of documents containing word i)
  330. * where fij if frequency of word i in jth document(instance)</pre>
  331. *
  332. * <pre> -N
  333. * Whether to 0=not normalize/1=normalize all data/2=normalize test data only
  334. * to average length of training documents (default 0=don't normalize).</pre>
  335. *
  336. * <pre> -L
  337. * Convert all tokens to lowercase before adding to the dictionary.</pre>
  338. *
  339. * <pre> -S
  340. * Ignore words that are in the stoplist.</pre>
  341. *
  342. * <pre> -stemmer &lt;spec&gt;
  343. * The stemmering algorihtm (classname plus parameters) to use.</pre>
  344. *
  345. * <pre> -M &lt;int&gt;
  346. * The minimum term frequency (default = 1).</pre>
  347. *
  348. * <pre> -O
  349. * If this is set, the maximum number of words and the
  350. * minimum term frequency is not enforced on a per-class
  351. * basis but based on the documents in all the classes
  352. * (even if a class attribute is set).</pre>
  353. *
  354. * <pre> -stopwords &lt;file&gt;
  355. * A file containing stopwords to override the default ones.
  356. * Using this option automatically sets the flag ('-S') to use the
  357. * stoplist if the file exists.
  358. * Format: one stopword per line, lines starting with '#'
  359. * are interpreted as comments and ignored.</pre>
  360. *
  361. * <pre> -tokenizer &lt;spec&gt;
  362. * The tokenizing algorihtm (classname plus parameters) to use.
  363. * (default: weka.core.tokenizers.WordTokenizer)</pre>
  364. *
  365. <!-- options-end -->
  366. *
  367. * @param options the list of options as an array of strings
  368. * @throws Exception if an option is not supported
  369. */
  370. public void setOptions(String[] options) throws Exception {
  371. String value;
  372. value = Utils.getOption('R', options);
  373. if (value.length() != 0)
  374. setSelectedRange(value);
  375. else
  376. setSelectedRange("first-last");
  377. setInvertSelection(Utils.getFlag('V', options));
  378. value = Utils.getOption('P', options);
  379. if (value.length() != 0)
  380. setAttributeNamePrefix(value);
  381. else
  382. setAttributeNamePrefix("");
  383. value = Utils.getOption('W', options);
  384. if (value.length() != 0)
  385. setWordsToKeep(Integer.valueOf(value).intValue());
  386. else
  387. setWordsToKeep(1000);
  388. value = Utils.getOption("prune-rate", options);
  389. if (value.length() > 0)
  390. setPeriodicPruning(Double.parseDouble(value));
  391. else
  392. setPeriodicPruning(-1);
  393. value = Utils.getOption('M', options);
  394. if (value.length() != 0)
  395. setMinTermFreq(Integer.valueOf(value).intValue());
  396. else
  397. setMinTermFreq(1);
  398. setOutputWordCounts(Utils.getFlag('C', options));
  399. setTFTransform(Utils.getFlag('T', options));
  400. setIDFTransform(Utils.getFlag('I', options));
  401. setDoNotOperateOnPerClassBasis(Utils.getFlag('O', options));
  402. String nString = Utils.getOption('N', options);
  403. if (nString.length() != 0)
  404. setNormalizeDocLength(new SelectedTag(Integer.parseInt(nString), TAGS_FILTER));
  405. else
  406. setNormalizeDocLength(new SelectedTag(FILTER_NONE, TAGS_FILTER));
  407. setLowerCaseTokens(Utils.getFlag('L', options));
  408. setUseStoplist(Utils.getFlag('S', options));
  409. String stemmerString = Utils.getOption("stemmer", options);
  410. if (stemmerString.length() == 0) {
  411. setStemmer(null);
  412. }
  413. else {
  414. String[] stemmerSpec = Utils.splitOptions(stemmerString);
  415. if (stemmerSpec.length == 0)
  416. throw new Exception("Invalid stemmer specification string");
  417. String stemmerName = stemmerSpec[0];
  418. stemmerSpec[0] = "";
  419. Stemmer stemmer = (Stemmer) Class.forName(stemmerName).newInstance();
  420. if (stemmer instanceof OptionHandler)
  421. ((OptionHandler) stemmer).setOptions(stemmerSpec);
  422. setStemmer(stemmer);
  423. }
  424. value = Utils.getOption("stopwords", options);
  425. if (value.length() != 0)
  426. setStopwords(new File(value));
  427. else
  428. setStopwords(null);
  429. String tokenizerString = Utils.getOption("tokenizer", options);
  430. if (tokenizerString.length() == 0) {
  431. setTokenizer(new WordTokenizer());
  432. }
  433. else {
  434. String[] tokenizerSpec = Utils.splitOptions(tokenizerString);
  435. if (tokenizerSpec.length == 0)
  436. throw new Exception("Invalid tokenizer specification string");
  437. String tokenizerName = tokenizerSpec[0];
  438. tokenizerSpec[0] = "";
  439. Tokenizer tokenizer = (Tokenizer) Class.forName(tokenizerName).newInstance();
  440. if (tokenizer instanceof OptionHandler)
  441. ((OptionHandler) tokenizer).setOptions(tokenizerSpec);
  442. setTokenizer(tokenizer);
  443. }
  444. }
  445. /**
  446. * Gets the current settings of the filter.
  447. *
  448. * @return an array of strings suitable for passing to setOptions
  449. */
  450. public String[] getOptions() {
  451. Vector result;
  452. result = new Vector();
  453. result.add("-R");
  454. result.add(getSelectedRange().getRanges());
  455. if (getInvertSelection())
  456. result.add("-V");
  457. if (!"".equals(getAttributeNamePrefix())) {
  458. result.add("-P");
  459. result.add(getAttributeNamePrefix());
  460. }
  461. result.add("-W");
  462. result.add(String.valueOf(getWordsToKeep()));
  463. result.add("-prune-rate");
  464. result.add(String.valueOf(getPeriodicPruning()));
  465. if (getOutputWordCounts())
  466. result.add("-C");
  467. if (getTFTransform())
  468. result.add("-T");
  469. if (getIDFTransform())
  470. result.add("-I");
  471. result.add("-N");
  472. result.add("" + m_filterType);
  473. if (getLowerCaseTokens())
  474. result.add("-L");
  475. if (getUseStoplist())
  476. result.add("-S");
  477. if (getStemmer() != null) {
  478. result.add("-stemmer");
  479. String spec = getStemmer().getClass().getName();
  480. if (getStemmer() instanceof OptionHandler)
  481. spec += " " + Utils.joinOptions(
  482. ((OptionHandler) getStemmer()).getOptions());
  483. result.add(spec.trim());
  484. }
  485. result.add("-M");
  486. result.add(String.valueOf(getMinTermFreq()));
  487. if (getDoNotOperateOnPerClassBasis())
  488. result.add("-O");
  489. if (!getStopwords().isDirectory()) {
  490. result.add("-stopwords");
  491. result.add(getStopwords().getAbsolutePath());
  492. }
  493. result.add("-tokenizer");
  494. String spec = getTokenizer().getClass().getName();
  495. if (getTokenizer() instanceof OptionHandler)
  496. spec += " " + Utils.joinOptions(
  497. ((OptionHandler) getTokenizer()).getOptions());
  498. result.add(spec.trim());
  499. return (String[]) result.toArray(new String[result.size()]);
  500. }
  /**
   * Constructor that allows specification of the target number of words
   * in the output.
   *
   * @param wordsToKeep the number of words in the output vector (per class
   * if assigned).
   */
  public StringToWordVector(int wordsToKeep) {
    m_WordsToKeep = wordsToKeep;
  }
  /**
   * Used to store word counts for dictionary selection based on
   * a threshold.
   */
  private class Count
    implements Serializable, RevisionHandler {

    /** for serialization. */
    static final long serialVersionUID = 2157223818584474321L;

    /** the counts: term count and document count. docCount is not set by the
        constructor — presumably it is maintained by the dictionary-building
        code (not visible in this block). */
    public int count, docCount;

    /**
     * the constructor.
     *
     * @param c the initial term count
     */
    public Count(int c) {
      count = c;
    }

    /**
     * Returns the revision string.
     *
     * @return the revision
     */
    public String getRevision() {
      return RevisionUtils.extract("$Revision: 9004 $");
    }
  }
  538. /**
  539. * Returns the Capabilities of this filter.
  540. *
  541. * @return the capabilities of this object
  542. * @see Capabilities
  543. */
  544. public Capabilities getCapabilities() {
  545. Capabilities result = super.getCapabilities();
  546. result.disableAll();
  547. // attributes
  548. result.enableAllAttributes();
  549. result.enable(Capability.MISSING_VALUES);
  550. // class
  551. result.enableAllClasses();
  552. result.enable(Capability.MISSING_CLASS_VALUES);
  553. result.enable(Capability.NO_CLASS);
  554. return result;
  555. }
  556. /**
  557. * Sets the format of the input instances.
  558. *
  559. * @param instanceInfo an Instances object containing the input
  560. * instance structure (any instances contained in the object are
  561. * ignored - only the structure is required).
  562. * @return true if the outputFormat may be collected immediately
  563. * @throws Exception if the input format can't be set
  564. * successfully
  565. */
  566. public boolean setInputFormat(Instances instanceInfo)
  567. throws Exception {
  568. super.setInputFormat(instanceInfo);
  569. m_SelectedRange.setUpper(instanceInfo.numAttributes() - 1);
  570. m_AvgDocLength = -1;
  571. m_NumInstances = -1;
  572. return false;
  573. }
  574. /**
  575. * Input an instance for filtering. Filter requires all
  576. * training instances be read before producing output.
  577. *
  578. * @param instance the input instance.
  579. * @return true if the filtered instance may now be
  580. * collected with output().
  581. * @throws IllegalStateException if no input structure has been defined.
  582. */
  583. public boolean input(Instance instance) throws Exception {
  584. if (getInputFormat() == null) {
  585. throw new IllegalStateException("No input instance format defined");
  586. }
  587. if (m_NewBatch) {
  588. resetQueue();
  589. m_NewBatch = false;
  590. }
  591. if (isFirstBatchDone()) {
  592. FastVector fv = new FastVector();
  593. int firstCopy = convertInstancewoDocNorm(instance, fv);
  594. Instance inst = (Instance)fv.elementAt(0);
  595. if (m_filterType != FILTER_NONE) {
  596. normalizeInstance(inst, firstCopy);
  597. }
  598. push(inst);
  599. return true;
  600. } else {
  601. bufferInput(instance);
  602. return false;
  603. }
  604. }
  /**
   * Signify that this batch of input to the filter is finished.
   * If the filter requires all instances prior to filtering,
   * output() may now be called to retrieve the filtered instances.
   * On the first batch this builds the dictionary, converts all buffered
   * instances, computes the average document length (if normalizing) and
   * pushes the results to the output queue.
   *
   * @return true if there are instances pending output.
   * @throws IllegalStateException if no input structure has been defined.
   */
  public boolean batchFinished() throws Exception {
    if (getInputFormat() == null) {
      throw new IllegalStateException("No input instance format defined");
    }

    // We only need to do something in this method
    // if the first batch hasn't been processed. Otherwise
    // input() has already done all the work.
    if (!isFirstBatchDone()) {

      // Determine the dictionary from the first batch (training data)
      determineDictionary();

      // Convert all instances w/o normalization
      FastVector fv = new FastVector();
      int firstCopy = 0;
      for(int i = 0; i < m_NumInstances; i++) {
        firstCopy = convertInstancewoDocNorm(getInputFormat().instance(i), fv);
      }
      // NOTE(review): firstCopy keeps only the value returned for the last
      // instance; presumably it is the same for every instance (the index of
      // the first generated word attribute) — confirm against
      // convertInstancewoDocNorm.

      // Need to compute average document length if necessary
      if (m_filterType != FILTER_NONE) {
        m_AvgDocLength = 0;
        for(int i = 0; i < fv.size(); i++) {
          Instance inst = (Instance) fv.elementAt(i);
          // Document "length" here is the Euclidean norm of the word-frequency
          // part of the sparse instance (indexes >= firstCopy).
          double docLength = 0;
          for(int j = 0; j < inst.numValues(); j++) {
            if(inst.index(j) >= firstCopy) {
              docLength += inst.valueSparse(j) * inst.valueSparse(j);
            }
          }
          m_AvgDocLength += Math.sqrt(docLength);
        }
        m_AvgDocLength /= m_NumInstances;
      }

      // Perform normalization if necessary. (FILTER_NORMALIZE_TEST_ONLY
      // deliberately skips the training batch; test batches are normalized
      // in input().)
      if (m_filterType == FILTER_NORMALIZE_ALL) {
        for(int i = 0; i < fv.size(); i++) {
          normalizeInstance((Instance) fv.elementAt(i), firstCopy);
        }
      }

      // Push all instances into the output queue
      for(int i = 0; i < fv.size(); i++) {
        push((Instance) fv.elementAt(i));
      }
    }

    // Flush the input
    flushInput();
    m_NewBatch = true;
    m_FirstBatchDone = true;
    return (numPendingOutput() != 0);
  }
  661. /**
  662. * Returns a string describing this filter.
  663. *
  664. * @return a description of the filter suitable for
  665. * displaying in the explorer/experimenter gui
  666. */
  667. public String globalInfo() {
  668. return
  669. "Converts String attributes into a set of attributes representing "
  670. + "word occurrence (depending on the tokenizer) information from the "
  671. + "text contained in the strings. The set of words (attributes) is "
  672. + "determined by the first batch filtered (typically training data).";
  673. }
  /**
   * Gets whether output instances contain 0 or 1 indicating word
   * presence, or word counts.
   *
   * @return true if word counts should be output.
   */
  public boolean getOutputWordCounts() {
    return m_OutputCounts;
  }

  /**
   * Sets whether output instances contain 0 or 1 indicating word
   * presence, or word counts.
   *
   * @param outputWordCounts true if word counts should be output.
   */
  public void setOutputWordCounts(boolean outputWordCounts) {
    m_OutputCounts = outputWordCounts;
  }
  692. /**
  693. * Returns the tip text for this property.
  694. *
  695. * @return tip text for this property suitable for
  696. * displaying in the explorer/experimenter gui
  697. */
  698. public String outputWordCountsTipText() {
  699. return "Output word counts rather than boolean 0 or 1"+
  700. "(indicating presence or absence of a word).";
  701. }
  /**
   * Get the value of m_SelectedRange.
   *
   * @return Value of m_SelectedRange.
   */
  public Range getSelectedRange() {
    return m_SelectedRange;
  }

  /**
   * Set the value of m_SelectedRange.
   * Note: this replaces the Range object entirely, so any previously set
   * invert flag is reset.
   *
   * @param newSelectedRange Value to assign to m_SelectedRange.
   */
  public void setSelectedRange(String newSelectedRange) {
    m_SelectedRange = new Range(newSelectedRange);
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String attributeIndicesTipText() {
    return "Specify range of attributes to act on."
      + " This is a comma separated list of attribute indices, with"
      + " \"first\" and \"last\" valid values. Specify an inclusive"
      + " range with \"-\". E.g: \"first-3,5,6-10,last\".";
  }

  /**
   * Gets the current range selection.
   *
   * @return a string containing a comma separated list of ranges
   */
  public String getAttributeIndices() {
    return m_SelectedRange.getRanges();
  }

  /**
   * Sets which attributes are to be worked on.
   *
   * @param rangeList a string representing the list of attributes. Since
   * the string will typically come from a user, attributes are indexed from
   * 1. <br>
   * eg: first-3,5,6-last
   * @throws IllegalArgumentException if an invalid range list is supplied
   */
  public void setAttributeIndices(String rangeList) {
    m_SelectedRange.setRanges(rangeList);
  }

  /**
   * Sets which attributes are to be processed.
   *
   * @param attributes an array containing indexes of attributes to process.
   * Since the array will typically come from a program, attributes are indexed
   * from 0.
   * @throws IllegalArgumentException if an invalid set of ranges
   * is supplied
   */
  public void setAttributeIndicesArray(int[] attributes) {
    setAttributeIndices(Range.indicesToRangeList(attributes));
  }
  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String invertSelectionTipText() {
    return "Set attribute selection mode. If false, only selected"
      + " attributes in the range will be worked on; if"
      + " true, only non-selected attributes will be processed.";
  }

  /**
   * Gets whether the supplied columns are to be processed or skipped.
   *
   * @return true if the supplied columns will be kept
   */
  public boolean getInvertSelection() {
    return m_SelectedRange.getInvert();
  }

  /**
   * Sets whether selected columns should be processed or skipped.
   *
   * @param invert the new invert setting
   */
  public void setInvertSelection(boolean invert) {
    m_SelectedRange.setInvert(invert);
  }

  /**
   * Get the attribute name prefix.
   *
   * @return The current attribute name prefix.
   */
  public String getAttributeNamePrefix() {
    return m_Prefix;
  }

  /**
   * Set the attribute name prefix.
   *
   * @param newPrefix String to use as the attribute name prefix.
   */
  public void setAttributeNamePrefix(String newPrefix) {
    m_Prefix = newPrefix;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String attributeNamePrefixTipText() {
    return "Prefix for the created attribute names. "+
      "(default: \"\")";
  }
  815. /**
  816. * Gets the number of words (per class if there is a class attribute
  817. * assigned) to attempt to keep.
  818. *
  819. * @return the target number of words in the output vector (per class if
  820. * assigned).
  821. */
  822. public int getWordsToKeep() {
  823. return m_WordsToKeep;
  824. }
  825. /**
  826. * Sets the number of words (per class if there is a class attribute
  827. * assigned) to attempt to keep.
  828. *
  829. * @param newWordsToKeep the target number of words in the output
  830. * vector (per class if assigned).
  831. */
  832. public void setWordsToKeep(int newWordsToKeep) {
  833. m_WordsToKeep = newWordsToKeep;
  834. }
  835. /**
  836. * Returns the tip text for this property.
  837. *
  838. * @return tip text for this property suitable for
  839. * displaying in the explorer/experimenter gui
  840. */
  841. public String wordsToKeepTipText() {
  842. return "The number of words (per class if there is a class attribute "+
  843. "assigned) to attempt to keep.";
  844. }
  845. /**
  846. * Gets the rate at which the dictionary is periodically pruned, as a
  847. * percentage of the dataset size.
  848. *
  849. * @return the rate at which the dictionary is periodically pruned
  850. */
  851. public double getPeriodicPruning() {
  852. return m_PeriodicPruningRate;
  853. }
  854. /**
  855. * Sets the rate at which the dictionary is periodically pruned, as a
  856. * percentage of the dataset size.
  857. *
  858. * @param newPeriodicPruning the rate at which the dictionary is periodically pruned
  859. */
  860. public void setPeriodicPruning(double newPeriodicPruning) {
  861. m_PeriodicPruningRate = newPeriodicPruning;
  862. }
  863. /**
  864. * Returns the tip text for this property.
  865. *
  866. * @return tip text for this property suitable for
  867. * displaying in the explorer/experimenter gui
  868. */
  869. public String periodicPruningTipText() {
  870. return "Specify the rate (x% of the input dataset) at which to periodically prune the dictionary. "
  871. + "wordsToKeep prunes after creating a full dictionary. You may not have enough "
  872. + "memory for this approach.";
  873. }
  874. /** Gets whether if the word frequencies should be transformed into
  875. * log(1+fij) where fij is the frequency of word i in document(instance) j.
  876. *
  877. * @return true if word frequencies are to be transformed.
  878. */
  879. public boolean getTFTransform() {
  880. return this.m_TFTransform;
  881. }
  882. /** Sets whether if the word frequencies should be transformed into
  883. * log(1+fij) where fij is the frequency of word i in document(instance) j.
  884. *
  885. * @param TFTransform true if word frequencies are to be transformed.
  886. */
  887. public void setTFTransform(boolean TFTransform) {
  888. this.m_TFTransform = TFTransform;
  889. }
  890. /**
  891. * Returns the tip text for this property.
  892. *
  893. * @return tip text for this property suitable for
  894. * displaying in the explorer/experimenter gui
  895. */
  896. public String TFTransformTipText() {
  897. return "Sets whether if the word frequencies should be transformed into:\n "+
  898. " log(1+fij) \n"+
  899. " where fij is the frequency of word i in document (instance) j.";
  900. }
  901. /** Sets whether if the word frequencies in a document should be transformed
  902. * into: <br>
  903. * fij*log(num of Docs/num of Docs with word i) <br>
  904. * where fij is the frequency of word i in document(instance) j.
  905. *
  906. * @return true if the word frequencies are to be transformed.
  907. */
  908. public boolean getIDFTransform() {
  909. return this.m_IDFTransform;
  910. }
  911. /** Sets whether if the word frequencies in a document should be transformed
  912. * into: <br>
  913. * fij*log(num of Docs/num of Docs with word i) <br>
  914. * where fij is the frequency of word i in document(instance) j.
  915. *
  916. * @param IDFTransform true if the word frequecies are to be transformed
  917. */
  918. public void setIDFTransform(boolean IDFTransform) {
  919. this.m_IDFTransform = IDFTransform;
  920. }
  921. /**
  922. * Returns the tip text for this property.
  923. *
  924. * @return tip text for this property suitable for
  925. * displaying in the explorer/experimenter gui
  926. */
  927. public String IDFTransformTipText() {
  928. return "Sets whether if the word frequencies in a document should be "+
  929. "transformed into: \n"+
  930. " fij*log(num of Docs/num of Docs with word i) \n"+
  931. " where fij is the frequency of word i in document (instance) j.";
  932. }
  933. /** Gets whether if the word frequencies for a document (instance) should
  934. * be normalized or not.
  935. *
  936. * @return true if word frequencies are to be normalized.
  937. */
  938. public SelectedTag getNormalizeDocLength() {
  939. return new SelectedTag(m_filterType, TAGS_FILTER);
  940. }
  941. /** Sets whether if the word frequencies for a document (instance) should
  942. * be normalized or not.
  943. *
  944. * @param newType the new type.
  945. */
  946. public void setNormalizeDocLength(SelectedTag newType) {
  947. if (newType.getTags() == TAGS_FILTER) {
  948. m_filterType = newType.getSelectedTag().getID();
  949. }
  950. }
  951. /**
  952. * Returns the tip text for this property.
  953. *
  954. * @return tip text for this property suitable for
  955. * displaying in the explorer/experimenter gui
  956. */
  957. public String normalizeDocLengthTipText() {
  958. return "Sets whether if the word frequencies for a document (instance) "+
  959. "should be normalized or not.";
  960. }
  961. /** Gets whether if the tokens are to be downcased or not.
  962. *
  963. * @return true if the tokens are to be downcased.
  964. */
  965. public boolean getLowerCaseTokens() {
  966. return this.m_lowerCaseTokens;
  967. }
  968. /** Sets whether if the tokens are to be downcased or not. (Doesn't affect
  969. * non-alphabetic characters in tokens).
  970. *
  971. * @param downCaseTokens should be true if only lower case tokens are
  972. * to be formed.
  973. */
  974. public void setLowerCaseTokens(boolean downCaseTokens) {
  975. this.m_lowerCaseTokens = downCaseTokens;
  976. }
  977. /**
  978. * Returns the tip text for this property.
  979. *
  980. * @return tip text for this property suitable for
  981. * displaying in the explorer/experimenter gui
  982. */
  983. public String doNotOperateOnPerClassBasisTipText() {
  984. return "If this is set, the maximum number of words and the "
  985. + "minimum term frequency is not enforced on a per-class "
  986. + "basis but based on the documents in all the classes "
  987. + "(even if a class attribute is set).";
  988. }
  989. /**
  990. * Get the DoNotOperateOnPerClassBasis value.
  991. * @return the DoNotOperateOnPerClassBasis value.
  992. */
  993. public boolean getDoNotOperateOnPerClassBasis() {
  994. return m_doNotOperateOnPerClassBasis;
  995. }
  996. /**
  997. * Set the DoNotOperateOnPerClassBasis value.
  998. * @param newDoNotOperateOnPerClassBasis The new DoNotOperateOnPerClassBasis value.
  999. */
  1000. public void setDoNotOperateOnPerClassBasis(boolean newDoNotOperateOnPerClassBasis) {
  1001. this.m_doNotOperateOnPerClassBasis = newDoNotOperateOnPerClassBasis;
  1002. }
  1003. /**
  1004. * Returns the tip text for this property.
  1005. *
  1006. * @return tip text for this property suitable for
  1007. * displaying in the explorer/experimenter gui
  1008. */
  1009. public String minTermFreqTipText() {
  1010. return "Sets the minimum term frequency. This is enforced "
  1011. + "on a per-class basis.";
  1012. }
  1013. /**
  1014. * Get the MinTermFreq value.
  1015. * @return the MinTermFreq value.
  1016. */
  1017. public int getMinTermFreq() {
  1018. return m_minTermFreq;
  1019. }
  1020. /**
  1021. * Set the MinTermFreq value.
  1022. * @param newMinTermFreq The new MinTermFreq value.
  1023. */
  1024. public void setMinTermFreq(int newMinTermFreq) {
  1025. this.m_minTermFreq = newMinTermFreq;
  1026. }
  1027. /**
  1028. * Returns the tip text for this property.
  1029. *
  1030. * @return tip text for this property suitable for
  1031. * displaying in the explorer/experimenter gui
  1032. */
  1033. public String lowerCaseTokensTipText() {
  1034. return "If set then all the word tokens are converted to lower case "+
  1035. "before being added to the dictionary.";
  1036. }
  1037. /** Gets whether if the words on the stoplist are to be ignored (The stoplist
  1038. * is in weka.core.StopWords).
  1039. *
  1040. * @return true if the words on the stoplist are to be ignored.
  1041. */
  1042. public boolean getUseStoplist() {
  1043. return m_useStoplist;
  1044. }
  1045. /** Sets whether if the words that are on a stoplist are to be ignored (The
  1046. * stop list is in weka.core.StopWords).
  1047. *
  1048. * @param useStoplist true if the tokens that are on a stoplist are to be
  1049. * ignored.
  1050. */
  1051. public void setUseStoplist(boolean useStoplist) {
  1052. m_useStoplist = useStoplist;
  1053. }
  1054. /**
  1055. * Returns the tip text for this property.
  1056. *
  1057. * @return tip text for this property suitable for
  1058. * displaying in the explorer/experimenter gui
  1059. */
  1060. public String useStoplistTipText() {
  1061. return "Ignores all the words that are on the stoplist, if set to true.";
  1062. }
  1063. /**
  1064. * the stemming algorithm to use, null means no stemming at all (i.e., the
  1065. * NullStemmer is used).
  1066. *
  1067. * @param value the configured stemming algorithm, or null
  1068. * @see NullStemmer
  1069. */
  1070. public void setStemmer(Stemmer value) {
  1071. if (value != null)
  1072. m_Stemmer = value;
  1073. else
  1074. m_Stemmer = new NullStemmer();
  1075. }
  1076. /**
  1077. * Returns the current stemming algorithm, null if none is used.
  1078. *
  1079. * @return the current stemming algorithm, null if none set
  1080. */
  1081. public Stemmer getStemmer() {
  1082. return m_Stemmer;
  1083. }
  1084. /**
  1085. * Returns the tip text for this property.
  1086. *
  1087. * @return tip text for this property suitable for
  1088. * displaying in the explorer/experimenter gui
  1089. */
  1090. public String stemmerTipText() {
  1091. return "The stemming algorithm to use on the words.";
  1092. }
  1093. /**
  1094. * sets the file containing the stopwords, null or a directory unset the
  1095. * stopwords. If the file exists, it automatically turns on the flag to
  1096. * use the stoplist.
  1097. *
  1098. * @param value the file containing the stopwords
  1099. */
  1100. public void setStopwords(File value) {
  1101. if (value == null)
  1102. value = new File(System.getProperty("user.dir"));
  1103. m_Stopwords = value;
  1104. if (value.exists() && value.isFile())
  1105. setUseStoplist(true);
  1106. }
  1107. /**
  1108. * returns the file used for obtaining the stopwords, if the file represents
  1109. * a directory then the default ones are used.
  1110. *
  1111. * @return the file containing the stopwords
  1112. */
  1113. public File getStopwords() {
  1114. return m_Stopwords;
  1115. }
  1116. /**
  1117. * Returns the tip text for this property.
  1118. *
  1119. * @return tip text for this property suitable for
  1120. * displaying in the explorer/experimenter gui
  1121. */
  1122. public String stopwordsTipText() {
  1123. return "The file containing the stopwords (if this is a directory then the default ones are used).";
  1124. }
  1125. /**
  1126. * the tokenizer algorithm to use.
  1127. *
  1128. * @param value the configured tokenizing algorithm
  1129. */
  1130. public void setTokenizer(Tokenizer value) {
  1131. m_Tokenizer = value;
  1132. }
  1133. /**
  1134. * Returns the current tokenizer algorithm.
  1135. *
  1136. * @return the current tokenizer algorithm
  1137. */
  1138. public Tokenizer getTokenizer() {
  1139. return m_Tokenizer;
  1140. }
  1141. /**
  1142. * Returns the tip text for this property.
  1143. *
  1144. * @return tip text for this property suitable for
  1145. * displaying in the explorer/experimenter gui
  1146. */
  1147. public String tokenizerTipText() {
  1148. return "The tokenizing algorithm to use on the strings.";
  1149. }
  1150. /**
  1151. * sorts an array.
  1152. *
  1153. * @param array the array to sort
  1154. */
  1155. private static void sortArray(int [] array) {
  1156. int i, j, h, N = array.length - 1;
  1157. for (h = 1; h <= N / 9; h = 3 * h + 1);
  1158. for (; h > 0; h /= 3) {
  1159. for (i = h + 1; i <= N; i++) {
  1160. int v = array[i];
  1161. j = i;
  1162. while (j > h && array[j - h] > v ) {
  1163. array[j] = array[j - h];
  1164. j -= h;
  1165. }
  1166. array[j] = v;
  1167. }
  1168. }
  1169. }
  1170. /**
  1171. * determines the selected range.
  1172. */
  1173. private void determineSelectedRange() {
  1174. Instances inputFormat = getInputFormat();
  1175. // Calculate the default set of fields to convert
  1176. if (m_SelectedRange == null) {
  1177. StringBuffer fields = new StringBuffer();
  1178. for (int j = 0; j < inputFormat.numAttributes(); j++) {
  1179. if (inputFormat.attribute(j).type() == Attribute.STRING)
  1180. fields.append((j + 1) + ",");
  1181. }
  1182. m_SelectedRange = new Range(fields.toString());
  1183. }
  1184. m_SelectedRange.setUpper(inputFormat.numAttributes() - 1);
  1185. // Prevent the user from converting non-string fields
  1186. StringBuffer fields = new StringBuffer();
  1187. for (int j = 0; j < inputFormat.numAttributes(); j++) {
  1188. if (m_SelectedRange.isInRange(j)
  1189. && inputFormat.attribute(j).type() == Attribute.STRING)
  1190. fields.append((j + 1) + ",");
  1191. }
  1192. m_SelectedRange.setRanges(fields.toString());
  1193. m_SelectedRange.setUpper(inputFormat.numAttributes() - 1);
  1194. // System.err.println("Selected Range: " + getSelectedRange().getRanges());
  1195. }
  1196. /**
  1197. * determines the dictionary.
  1198. */
  1199. private void determineDictionary() {
  1200. // initialize stopwords
  1201. Stopwords stopwords = new Stopwords();
  1202. if (getUseStoplist()) {
  1203. try {
  1204. if (getStopwords().exists() && !getStopwords().isDirectory())
  1205. stopwords.read(getStopwords());
  1206. }
  1207. catch (Exception e) {
  1208. e.printStackTrace();
  1209. }
  1210. }
  1211. // Operate on a per-class basis if class attribute is set
  1212. int classInd = getInputFormat().classIndex();
  1213. int values = 1;
  1214. if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
  1215. values = getInputFormat().attribute(classInd).numValues();
  1216. }
  1217. //TreeMap dictionaryArr [] = new TreeMap[values];
  1218. TreeMap [] dictionaryArr = new TreeMap[values];
  1219. for (int i = 0; i < values; i++) {
  1220. dictionaryArr[i] = new TreeMap();
  1221. }
  1222. // Make sure we know which fields to convert
  1223. determineSelectedRange();
  1224. // Tokenize all training text into an orderedMap of "words".
  1225. long pruneRate =
  1226. Math.round((m_PeriodicPruningRate/100.0)*getInputFormat().numInstances());
  1227. for (int i = 0; i < getInputFormat().numInstances(); i++) {
  1228. Instance instance = getInputFormat().instance(i);
  1229. int vInd = 0;
  1230. if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
  1231. vInd = (int)instance.classValue();
  1232. }
  1233. // Iterate through all relevant string attributes of the current instance
  1234. Hashtable h = new Hashtable();
  1235. for (int j = 0; j < instance.numAttributes(); j++) {
  1236. if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {
  1237. // Get tokenizer
  1238. m_Tokenizer.tokenize(instance.stringValue(j));
  1239. // Iterate through tokens, perform stemming, and remove stopwords
  1240. // (if required)
  1241. while (m_Tokenizer.hasMoreElements()) {
  1242. String word = ((String)m_Tokenizer.nextElement()).intern();
  1243. if(this.m_lowerCaseTokens==true)
  1244. word = word.toLowerCase();
  1245. word = m_Stemmer.stem(word);
  1246. if(this.m_useStoplist==true)
  1247. if(stopwords.is(word))
  1248. continue;
  1249. if(!(h.containsKey(word)))
  1250. h.put(word, new Integer(0));
  1251. Count count = (Count)dictionaryArr[vInd].get(word);
  1252. if (count == null) {
  1253. dictionaryArr[vInd].put(word, new Count(1));
  1254. } else {
  1255. count.count++;
  1256. }
  1257. }
  1258. }
  1259. }
  1260. //updating the docCount for the words that have occurred in this
  1261. //instance(document).
  1262. Enumeration e = h.keys();
  1263. while(e.hasMoreElements()) {
  1264. String word = (String) e.nextElement();
  1265. Count c = (Count)dictionaryArr[vInd].get(word);
  1266. if(c!=null) {
  1267. c.docCount++;
  1268. } else
  1269. System.err.println("Warning: A word should definitely be in the "+
  1270. "dictionary.Please check the code");
  1271. }
  1272. if (pruneRate > 0) {
  1273. if (i % pruneRate == 0 && i > 0) {
  1274. for (int z = 0; z < values; z++) {
  1275. Vector d = new Vector(1000);
  1276. Iterator it = dictionaryArr[z].keySet().iterator();
  1277. while (it.hasNext()) {
  1278. String word = (String)it.next();
  1279. Count count = (Count)dictionaryArr[z].get(word);
  1280. if (count.count <= 1) { d.add(word); }
  1281. }
  1282. Iterator iter = d.iterator();
  1283. while(iter.hasNext()) {
  1284. String word = (String)iter.next();
  1285. dictionaryArr[z].remove(word);
  1286. }
  1287. }
  1288. }
  1289. }
  1290. }
  1291. // Figure out the minimum required word frequency
  1292. int totalsize = 0;
  1293. int prune[] = new int[values];
  1294. for (int z = 0; z < values; z++) {
  1295. totalsize += dictionaryArr[z].size();
  1296. int array[] = new int[dictionaryArr[z].size()];
  1297. int pos = 0;
  1298. Iterator it = dictionaryArr[z].keySet().iterator();
  1299. while (it.hasNext()) {
  1300. String word = (String)it.next();
  1301. Count count = (Count)dictionaryArr[z].get(word);
  1302. array[pos] = count.count;
  1303. pos++;
  1304. }
  1305. // sort the array
  1306. sortArray(array);
  1307. if (array.length < m_WordsToKeep) {
  1308. // if there aren't enough words, set the threshold to
  1309. // minFreq
  1310. prune[z] = m_minTermFreq;
  1311. } else {
  1312. // otherwise set it to be at least minFreq
  1313. prune[z] = Math.max(m_minTermFreq,
  1314. array[array.length - m_WordsToKeep]);
  1315. }
  1316. }
  1317. // Convert the dictionary into an attribute index
  1318. // and create one attribute per word
  1319. FastVector attributes = new FastVector(totalsize +
  1320. getInputFormat().numAttributes());
  1321. // Add the non-converted attributes
  1322. int classIndex = -1;
  1323. for (int i = 0; i < getInputFormat().numAttributes(); i++) {
  1324. if (!m_SelectedRange.isInRange(i)) {
  1325. if (getInputFormat().classIndex() == i) {
  1326. classIndex = attributes.size();
  1327. }
  1328. attributes.addElement(getInputFormat().attribute(i).copy());
  1329. }
  1330. }
  1331. // Add the word vector attributes (eliminating duplicates
  1332. // that occur in multiple classes)
  1333. TreeMap newDictionary = new TreeMap();
  1334. int index = attributes.size();
  1335. for(int z = 0; z < values; z++) {
  1336. Iterator it = dictionaryArr[z].keySet().iterator();
  1337. while (it.hasNext()) {
  1338. String word = (String)it.next();
  1339. Count count = (Count)dictionaryArr[z].get(word);
  1340. if (count.count >= prune[z]) {
  1341. if(newDictionary.get(word) == null) {
  1342. newDictionary.put(word, new Integer(index++));
  1343. attributes.addElement(new Attribute(m_Prefix + word));
  1344. }
  1345. }
  1346. }
  1347. }
  1348. // Compute document frequencies
  1349. m_DocsCounts = new int[attributes.size()];
  1350. Iterator it = newDictionary.keySet().iterator();
  1351. while(it.hasNext()) {
  1352. String word = (String) it.next();
  1353. int idx = ((Integer)newDictionary.get(word)).intValue();
  1354. int docsCount=0;
  1355. for(int j=0; j<values; j++) {
  1356. Count c = (Count) dictionaryArr[j].get(word);
  1357. if(c!=null)
  1358. docsCount += c.docCount;
  1359. }
  1360. m_DocsCounts[idx]=docsCount;
  1361. }
  1362. // Trim vector and set instance variables
  1363. attributes.trimToSize();
  1364. m_Dictionary = newDictionary;
  1365. m_NumInstances = getInputFormat().numInstances();
  1366. // Set the filter's output format
  1367. Instances outputFormat = new Instances(getInputFormat().relationName(),
  1368. attributes, 0);
  1369. outputFormat.setClassIndex(classIndex);
  1370. setOutputFormat(outputFormat);
  1371. }
  1372. /**
  1373. * Converts the instance w/o normalization.
  1374. *
  1375. * @oaram instance the instance to convert
  1376. * @param v
  1377. * @return the conerted instance
  1378. */
  1379. private int convertInstancewoDocNorm(Instance instance, FastVector v) {
  1380. // Convert the instance into a sorted set of indexes
  1381. TreeMap contained = new TreeMap();
  1382. // Copy all non-converted attributes from input to output
  1383. int firstCopy = 0;
  1384. for (int i = 0; i < getInputFormat().numAttributes(); i++) {
  1385. if (!m_SelectedRange.isInRange(i)) {
  1386. if (getInputFormat().attribute(i).type() != Attribute.STRING &&
  1387. getInputFormat().attribute(i).type() != Attribute.RELATIONAL) {
  1388. // Add simple nominal and numeric attributes directly
  1389. if (instance.value(i) != 0.0) {
  1390. contained.put(new Integer(firstCopy),
  1391. new Double(instance.value(i)));
  1392. }
  1393. } else {
  1394. if (instance.isMissing(i)) {
  1395. contained.put(new Integer(firstCopy),
  1396. new Double(Instance.missingValue()));
  1397. } else if (getInputFormat().attribute(i).type() == Attribute.STRING) {
  1398. // If this is a string attribute, we have to first add
  1399. // this value to the range of possible values, then add
  1400. // its new internal index.
  1401. if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
  1402. // Note that the first string value in a
  1403. // SparseInstance doesn't get printed.
  1404. outputFormatPeek().attribute(firstCopy)
  1405. .addStringValue("Hack to defeat SparseInstance bug");
  1406. }
  1407. int newIndex = outputFormatPeek().attribute(firstCopy)
  1408. .addStringValue(instance.stringValue(i));
  1409. contained.put(new Integer(firstCopy),
  1410. new Double(newIndex));
  1411. } else {
  1412. // relational
  1413. if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
  1414. Instan