PageRenderTime 105ms CodeModel.GetById 42ms RepoModel.GetById 0ms app.codeStats 0ms

/pe1/src/cs276/pe1/spell/KGramSpellingCorrector.java

https://github.com/sarnoff/CS276
Java | 104 lines | 83 code | 12 blank | 9 comment | 13 complexity | b3054280b9b141b4f28fe99537d44f62 MD5 | raw file
  1. package cs276.pe1.spell;
  2. import java.io.File;
  3. import java.util.Map;
  4. import java.util.HashMap;
  5. import java.util.List;
  6. import java.util.ArrayList;
  7. import java.util.Set;
  8. import java.util.HashSet;
  9. import cs276.util.IOUtils;
  10. import cs276.util.StringUtils;
  11. import cs276.util.Counter;
  12. public class KGramSpellingCorrector implements SpellingCorrector {
  13. protected static int K = 2; //start with bigrams, then extend out
  14. protected static int SE = K-1;//3;//defines extra kgrams - default to k-1 for best performance?
  15. protected static int WL = 10;//returned Word List size
  16. /** Initializes spelling corrector by indexing kgrams in words from a file */
  17. protected Map<String,Set<String>> kgram;
  18. protected Counter<String> freq;
  19. public KGramSpellingCorrector() {
  20. File path = new File("/afs/ir/class/cs276/pe1-2011/big.txt.gz");
  21. //File path = new File("datasources/big.txt.gz");
  22. String extraKGram = "";
  23. for(int i = 0; i < SE; i++)
  24. extraKGram += "$";
  25. kgram = new HashMap<String,Set<String>>();
  26. freq = new Counter<String>();
  27. for (String line : IOUtils.readLines(IOUtils.openFile(path))) {
  28. for (String word : StringUtils.tokenize(line)) {
  29. freq.incrementCount(word);
  30. String key = extraKGram+word+extraKGram;//$ to signal beginning/end of word
  31. if(key.length() <= K)
  32. addWord(key,word);
  33. for(int i = 0;i<(key.length()-K+1);i++)
  34. addWord(key.substring(i,i+K),word);
  35. }
  36. }
  37. }
  38. /*
  39. * Checks key in kgram. If it exists, add value to the set.
  40. * Otherwise it creates a new set and add its value to it.
  41. * Set prevents duplicate values
  42. */
  43. private void addWord(String key, String value)
  44. {
  45. Set<String> set;
  46. if(!kgram.containsKey(key))
  47. {
  48. set = new HashSet<String>();
  49. }
  50. else
  51. {
  52. set = kgram.get(key);
  53. }
  54. set.add(value);
  55. kgram.put(key,set);
  56. }
  57. protected Counter<String> jaccardScore(String word)
  58. {
  59. Counter<String> wordCounts = new Counter<String>();
  60. String extraKGram = "";
  61. for(int i = 0; i < SE; i++)
  62. extraKGram += "$";
  63. String key = extraKGram+word+extraKGram;
  64. Set<String> set;
  65. //if the word is smaller than the key size
  66. if(key.length() <= K)
  67. {
  68. if(kgram.containsKey(key))
  69. {
  70. set = kgram.get(key);
  71. for(String w : set)
  72. wordCounts.incrementCount(w);
  73. }
  74. return wordCounts;
  75. }
  76. //otherwise iterate through the kgram and increment the count
  77. double numKeys = key.length()-K+1;
  78. for(int i = 0;i<numKeys;i++)
  79. {
  80. if(kgram.containsKey(key.substring(i,i+K)))
  81. {
  82. set = kgram.get(key.substring(i,i+K));
  83. for(String w : set)
  84. wordCounts.incrementCount(w,1.0/numKeys);
  85. }
  86. }
  87. return wordCounts;
  88. }
  89. public List<String> corrections(String word) {
  90. return jaccardScore(word).topK(WL);
  91. }
  92. }