PageRenderTime 1500ms CodeModel.GetById 15ms RepoModel.GetById 1ms app.codeStats 0ms

/starter/src/cs276/pe1/spell/KGramSpellingCorrector.java

https://github.com/rioleo/huangenius
Java | 101 lines | 69 code | 22 blank | 10 comment | 15 complexity | 6867259a27c72d95f8a52a2ac332424f MD5 | raw file
  1. package cs276.pe1.spell;
  2. import java.io.File;
  3. import java.util.List;
  4. import cs276.util.IOUtils;
  5. import cs276.util.StringUtils;
  6. import java.util.*;
  7. import cs276.util.Counter;
  8. public class KGramSpellingCorrector implements SpellingCorrector {
  9. /** Initializes spelling corrector by indexing kgrams in words from a file */
  10. private HashMap<String, Counter<String>> index;
  11. public KGramSpellingCorrector() {
  12. // instantiate new index
  13. index = new HashMap<String, Counter<String>>();
  14. File path = new File("/afs/ir/class/cs276/pe1-2011/big.txt.gz");
  15. // File path = new File("/afs/ir/class/cs276/pe1-2011/imdb-plots-20081003.list.gz");
  16. for (String line : IOUtils.readLines(IOUtils.openFile(path))) {
  17. for (String word : StringUtils.tokenize(line)) {
  18. //ArrayList<String> bigrams = getBigrams(word);
  19. ArrayList<String> bigrams = getKgrams(word, 2);
  20. for (String bigram : bigrams) {
  21. if (index.containsKey(bigram)) {
  22. index.get(bigram).incrementCount(word);
  23. } else {
  24. Counter<String> words = new Counter<String>();
  25. words.incrementCount(word);
  26. index.put(bigram, words);
  27. }
  28. }
  29. }
  30. }
  31. }
  32. public List<String> corrections(String word) {
  33. Set<String> wordBigrams = getKgramsSet(word, 2);
  34. Counter<String> possibleCorrections = new Counter<String>();
  35. for (String wordBigram : wordBigrams) {
  36. Set<String> postings = null;
  37. if (index.containsKey(wordBigram)) postings = index.get(wordBigram).keySet();
  38. if (postings != null) {
  39. for (String posting : postings) {
  40. if (!possibleCorrections.containsKey(posting)) {
  41. Set<String> intersect = getKgramsSet(posting, 2);
  42. intersect.retainAll(wordBigrams);
  43. Set<String> union = getKgramsSet(posting, 2);
  44. union.addAll(wordBigrams);
  45. possibleCorrections.setCount(posting, ((double) intersect.size())/union.size());
  46. }
  47. }
  48. }
  49. }
  50. // System.out.println("Word: " + word);
  51. // System.out.println("KGram corrections----------------------");
  52. // for (String entry : possibleCorrections.topK(20)) {
  53. // double occurrences = index.get(getKgrams(entry,2).get(0)).getCount(entry);
  54. // System.out.println(entry + " " + possibleCorrections.getCount(entry) + ": " + occurrences);
  55. // }
  56. return possibleCorrections.topK(5);
  57. }
  58. private Set<String> getKgramsSet(String word, int k) {
  59. Set<String> kgrams = new HashSet<String>();
  60. for (int i = 0; i < k-1; i++) word = "$" + word + "$";
  61. for (int i = 0; i < word.length()-(k-1); i++) {
  62. String kgram = word.substring(i, i+k);
  63. kgrams.add(kgram);
  64. }
  65. return kgrams;
  66. }
  67. private ArrayList<String> getKgrams(String word, int k) {
  68. ArrayList<String> kgrams = new ArrayList<String>();
  69. for (int i = 0; i < k-1; i++) word = "$" + word + "$";
  70. for (int i = 0; i < word.length()-(k-1); i++) {
  71. String kgram = word.substring(i, i+k);
  72. kgrams.add(kgram);
  73. }
  74. return kgrams;
  75. }
  76. public double getOccurrences(String word) {
  77. return(index.get(getKgrams(word,2).get(0)).getCount(word));
  78. }
  79. }