PageRenderTime 7ms CodeModel.GetById 1ms app.highlight 4ms RepoModel.GetById 1ms app.codeStats 0ms

/spell-correct.py

http://spell-correct-in-go.googlecode.com/
Python | 36 lines | 27 code | 9 blank | 0 comment | 34 complexity | 5dfa56ceca4676533ad7a94d5161de51 MD5 | raw file
 1from datetime import datetime
 2import re, collections
 3
 4def words(text): return re.findall('[a-z]+', text.lower())
 5
 6def train(features):
 7    model = collections.defaultdict(lambda: 1)
 8    for f in features:
 9        model[f] += 1
10    return model
11
# Word-frequency model, built once at import time from the training corpus
# 'big.txt' in the current working directory. open() is used under `with` so
# the file handle is closed as soon as the text has been read; the builtin
# open() exists on both Python 2 and Python 3, unlike file().
with open('big.txt') as corpus:
    NWORDS = train(words(corpus.read()))
13
# Letters tried when generating replacement and insertion edits.
alphabet = 'abcdefghijklmnopqrstuvwxyz'


def edits1(word):
    """Return the set of strings exactly one simple edit away from *word*.

    For every split point of *word* the following edits are generated:
    deleting the character after the split, swapping the two characters
    after the split, replacing the character after the split with each
    letter of ``alphabet``, and inserting each letter of ``alphabet`` at
    the split. Duplicates collapse because the result is a set; *word*
    itself is included whenever a replacement re-creates it.
    """
    variants = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # An insertion is possible at every cut, including the very end.
        for letter in alphabet:
            variants.add(head + letter + tail)

        if not tail:
            # Nothing after the cut: no delete, replace or transpose here.
            continue

        rest = tail[1:]
        variants.add(head + rest)  # delete tail[0]
        for letter in alphabet:
            variants.add(head + letter + rest)  # replace tail[0]

        if len(tail) > 1:
            # Swap the two characters immediately after the cut.
            variants.add(head + tail[1] + tail[0] + tail[2:])
    return variants
23
def known_edits2(word):
    """Return the strings two edits away from *word* that are in NWORDS.

    Every result of ``edits1(word)`` is expanded with ``edits1`` again,
    and only the second-level strings present in ``NWORDS`` are kept.
    """
    matches = set()
    for first_step in edits1(word):
        for second_step in edits1(first_step):
            if second_step in NWORDS:
                matches.add(second_step)
    return matches
26
def known(words):
    """Return the set of entries in *words* that are present in NWORDS."""
    recognised = set()
    for candidate in words:
        if candidate in NWORDS:
            recognised.add(candidate)
    return recognised
28
def correct(word):
    """Return the best-scoring correction candidate for *word*.

    Candidate pools are tried in order and the first non-empty one wins:
    the word itself if it is in NWORDS, then known strings one edit away,
    then known strings two edits away, and finally the unchanged word.
    The winner within the pool is the candidate with the largest
    ``NWORDS.get`` value.
    """
    candidates = known([word])
    if not candidates:
        candidates = known(edits1(word))
    if not candidates:
        candidates = known_edits2(word)
    if not candidates:
        # Nothing recognised at any distance: fall back to the input itself.
        candidates = [word]
    return max(candidates, key=NWORDS.get)
32
# Timing harness: correct the same misspelled word 100 times and print how
# long the whole loop took.
startTime = datetime.now()
for _ in range(100):
    correct('korrecter')
# Parenthesised single-argument form prints the same text on Python 2 and is
# valid syntax on Python 3, where the bare print statement is a SyntaxError.
print(datetime.now() - startTime)