/src/whoosh/lang/phonetic.py
Python | 119 lines | 105 code | 7 blank | 7 comment | 0 complexity | de3d035f15caccc53035081d5bbd9709 MD5 | raw file
Possible License(s): Apache-2.0
- #encoding: utf-8
- """
- This module contains quasi-phonetic encoders for words in different languages.
- """
- import re
- from whoosh.compat import iteritems
- # This soundex implementation is adapted from the recipe here:
- # http://code.activestate.com/recipes/52213/
# Soundex digit for each letter a-z, indexed by ``ord(letter) - ord("a")``.
english_codes = '01230120022455012623010202'


def soundex_en(word):
    """Return a quasi-Soundex key for an English word.

    The word is lowercased and every character outside ASCII a-z is ignored.
    Each remaining letter is mapped to a digit through ``english_codes``; a
    digit is skipped when it equals the digit of the previous letter. The
    first digit of the result is then replaced by the first letter itself.
    "0" digits are kept, and the key is neither padded nor truncated.

    :param word: the text to encode.
    :returns: the key as a string, or "" when ``word`` is empty or contains
        no a-z letters.
    """
    if not word:
        return ""

    first_letter = None
    digits = []
    prevcode = None
    for char in word.lower():
        index = ord(char) - 97  # offset from "a"
        if 0 <= index < 26:
            if first_letter is None:
                # Remember the first alphabetic character
                first_letter = char
            code = english_codes[index]
            # Don't append the code if it's the same as the previous one
            if code != prevcode:
                digits.append(code)
            prevcode = code

    if first_letter is None:
        # No letters at all: there is no leading character to substitute,
        # so return an empty key instead of failing on ``None + str``.
        return ""

    # Replace the first digit with the first alphabetic character
    digits[0] = first_letter
    return "".join(digits)
- # Quasi-phonetic coder for Spanish, translated to Python from Sebastian
- # Ferreyra's version here:
- # http://www.javalobby.org/java/forums/t16936.html
# Ordered rewrite rules: each entry pairs a compiled pattern with the code
# emitted when that pattern matches at the current scan position.
_esp_codes = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        ("\\Aw?[uh]?([aeiou])", ""),
        ("c[eiéí]|z|ll|sh|ch|sch|cc|y[aeiouáéíóú]|ps|bs|x|j|g[eiéí]", "s"),
        ("[aeiouhwáéíóúü]+", ""),
        ("y", ""),
        ("ñ|gn", "n"),
        ("[dpc]t", "t"),
        ("c[aouáóú]|ck|q", "k"),
        ("v", "b"),
        ("d$", "t"),  # Change a trailing d to a t
    )
)


def soundex_esp(word):
    """Return a quasi-phonetic key for a Spanish word.

    The lowercased word is scanned left to right. At each position the rules
    in ``_esp_codes`` are tried in order and the first one that matches
    consumes its text and contributes its replacement code. When no rule
    matches, the single character at that position is used as the code. A
    code is only emitted when it differs from the code produced just before.

    :param word: the text to encode.
    :returns: the encoded key as a string ("" for an empty word).
    """
    text = word.lower()
    length = len(text)
    emitted = []
    last_code = None
    pos = 0
    while pos < length:
        for pattern, replacement in _esp_codes:
            hit = pattern.match(text, pos)
            if hit is not None:
                pos = hit.end()
                current = replacement
                break
        else:
            # No rule applies here: the character stands for itself
            current = text[pos]
            pos += 1

        # Collapse runs of the same code
        if current != last_code:
            emitted.append(current)
        last_code = current

    return "".join(emitted)
- # This version of soundex for Arabic is translated to Python from Tammam
- # Koujan's C# version here:
- # http://www.codeproject.com/KB/recipes/ArabicSoundex.aspx
- # Create a dictionary mapping arabic characters to digits
# Maps each Arabic character to its single-digit code.
_arabic_codes = {}
for chars, code in (
    ('\u0627\u0623\u0625\u0622\u062d\u062e\u0647\u0639\u063a\u0634\u0648\u064a', "0"),
    ('\u0641\u0628', "1"),
    ('\u062c\u0632\u0633\u0635\u0638\u0642\u0643', "2"),
    ('\u062a\u062b\u062f\u0630\u0636\u0637', "3"),
    ('\u0644', "4"),
    ('\u0645\u0646', "5"),
    ('\u0631', "6"),
):
    for char in chars:
        _arabic_codes[char] = code


def soundex_ar(word):
    """Return a quasi-Soundex key for an Arabic word.

    A leading alef (U+0627, U+0623, U+0625 or U+0622) is stripped. The key
    always starts with "0"; the first remaining character is discarded and
    each later character found in ``_arabic_codes`` is mapped to its digit.
    Characters that are not in the table are ignored. A digit is appended
    only when it is not "0" and differs from the previous mapped digit.

    :param word: the text to encode.
    :returns: the key as a string. An empty ``word`` yields "0", the same key
        produced by a word consisting only of an alef.
    """
    if not word:
        # Nothing to index into; avoid IndexError on ``word[0]``.
        return "0"

    if word[0] in "\u0627\u0623\u0625\u0622":
        word = word[1:]

    r = "0"
    prevcode = "0"
    # Discard the first character (slicing is empty for 0/1-length words)
    for char in word[1:]:
        code = _arabic_codes.get(char)
        if code is None:
            # Unknown character: skip it without touching prevcode
            continue
        # Don't append a repeated code, and never append a 0 (vowel)
        if code != prevcode and code != "0":
            r += code
        prevcode = code
    return r