phonetic.py | searchcode

/src/whoosh/lang/phonetic.py

https://bitbucket.org/mkr/whoosh
Python | 119 lines | 105 code | 7 blank | 7 comment | 0 complexity | de3d035f15caccc53035081d5bbd9709 MD5 | raw file
Possible License(s): Apache-2.0

#encoding: utf-8

"""
This module contains quasi-phonetic encoders for words in different languages.
"""

import re

from whoosh.compat import iteritems

# This soundex implementation is adapted from the recipe here:
# http://code.activestate.com/recipes/52213/

english_codes = '01230120022455012623010202'


def soundex_en(word):
    # digits holds the soundex values for the alphabet
    r = ""
    if word:
        # Remember first character
        fc = None
        prevcode = None
        for char in word.lower():
            c = ord(char)
            if c >= 97 and c <= 122:  # a-z
                if not fc:
                    fc = char
                code = english_codes[c - 97]
                # Don't append the code if it's the same as the previous
                if code != prevcode:
                    r += code
                prevcode = code

        # Replace first digit with first alpha character
        r = fc + r[1:]

    return r


# Quasi-phonetic coder for Spanish, translated to Python from Sebastian
# Ferreyra's version here:
# http://www.javalobby.org/java/forums/t16936.html

_esp_codes = (("\\Aw?[uh]?([aeiou])", ""),
              ("c[eiéí]|z|ll|sh|ch|sch|cc|y[aeiouáéíóú]|ps|bs|x|j|g[eiéí]", "s"),
              ("[aeiouhwáéíóúü]+", ""),
              ("y", ""),
              ("ñ|gn", "n"),
              ("[dpc]t", "t"),
              ("c[aouáóú]|ck|q", "k"),
              ("v", "b"),
              ("d$", "t"), # Change a trailing d to a t
              )
_esp_codes = tuple((re.compile(pat), repl) for pat, repl in _esp_codes)


def soundex_esp(word):
    word = word.lower()
    r = ""

    prevcode = None
    i = 0
    while i < len(word):
        code = None
        for expr, ecode in _esp_codes:
            match = expr.match(word, i)
            if match:
                i = match.end()
                code = ecode
                break

        if code is None:
            code = word[i]
            i += 1

        if code != prevcode:
            r += code
        prevcode = code

    return r


# This version of soundex for Arabic is translated to Python from Tammam
# Koujan's C# version here:
# http://www.codeproject.com/KB/recipes/ArabicSoundex.aspx

# Create a dictionary mapping arabic characters to digits
_arabic_codes = {}
for chars, code in iteritems({'\u0627\u0623\u0625\u0622\u062d\u062e\u0647\u0639\u063a\u0634\u0648\u064a': "0",
                    '\u0641\u0628': "1",
                    '\u062c\u0632\u0633\u0635\u0638\u0642\u0643': "2",
                    '\u062a\u062b\u062f\u0630\u0636\u0637': "3",
                    '\u0644': "4",
                    '\u0645\u0646': "5",
                    '\u0631': "6",
                    }):
    for char in chars:
        _arabic_codes[char] = code


def soundex_ar(word):
    if word[0] in "\u0627\u0623\u0625\u0622":
        word = word[1:]

    r = "0"
    prevcode = "0"
    if len(word) > 1:
        # Discard the first character
        for char in word[1:]:
            if char in _arabic_codes:
                code = _arabic_codes.get(char, "0")
            # Don't append the code if it's the same as the previous
            if code != prevcode:
                # If the code is a 0 (vowel), don't process it
                if code != "0":
                    r += code
            prevcode = code
    return r