PageRenderTime 49ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/src/whoosh/lang/phonetic.py

https://bitbucket.org/mkr/whoosh
Python | 119 lines | 105 code | 7 blank | 7 comment | 0 complexity | de3d035f15caccc53035081d5bbd9709 MD5 | raw file
Possible License(s): Apache-2.0
  1. #encoding: utf-8
  2. """
  3. This module contains quasi-phonetic encoders for words in different languages.
  4. """
  5. import re
  6. from whoosh.compat import iteritems
  7. # This soundex implementation is adapted from the recipe here:
  8. # http://code.activestate.com/recipes/52213/
  9. english_codes = '01230120022455012623010202'
  10. def soundex_en(word):
  11. # digits holds the soundex values for the alphabet
  12. r = ""
  13. if word:
  14. # Remember first character
  15. fc = None
  16. prevcode = None
  17. for char in word.lower():
  18. c = ord(char)
  19. if c >= 97 and c <= 122: # a-z
  20. if not fc:
  21. fc = char
  22. code = english_codes[c - 97]
  23. # Don't append the code if it's the same as the previous
  24. if code != prevcode:
  25. r += code
  26. prevcode = code
  27. # Replace first digit with first alpha character
  28. r = fc + r[1:]
  29. return r
  30. # Quasi-phonetic coder for Spanish, translated to Python from Sebastian
  31. # Ferreyra's version here:
  32. # http://www.javalobby.org/java/forums/t16936.html
  33. _esp_codes = (("\\Aw?[uh]?([aeiou])", ""),
  34. ("c[eiéí]|z|ll|sh|ch|sch|cc|y[aeiouáéíóú]|ps|bs|x|j|g[eiéí]", "s"),
  35. ("[aeiouhwáéíóúü]+", ""),
  36. ("y", ""),
  37. ("ñ|gn", "n"),
  38. ("[dpc]t", "t"),
  39. ("c[aouáóú]|ck|q", "k"),
  40. ("v", "b"),
  41. ("d$", "t"), # Change a trailing d to a t
  42. )
  43. _esp_codes = tuple((re.compile(pat), repl) for pat, repl in _esp_codes)
  44. def soundex_esp(word):
  45. word = word.lower()
  46. r = ""
  47. prevcode = None
  48. i = 0
  49. while i < len(word):
  50. code = None
  51. for expr, ecode in _esp_codes:
  52. match = expr.match(word, i)
  53. if match:
  54. i = match.end()
  55. code = ecode
  56. break
  57. if code is None:
  58. code = word[i]
  59. i += 1
  60. if code != prevcode:
  61. r += code
  62. prevcode = code
  63. return r
  64. # This version of soundex for Arabic is translated to Python from Tammam
  65. # Koujan's C# version here:
  66. # http://www.codeproject.com/KB/recipes/ArabicSoundex.aspx
  67. # Create a dictionary mapping arabic characters to digits
  68. _arabic_codes = {}
  69. for chars, code in iteritems({'\u0627\u0623\u0625\u0622\u062d\u062e\u0647\u0639\u063a\u0634\u0648\u064a': "0",
  70. '\u0641\u0628': "1",
  71. '\u062c\u0632\u0633\u0635\u0638\u0642\u0643': "2",
  72. '\u062a\u062b\u062f\u0630\u0636\u0637': "3",
  73. '\u0644': "4",
  74. '\u0645\u0646': "5",
  75. '\u0631': "6",
  76. }):
  77. for char in chars:
  78. _arabic_codes[char] = code
  79. def soundex_ar(word):
  80. if word[0] in "\u0627\u0623\u0625\u0622":
  81. word = word[1:]
  82. r = "0"
  83. prevcode = "0"
  84. if len(word) > 1:
  85. # Discard the first character
  86. for char in word[1:]:
  87. if char in _arabic_codes:
  88. code = _arabic_codes.get(char, "0")
  89. # Don't append the code if it's the same as the previous
  90. if code != prevcode:
  91. # If the code is a 0 (vowel), don't process it
  92. if code != "0":
  93. r += code
  94. prevcode = code
  95. return r