PageRenderTime 47ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/src/whoosh/lang/dmetaphone.py

https://bitbucket.org/mkr/whoosh
Python | 415 lines | 374 code | 7 blank | 34 comment | 162 complexity | 9e2c9318db1debcff6bd85adea29e2a7 MD5 | raw file
Possible License(s): Apache-2.0
  1. # coding= utf-8
  2. # This script implements the Double Metaphone algorithm (c) 1998, 1999 by
  3. # Lawrence Philips. It was translated to Python from the C source written by
  4. # Kevin Atkinson (http://aspell.net/metaphone/) by Andrew Collins - January 12,
  5. # 2007, who claims no rights to this work.
  6. # http://atomboy.isa-geek.com:8080/plone/Members/acoil/programing/double-metaphone
  7. import re
  8. from whoosh.compat import u
  9. vowels = frozenset("AEIOUY")
  10. slavo_germ_exp = re.compile("W|K|CZ|WITZ")
  11. silent_starts = re.compile("GN|KN|PN|WR|PS")
  12. def double_metaphone(text):
  13. text = text.upper()
  14. slavo_germanic = bool(slavo_germ_exp.search(text))
  15. length = len(text)
  16. text = "--" + text + " "
  17. first = pos = 2
  18. last = first + length - 1
  19. primary = secondary = ""
  20. if silent_starts.match(text, pos):
  21. pos += 1
  22. while pos < length + 2:
  23. ch = text[pos]
  24. if ch in vowels:
  25. # all init vowels now map to 'A'
  26. if pos != first:
  27. next = (None, 1)
  28. else:
  29. next = ("A", 1)
  30. elif ch == "B":
  31. #"-mb", e.g", "dumb", already skipped over... see 'M' below
  32. if text[pos + 1] == "B":
  33. next = ("P", 2)
  34. else:
  35. next = ("P", 1)
  36. elif ch == "C":
  37. # various germanic
  38. if (pos > (first + 1) and text[pos - 2] not in vowels and text[pos - 1:pos + 2] == 'ACH' and \
  39. (text[pos + 2] not in ['I', 'E'] or text[pos - 2:pos + 4] in ['BACHER', 'MACHER'])):
  40. next = ('K', 2)
  41. # special case 'CAESAR'
  42. elif pos == first and text[first:first + 6] == 'CAESAR':
  43. next = ('S', 2)
  44. elif text[pos:pos + 4] == 'CHIA': # italian 'chianti'
  45. next = ('K', 2)
  46. elif text[pos:pos + 2] == 'CH':
  47. # find 'michael'
  48. if pos > first and text[pos:pos + 4] == 'CHAE':
  49. next = ('K', 'X', 2)
  50. elif pos == first and (text[pos + 1:pos + 6] in ['HARAC', 'HARIS'] or \
  51. text[pos + 1:pos + 4] in ["HOR", "HYM", "HIA", "HEM"]) and text[first:first + 5] != 'CHORE':
  52. next = ('K', 2)
  53. # germanic, greek, or otherwise 'ch' for 'kh' sound
  54. elif text[first:first + 4] in ['VAN ', 'VON '] or text[first:first + 3] == 'SCH' \
  55. or text[pos - 2:pos + 4] in ["ORCHES", "ARCHIT", "ORCHID"] \
  56. or text[pos + 2] in ['T', 'S'] \
  57. or ((text[pos - 1] in ["A", "O", "U", "E"] or pos == first) \
  58. and text[pos + 2] in ["L", "R", "N", "M", "B", "H", "F", "V", "W", " "]):
  59. next = ('K', 1)
  60. else:
  61. if pos > first:
  62. if text[first:first + 2] == 'MC':
  63. next = ('K', 2)
  64. else:
  65. next = ('X', 'K', 2)
  66. else:
  67. next = ('X', 2)
  68. # e.g, 'czerny'
  69. elif text[pos:pos + 2] == 'CZ' and text[pos - 2:pos + 2] != 'WICZ':
  70. next = ('S', 'X', 2)
  71. # e.g., 'focaccia'
  72. elif text[pos + 1:pos + 4] == 'CIA':
  73. next = ('X', 3)
  74. # double 'C', but not if e.g. 'McClellan'
  75. elif text[pos:pos + 2] == 'CC' and not (pos == (first + 1) and text[first] == 'M'):
  76. # 'bellocchio' but not 'bacchus'
  77. if text[pos + 2] in ["I", "E", "H"] and text[pos + 2:pos + 4] != 'HU':
  78. # 'accident', 'accede' 'succeed'
  79. if (pos == (first + 1) and text[first] == 'A') or \
  80. text[pos - 1:pos + 4] in ['UCCEE', 'UCCES']:
  81. next = ('KS', 3)
  82. # 'bacci', 'bertucci', other italian
  83. else:
  84. next = ('X', 3)
  85. else:
  86. next = ('K', 2)
  87. elif text[pos:pos + 2] in ["CK", "CG", "CQ"]:
  88. next = ('K', 'K', 2)
  89. elif text[pos:pos + 2] in ["CI", "CE", "CY"]:
  90. # italian vs. english
  91. if text[pos:pos + 3] in ["CIO", "CIE", "CIA"]:
  92. next = ('S', 'X', 2)
  93. else:
  94. next = ('S', 2)
  95. else:
  96. # name sent in 'mac caffrey', 'mac gregor
  97. if text[pos + 1:pos + 3] in [" C", " Q", " G"]:
  98. next = ('K', 3)
  99. else:
  100. if text[pos + 1] in ["C", "K", "Q"] and text[pos + 1:pos + 3] not in ["CE", "CI"]:
  101. next = ('K', 2)
  102. else: # default for 'C'
  103. next = ('K', 1)
  104. elif ch == u('\xc7'):
  105. next = ('S', 1)
  106. elif ch == 'D':
  107. if text[pos:pos + 2] == 'DG':
  108. if text[pos + 2] in ['I', 'E', 'Y']: # e.g. 'edge'
  109. next = ('J', 3)
  110. else:
  111. next = ('TK', 2)
  112. elif text[pos:pos + 2] in ['DT', 'DD']:
  113. next = ('T', 2)
  114. else:
  115. next = ('T', 1)
  116. elif ch == 'F':
  117. if text[pos + 1] == 'F':
  118. next = ('F', 2)
  119. else:
  120. next = ('F', 1)
  121. elif ch == 'G':
  122. if text[pos + 1] == 'H':
  123. if pos > first and text[pos - 1] not in vowels:
  124. next = ('K', 2)
  125. elif pos < (first + 3):
  126. if pos == first: # 'ghislane', ghiradelli
  127. if text[pos + 2] == 'I':
  128. next = ('J', 2)
  129. else:
  130. next = ('K', 2)
  131. # Parker's rule (with some further refinements) - e.g., 'hugh'
  132. elif (pos > (first + 1) and text[pos - 2] in ['B', 'H', 'D']) \
  133. or (pos > (first + 2) and text[pos - 3] in ['B', 'H', 'D']) \
  134. or (pos > (first + 3) and text[pos - 4] in ['B', 'H']):
  135. next = (None, 2)
  136. else:
  137. # e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
  138. if pos > (first + 2) and text[pos - 1] == 'U' \
  139. and text[pos - 3] in ["C", "G", "L", "R", "T"]:
  140. next = ('F', 2)
  141. else:
  142. if pos > first and text[pos - 1] != 'I':
  143. next = ('K', 2)
  144. elif text[pos + 1] == 'N':
  145. if pos == (first + 1) and text[first] in vowels and not slavo_germanic:
  146. next = ('KN', 'N', 2)
  147. else:
  148. # not e.g. 'cagney'
  149. if text[pos + 2:pos + 4] != 'EY' and text[pos + 1] != 'Y' and not slavo_germanic:
  150. next = ('N', 'KN', 2)
  151. else:
  152. next = ('KN', 2)
  153. # 'tagliaro'
  154. elif text[pos + 1:pos + 3] == 'LI' and not slavo_germanic:
  155. next = ('KL', 'L', 2)
  156. # -ges-,-gep-,-gel-, -gie- at beginning
  157. elif pos == first and (text[pos + 1] == 'Y' \
  158. or text[pos + 1:pos + 3] in ["ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER"]):
  159. next = ('K', 'J', 2)
  160. # -ger-, -gy-
  161. elif (text[pos + 1:pos + 2] == 'ER' or text[pos + 1] == 'Y') \
  162. and text[first:first + 6] not in ["DANGER", "RANGER", "MANGER"] \
  163. and text[pos - 1] not in ['E', 'I'] and text[pos - 1:pos + 2] not in ['RGY', 'OGY']:
  164. next = ('K', 'J', 2)
  165. # italian e.g, 'biaggi'
  166. elif text[pos + 1] in ['E', 'I', 'Y'] or text[pos - 1:pos + 3] in ["AGGI", "OGGI"]:
  167. # obvious germanic
  168. if text[first:first + 4] in ['VON ', 'VAN '] or text[first:first + 3] == 'SCH' \
  169. or text[pos + 1:pos + 3] == 'ET':
  170. next = ('K', 2)
  171. else:
  172. # always soft if french ending
  173. if text[pos + 1:pos + 5] == 'IER ':
  174. next = ('J', 2)
  175. else:
  176. next = ('J', 'K', 2)
  177. elif text[pos + 1] == 'G':
  178. next = ('K', 2)
  179. else:
  180. next = ('K', 1)
  181. elif ch == 'H':
  182. # only keep if first & before vowel or btw. 2 vowels
  183. if (pos == first or text[pos - 1] in vowels) and text[pos + 1] in vowels:
  184. next = ('H', 2)
  185. else: # (also takes care of 'HH')
  186. next = (None, 1)
  187. elif ch == 'J':
  188. # obvious spanish, 'jose', 'san jacinto'
  189. if text[pos:pos + 4] == 'JOSE' or text[first:first + 4] == 'SAN ':
  190. if (pos == first and text[pos + 4] == ' ') or text[first:first + 4] == 'SAN ':
  191. next = ('H',)
  192. else:
  193. next = ('J', 'H')
  194. elif pos == first and text[pos:pos + 4] != 'JOSE':
  195. next = ('J', 'A') # Yankelovich/Jankelowicz
  196. else:
  197. # spanish pron. of e.g. 'bajador'
  198. if text[pos - 1] in vowels and not slavo_germanic \
  199. and text[pos + 1] in ['A', 'O']:
  200. next = ('J', 'H')
  201. else:
  202. if pos == last:
  203. next = ('J', ' ')
  204. else:
  205. if text[pos + 1] not in ["L", "T", "K", "S", "N", "M", "B", "Z"] \
  206. and text[pos - 1] not in ["S", "K", "L"]:
  207. next = ('J',)
  208. else:
  209. next = (None,)
  210. if text[pos + 1] == 'J':
  211. next = next + (2,)
  212. else:
  213. next = next + (1,)
  214. elif ch == 'K':
  215. if text[pos + 1] == 'K':
  216. next = ('K', 2)
  217. else:
  218. next = ('K', 1)
  219. elif ch == 'L':
  220. if text[pos + 1] == 'L':
  221. # spanish e.g. 'cabrillo', 'gallegos'
  222. if (pos == (last - 2) and text[pos - 1:pos + 3] in ["ILLO", "ILLA", "ALLE"]) \
  223. or ((text[last - 1:last + 1] in ["AS", "OS"] or text[last] in ["A", "O"]) \
  224. and text[pos - 1:pos + 3] == 'ALLE'):
  225. next = ('L', '', 2)
  226. else:
  227. next = ('L', 2)
  228. else:
  229. next = ('L', 1)
  230. elif ch == 'M':
  231. if text[pos + 1:pos + 4] == 'UMB' \
  232. and (pos + 1 == last or text[pos + 2:pos + 4] == 'ER') \
  233. or text[pos + 1] == 'M':
  234. next = ('M', 2)
  235. else:
  236. next = ('M', 1)
  237. elif ch == 'N':
  238. if text[pos + 1] == 'N':
  239. next = ('N', 2)
  240. else:
  241. next = ('N', 1)
  242. elif ch == u('\xd1'):
  243. next = ('N', 1)
  244. elif ch == 'P':
  245. if text[pos + 1] == 'H':
  246. next = ('F', 2)
  247. elif text[pos + 1] in ['P', 'B']: # also account for "campbell", "raspberry"
  248. next = ('P', 2)
  249. else:
  250. next = ('P', 1)
  251. elif ch == 'Q':
  252. if text[pos + 1] == 'Q':
  253. next = ('K', 2)
  254. else:
  255. next = ('K', 1)
  256. elif ch == 'R':
  257. # french e.g. 'rogier', but exclude 'hochmeier'
  258. if pos == last and not slavo_germanic \
  259. and text[pos - 2:pos] == 'IE' and text[pos - 4:pos - 2] not in ['ME', 'MA']:
  260. next = ('', 'R')
  261. else:
  262. next = ('R',)
  263. if text[pos + 1] == 'R':
  264. next = next + (2,)
  265. else:
  266. next = next + (1,)
  267. elif ch == 'S':
  268. # special cases 'island', 'isle', 'carlisle', 'carlysle'
  269. if text[pos - 1:pos + 2] in ['ISL', 'YSL']:
  270. next = (None, 1)
  271. # special case 'sugar-'
  272. elif pos == first and text[first:first + 5] == 'SUGAR':
  273. next = ('X', 'S', 1)
  274. elif text[pos:pos + 2] == 'SH':
  275. # germanic
  276. if text[pos + 1:pos + 5] in ["HEIM", "HOEK", "HOLM", "HOLZ"]:
  277. next = ('S', 2)
  278. else:
  279. next = ('X', 2)
  280. # italian & armenian
  281. elif text[pos:pos + 3] in ["SIO", "SIA"] or text[pos:pos + 4] == 'SIAN':
  282. if not slavo_germanic:
  283. next = ('S', 'X', 3)
  284. else:
  285. next = ('S', 3)
  286. # german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
  287. # also, -sz- in slavic language altho in hungarian it is pronounced 's'
  288. elif (pos == first and text[pos + 1] in ["M", "N", "L", "W"]) or text[pos + 1] == 'Z':
  289. next = ('S', 'X')
  290. if text[pos + 1] == 'Z':
  291. next = next + (2,)
  292. else:
  293. next = next + (1,)
  294. elif text[pos:pos + 2] == 'SC':
  295. # Schlesinger's rule
  296. if text[pos + 2] == 'H':
  297. # dutch origin, e.g. 'school', 'schooner'
  298. if text[pos + 3:pos + 5] in ["OO", "ER", "EN", "UY", "ED", "EM"]:
  299. # 'schermerhorn', 'schenker'
  300. if text[pos + 3:pos + 5] in ['ER', 'EN']:
  301. next = ('X', 'SK', 3)
  302. else:
  303. next = ('SK', 3)
  304. else:
  305. if pos == first and text[first + 3] not in vowels and text[first + 3] != 'W':
  306. next = ('X', 'S', 3)
  307. else:
  308. next = ('X', 3)
  309. elif text[pos + 2] in ['I', 'E', 'Y']:
  310. next = ('S', 3)
  311. else:
  312. next = ('SK', 3)
  313. # french e.g. 'resnais', 'artois'
  314. elif pos == last and text[pos - 2:pos] in ['AI', 'OI']:
  315. next = ('', 'S', 1)
  316. else:
  317. next = ('S',)
  318. if text[pos + 1] in ['S', 'Z']:
  319. next = next + (2,)
  320. else:
  321. next = next + (1,)
  322. elif ch == 'T':
  323. if text[pos:pos + 4] == 'TION':
  324. next = ('X', 3)
  325. elif text[pos:pos + 3] in ['TIA', 'TCH']:
  326. next = ('X', 3)
  327. elif text[pos:pos + 2] == 'TH' or text[pos:pos + 3] == 'TTH':
  328. # special case 'thomas', 'thames' or germanic
  329. if text[pos + 2:pos + 4] in ['OM', 'AM'] or text[first:first + 4] in ['VON ', 'VAN '] \
  330. or text[first:first + 3] == 'SCH':
  331. next = ('T', 2)
  332. else:
  333. next = ('0', 'T', 2)
  334. elif text[pos + 1] in ['T', 'D']:
  335. next = ('T', 2)
  336. else:
  337. next = ('T', 1)
  338. elif ch == 'V':
  339. if text[pos + 1] == 'V':
  340. next = ('F', 2)
  341. else:
  342. next = ('F', 1)
  343. elif ch == 'W':
  344. # can also be in middle of word
  345. if text[pos:pos + 2] == 'WR':
  346. next = ('R', 2)
  347. elif pos == first and (text[pos + 1] in vowels or text[pos:pos + 2] == 'WH'):
  348. # Wasserman should match Vasserman
  349. if text[pos + 1] in vowels:
  350. next = ('A', 'F', 1)
  351. else:
  352. next = ('A', 1)
  353. # Arnow should match Arnoff
  354. elif (pos == last and text[pos - 1] in vowels) \
  355. or text[pos - 1:pos + 5] in ["EWSKI", "EWSKY", "OWSKI", "OWSKY"] \
  356. or text[first:first + 3] == 'SCH':
  357. next = ('', 'F', 1)
  358. # polish e.g. 'filipowicz'
  359. elif text[pos:pos + 4] in ["WICZ", "WITZ"]:
  360. next = ('TS', 'FX', 4)
  361. else: # default is to skip it
  362. next = (None, 1)
  363. elif ch == 'X':
  364. # french e.g. breaux
  365. next = (None,)
  366. if not(pos == last and (text[pos - 3:pos] in ["IAU", "EAU"] \
  367. or text[pos - 2:pos] in ['AU', 'OU'])):
  368. next = ('KS',)
  369. if text[pos + 1] in ['C', 'X']:
  370. next = next + (2,)
  371. else:
  372. next = next + (1,)
  373. elif ch == 'Z':
  374. # chinese pinyin e.g. 'zhao'
  375. if text[pos + 1] == 'H':
  376. next = ('J',)
  377. elif text[pos + 1:pos + 3] in ["ZO", "ZI", "ZA"] \
  378. or (slavo_germanic and pos > first and text[pos - 1] != 'T'):
  379. next = ('S', 'TS')
  380. else:
  381. next = ('S',)
  382. if text[pos + 1] == 'Z':
  383. next = next + (2,)
  384. else:
  385. next = next + (1,)
  386. else:
  387. next = (None, 1)
  388. if len(next) == 2:
  389. if next[0]:
  390. primary += next[0]
  391. secondary += next[0]
  392. pos += next[1]
  393. elif len(next) == 3:
  394. if next[0]:
  395. primary += next[0]
  396. if next[1]:
  397. secondary += next[1]
  398. pos += next[2]
  399. if primary == secondary:
  400. return (primary, None)
  401. else:
  402. return (primary, secondary)