/stemmer.go

http://github.com/agonopol/go-stem · Go · 354 lines · 334 code · 18 blank · 2 comment · 221 complexity · b3e088cb277aa34bfd1cfaa3ba5e6df0 MD5 · raw file

  1. package stemmer
  2. import "fmt"
  3. import "bytes"
  4. func ingore() {
  5. fmt.Sprintf("")
  6. }
  7. func Consonant(body []byte, offset int) bool {
  8. switch body[offset] {
  9. case 'A', 'E', 'I', 'O', 'U', 'a', 'e', 'i', 'o', 'u':
  10. return false
  11. case 'Y', 'y':
  12. if offset == 0 {
  13. return true
  14. }
  15. return offset > 0 && !Consonant(body, offset-1)
  16. }
  17. return true
  18. }
  19. func Vowel(body []byte, offset int) bool {
  20. return !Consonant(body, offset)
  21. }
  22. const (
  23. vowel_state = iota
  24. consonant_state
  25. )
  26. func Measure(body []byte) int {
  27. meansure := 0
  28. if len(body) > 0 {
  29. var state int
  30. if Vowel(body, 0) {
  31. state = vowel_state
  32. } else {
  33. state = consonant_state
  34. }
  35. for i := 0; i < len(body); i++ {
  36. if Vowel(body, i) && state == consonant_state {
  37. state = vowel_state
  38. } else if Consonant(body, i) && state == vowel_state {
  39. state = consonant_state
  40. meansure++
  41. }
  42. }
  43. }
  44. return meansure
  45. }
  46. func hasVowel(body []byte) bool {
  47. for i := 0; i < len(body); i++ {
  48. if Vowel(body, i) {
  49. return true
  50. }
  51. }
  52. return false
  53. }
  54. func one_a(body []byte) []byte {
  55. if bytes.HasSuffix(body, []byte("sses")) || bytes.HasSuffix(body, []byte("ies")) {
  56. return body[:len(body)-2]
  57. } else if bytes.HasSuffix(body, []byte("ss")) {
  58. return body
  59. } else if bytes.HasSuffix(body, []byte("s")) {
  60. return body[:len(body)-1]
  61. }
  62. return body
  63. }
  64. func star_o(body []byte) bool {
  65. size := len(body) - 1
  66. if size >= 2 && Consonant(body, size-2) && Vowel(body, size-1) && Consonant(body, size) {
  67. return body[size] != 'w' && body[size] != 'x' && body[size] != 'y'
  68. }
  69. return false
  70. }
  71. func one_b_a(body []byte) []byte {
  72. size := len(body)
  73. if bytes.HasSuffix(body, []byte("at")) {
  74. return append(body, 'e')
  75. } else if bytes.HasSuffix(body, []byte("bl")) {
  76. return append(body, 'e')
  77. } else if bytes.HasSuffix(body, []byte("iz")) {
  78. return append(body, 'e')
  79. } else if Consonant(body, size-1) && Consonant(body, size-2) && body[size-1] == body[size-2] {
  80. if body[size-1] != 'l' && body[size-1] != 's' && body[size-1] != 'z' {
  81. return body[:size-1]
  82. }
  83. } else if star_o(body) && Measure(body) == 1 {
  84. return append(body, 'e')
  85. }
  86. return body
  87. }
  88. func one_b(body []byte) []byte {
  89. if bytes.HasSuffix(body, []byte("eed")) {
  90. if Measure(body[:len(body)-3]) > 0 {
  91. return body[:len(body)-1]
  92. }
  93. } else if bytes.HasSuffix(body, []byte("ed")) {
  94. if hasVowel(body[:len(body)-2]) {
  95. return one_b_a(body[:len(body)-2])
  96. }
  97. } else if bytes.HasSuffix(body, []byte("ing")) {
  98. if hasVowel(body[:len(body)-3]) {
  99. return one_b_a(body[:len(body)-3])
  100. }
  101. }
  102. return body
  103. }
  104. func one_c(body []byte) []byte {
  105. if bytes.HasSuffix(body, []byte("y")) && hasVowel(body[:len(body)-1]) {
  106. body[len(body)-1] = 'i'
  107. return body
  108. }
  109. return body
  110. }
  111. func two(body []byte) []byte {
  112. if bytes.HasSuffix(body, []byte("ational")) {
  113. if Measure(body[:len(body)-7]) > 0 {
  114. return append(body[:len(body)-7], []byte("ate")...)
  115. }
  116. } else if bytes.HasSuffix(body, []byte("tional")) {
  117. if Measure(body[:len(body)-6]) > 0 {
  118. return body[:len(body)-2]
  119. }
  120. } else if bytes.HasSuffix(body, []byte("enci")) || bytes.HasSuffix(body, []byte("anci")) {
  121. if Measure(body[:len(body)-4]) > 0 {
  122. return append(body[:len(body)-1], 'e')
  123. }
  124. } else if bytes.HasSuffix(body, []byte("izer")) {
  125. if Measure(body[:len(body)-4]) > 0 {
  126. return append(body[:len(body)-4], []byte("ize")...)
  127. }
  128. } else if bytes.HasSuffix(body, []byte("abli")) {
  129. if Measure(body[:len(body)-4]) > 0 {
  130. return append(body[:len(body)-4], []byte("able")...)
  131. }
  132. // To match the published algorithm, delete the following phrase
  133. } else if bytes.HasSuffix(body, []byte("bli")) {
  134. if Measure(body[:len(body)-3]) > 0 {
  135. return append(body[:len(body)-1], 'e')
  136. }
  137. } else if bytes.HasSuffix(body, []byte("alli")) {
  138. if Measure(body[:len(body)-4]) > 0 {
  139. return append(body[:len(body)-4], []byte("al")...)
  140. }
  141. } else if bytes.HasSuffix(body, []byte("entli")) {
  142. if Measure(body[:len(body)-5]) > 0 {
  143. return append(body[:len(body)-5], []byte("ent")...)
  144. }
  145. } else if bytes.HasSuffix(body, []byte("eli")) {
  146. if Measure(body[:len(body)-3]) > 0 {
  147. return append(body[:len(body)-3], []byte("e")...)
  148. }
  149. } else if bytes.HasSuffix(body, []byte("ousli")) {
  150. if Measure(body[:len(body)-5]) > 0 {
  151. return append(body[:len(body)-5], []byte("ous")...)
  152. }
  153. } else if bytes.HasSuffix(body, []byte("ization")) {
  154. if Measure(body[:len(body)-7]) > 0 {
  155. return append(body[:len(body)-7], []byte("ize")...)
  156. }
  157. } else if bytes.HasSuffix(body, []byte("ation")) {
  158. if Measure(body[:len(body)-5]) > 0 {
  159. return append(body[:len(body)-5], []byte("ate")...)
  160. }
  161. } else if bytes.HasSuffix(body, []byte("ator")) {
  162. if Measure(body[:len(body)-4]) > 0 {
  163. return append(body[:len(body)-4], []byte("ate")...)
  164. }
  165. } else if bytes.HasSuffix(body, []byte("alism")) {
  166. if Measure(body[:len(body)-5]) > 0 {
  167. return append(body[:len(body)-5], []byte("al")...)
  168. }
  169. } else if bytes.HasSuffix(body, []byte("iveness")) {
  170. if Measure(body[:len(body)-7]) > 0 {
  171. return append(body[:len(body)-7], []byte("ive")...)
  172. }
  173. } else if bytes.HasSuffix(body, []byte("fulness")) {
  174. if Measure(body[:len(body)-7]) > 0 {
  175. return append(body[:len(body)-7], []byte("ful")...)
  176. }
  177. } else if bytes.HasSuffix(body, []byte("ousness")) {
  178. if Measure(body[:len(body)-7]) > 0 {
  179. return append(body[:len(body)-7], []byte("ous")...)
  180. }
  181. } else if bytes.HasSuffix(body, []byte("aliti")) {
  182. if Measure(body[:len(body)-5]) > 0 {
  183. return append(body[:len(body)-5], []byte("al")...)
  184. }
  185. } else if bytes.HasSuffix(body, []byte("iviti")) {
  186. if Measure(body[:len(body)-5]) > 0 {
  187. return append(body[:len(body)-5], []byte("ive")...)
  188. }
  189. } else if bytes.HasSuffix(body, []byte("biliti")) {
  190. if Measure(body[:len(body)-6]) > 0 {
  191. return append(body[:len(body)-6], []byte("ble")...)
  192. }
  193. // To match the published algorithm, delete the following phrase
  194. } else if bytes.HasSuffix(body, []byte("logi")) {
  195. if Measure(body[:len(body)-4]) > 0 {
  196. return body[:len(body)-1]
  197. }
  198. }
  199. return body
  200. }
  201. func three(body []byte) []byte {
  202. if bytes.HasSuffix(body, []byte("icate")) {
  203. if Measure(body[:len(body)-5]) > 0 {
  204. return body[:len(body)-3]
  205. }
  206. } else if bytes.HasSuffix(body, []byte("ative")) {
  207. if Measure(body[:len(body)-5]) > 0 {
  208. return body[:len(body)-5]
  209. }
  210. } else if bytes.HasSuffix(body, []byte("alize")) {
  211. if Measure(body[:len(body)-5]) > 0 {
  212. return body[:len(body)-3]
  213. }
  214. } else if bytes.HasSuffix(body, []byte("iciti")) {
  215. if Measure(body[:len(body)-5]) > 0 {
  216. return body[:len(body)-3]
  217. }
  218. } else if bytes.HasSuffix(body, []byte("ical")) {
  219. if Measure(body[:len(body)-4]) > 0 {
  220. return body[:len(body)-2]
  221. }
  222. } else if bytes.HasSuffix(body, []byte("ful")) {
  223. if Measure(body[:len(body)-3]) > 0 {
  224. return body[:len(body)-3]
  225. }
  226. } else if bytes.HasSuffix(body, []byte("ness")) {
  227. if Measure(body[:len(body)-4]) > 0 {
  228. return body[:len(body)-4]
  229. }
  230. }
  231. return body
  232. }
  233. func four(body []byte) []byte {
  234. if bytes.HasSuffix(body, []byte("al")) {
  235. if Measure(body[:len(body)-2]) > 1 {
  236. return body[:len(body)-2]
  237. }
  238. } else if bytes.HasSuffix(body, []byte("ance")) {
  239. if Measure(body[:len(body)-4]) > 1 {
  240. return body[:len(body)-4]
  241. }
  242. } else if bytes.HasSuffix(body, []byte("ence")) {
  243. if Measure(body[:len(body)-4]) > 1 {
  244. return body[:len(body)-4]
  245. }
  246. } else if bytes.HasSuffix(body, []byte("er")) {
  247. if Measure(body[:len(body)-2]) > 1 {
  248. return body[:len(body)-2]
  249. }
  250. } else if bytes.HasSuffix(body, []byte("ic")) {
  251. if Measure(body[:len(body)-2]) > 1 {
  252. return body[:len(body)-2]
  253. }
  254. } else if bytes.HasSuffix(body, []byte("able")) {
  255. if Measure(body[:len(body)-4]) > 1 {
  256. return body[:len(body)-4]
  257. }
  258. } else if bytes.HasSuffix(body, []byte("ible")) {
  259. if Measure(body[:len(body)-4]) > 1 {
  260. return body[:len(body)-4]
  261. }
  262. } else if bytes.HasSuffix(body, []byte("ant")) {
  263. if Measure(body[:len(body)-3]) > 1 {
  264. return body[:len(body)-3]
  265. }
  266. } else if bytes.HasSuffix(body, []byte("ement")) {
  267. if Measure(body[:len(body)-5]) > 1 {
  268. return body[:len(body)-5]
  269. }
  270. } else if bytes.HasSuffix(body, []byte("ment")) {
  271. if Measure(body[:len(body)-4]) > 1 {
  272. return body[:len(body)-4]
  273. }
  274. } else if bytes.HasSuffix(body, []byte("ent")) {
  275. if Measure(body[:len(body)-3]) > 1 {
  276. return body[:len(body)-3]
  277. }
  278. } else if bytes.HasSuffix(body, []byte("ion")) {
  279. if Measure(body[:len(body)-3]) > 1 {
  280. if len(body) > 4 && (body[len(body)-4] == 's' || body[len(body)-4] == 't') {
  281. return body[:len(body)-3]
  282. }
  283. }
  284. } else if bytes.HasSuffix(body, []byte("ou")) {
  285. if Measure(body[:len(body)-2]) > 1 {
  286. return body[:len(body)-2]
  287. }
  288. } else if bytes.HasSuffix(body, []byte("ism")) {
  289. if Measure(body[:len(body)-3]) > 1 {
  290. return body[:len(body)-3]
  291. }
  292. } else if bytes.HasSuffix(body, []byte("ate")) {
  293. if Measure(body[:len(body)-3]) > 1 {
  294. return body[:len(body)-3]
  295. }
  296. } else if bytes.HasSuffix(body, []byte("iti")) {
  297. if Measure(body[:len(body)-3]) > 1 {
  298. return body[:len(body)-3]
  299. }
  300. } else if bytes.HasSuffix(body, []byte("ous")) {
  301. if Measure(body[:len(body)-3]) > 1 {
  302. return body[:len(body)-3]
  303. }
  304. } else if bytes.HasSuffix(body, []byte("ive")) {
  305. if Measure(body[:len(body)-3]) > 1 {
  306. return body[:len(body)-3]
  307. }
  308. } else if bytes.HasSuffix(body, []byte("ize")) {
  309. if Measure(body[:len(body)-3]) > 1 {
  310. return body[:len(body)-3]
  311. }
  312. }
  313. return body
  314. }
  315. func five_a(body []byte) []byte {
  316. if bytes.HasSuffix(body, []byte("e")) && Measure(body[:len(body)-1]) > 1 {
  317. return body[:len(body)-1]
  318. } else if bytes.HasSuffix(body, []byte("e")) && Measure(body[:len(body)-1]) == 1 && !star_o(body[:len(body)-1]) {
  319. return body[:len(body)-1]
  320. }
  321. return body
  322. }
  323. func five_b(body []byte) []byte {
  324. size := len(body)
  325. if Measure(body) > 1 && Consonant(body, size-1) && Consonant(body, size-2) && body[size-1] == body[size-2] && body[size-1] == 'l' {
  326. return body[:len(body)-1]
  327. }
  328. return body
  329. }
  330. func Stem(body []byte) []byte {
  331. word := bytes.TrimSpace(bytes.ToLower(body))
  332. if len(word) > 2 {
  333. return five_b(five_a(four(three(two(one_c(one_b(one_a(word))))))))
  334. }
  335. return word
  336. }