/vendor/gems/ferret-0.11.4/test/unit/analysis/tc_token_stream.rb


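# Unit tests for Ferret's analysis token streams: the Token class itself,
# the ASCII and multi-byte tokenizers (letter, whitespace, standard and
# regexp based), the built-in filters (lower-case, hyphen, mapping,
# stop-word, stem), and examples of custom tokenizers and filters.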
require File.dirname(__FILE__) + "/../../test_helper"

puts "Loading once"

class TokenTest < Test::Unit::TestCase
  include Ferret::Analysis

  def test_token
    t = Token.new("text", 1, 2, 3)
    assert_equal("text", t.text)
    assert_equal(1, t.start)
    assert_equal(2, t.end)
    assert_equal(3, t.pos_inc)
    t.text = "yada yada yada"
    t.start = 11
    t.end = 12
    t.pos_inc = 13
    assert_equal("yada yada yada", t.text)
    assert_equal(11, t.start)
    assert_equal(12, t.end)
    assert_equal(13, t.pos_inc)

    t = Token.new("text", 1, 2)
    assert_equal(1, t.pos_inc)
  end
end
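
# AsciiLetterTokenizer splits on anything that is not an ASCII letter, so
# "DBalmain@gmail.com" yields three tokens and digit runs are dropped.
# Wrapping the tokenizer in AsciiLowerCaseFilter down-cases each token.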
class AsciiLetterTokenizerTest < Test::Unit::TestCase
  include Ferret::Analysis

  def test_letter_tokenizer()
    input = 'DBalmain@gmail.com is My e-mail 523@#$ ADDRESS. 23#@$'
    t = AsciiLetterTokenizer.new(input)
    assert_equal(Token.new("DBalmain", 0, 8), t.next())
    assert_equal(Token.new("gmail", 9, 14), t.next())
    assert_equal(Token.new("com", 15, 18), t.next())
    assert_equal(Token.new("is", 19, 21), t.next())
    assert_equal(Token.new("My", 22, 24), t.next())
    assert_equal(Token.new("e", 25, 26), t.next())
    assert_equal(Token.new("mail", 27, 31), t.next())
    assert_equal(Token.new("ADDRESS", 39, 46), t.next())
    assert(! t.next())
    t.text = "one_two three"
    assert_equal(Token.new("one", 0, 3), t.next())
    assert_equal(Token.new("two", 4, 7), t.next())
    assert_equal(Token.new("three", 8, 13), t.next())
    assert(! t.next())
    t = AsciiLowerCaseFilter.new(AsciiLetterTokenizer.new(input))
    assert_equal(Token.new("dbalmain", 0, 8), t.next())
    assert_equal(Token.new("gmail", 9, 14), t.next())
    assert_equal(Token.new("com", 15, 18), t.next())
    assert_equal(Token.new("is", 19, 21), t.next())
    assert_equal(Token.new("my", 22, 24), t.next())
    assert_equal(Token.new("e", 25, 26), t.next())
    assert_equal(Token.new("mail", 27, 31), t.next())
    assert_equal(Token.new("address", 39, 46), t.next())
    assert(! t.next())
  end
end
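
# LetterTokenizer is the multi-byte-aware counterpart. Token offsets count
# bytes, not characters, which is why 'DBalmän' spans 0..8. Passing true as
# the second constructor argument lower-cases while tokenizing, matching
# the LowerCaseFilter output. Note the class-level `if` modifier below:
# these tests only run under a UTF-8 locale.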
class LetterTokenizerTest < Test::Unit::TestCase
  include Ferret::Analysis

  def test_letter_tokenizer()
    input = 'DBalmän@gmail.com is My e-mail 52   #$ address. 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
    t = LetterTokenizer.new(input)
    assert_equal(Token.new('DBalmän', 0, 8), t.next)
    assert_equal(Token.new('gmail', 9, 14), t.next)
    assert_equal(Token.new('com', 15, 18), t.next)
    assert_equal(Token.new('is', 19, 21), t.next)
    assert_equal(Token.new('My', 22, 24), t.next)
    assert_equal(Token.new('e', 25, 26), t.next)
    assert_equal(Token.new('mail', 27, 31), t.next)
    assert_equal(Token.new('address', 40, 47), t.next)
    assert_equal(Token.new('ÁÄGÇ', 55, 62), t.next)
    assert_equal(Token.new('ÊËÌ', 64, 70), t.next)
    assert_equal(Token.new('ÚØÃ', 72, 78), t.next)
    assert_equal(Token.new('ÖÎÍ', 80, 86), t.next)
    assert(! t.next())
    t.text = "one_two three"
    assert_equal(Token.new("one", 0, 3), t.next())
    assert_equal(Token.new("two", 4, 7), t.next())
    assert_equal(Token.new("three", 8, 13), t.next())
    assert(! t.next())
    t = LowerCaseFilter.new(LetterTokenizer.new(input))
    assert_equal(Token.new('dbalmän', 0, 8), t.next)
    assert_equal(Token.new('gmail', 9, 14), t.next)
    assert_equal(Token.new('com', 15, 18), t.next)
    assert_equal(Token.new('is', 19, 21), t.next)
    assert_equal(Token.new('my', 22, 24), t.next)
    assert_equal(Token.new('e', 25, 26), t.next)
    assert_equal(Token.new('mail', 27, 31), t.next)
    assert_equal(Token.new('address', 40, 47), t.next)
    assert_equal(Token.new('áägç', 55, 62), t.next)
    assert_equal(Token.new('êëì', 64, 70), t.next)
    assert_equal(Token.new('úøã', 72, 78), t.next)
    assert_equal(Token.new('öîí', 80, 86), t.next)
    assert(! t.next())
    t = LetterTokenizer.new(input, true)
    assert_equal(Token.new('dbalmän', 0, 8), t.next)
    assert_equal(Token.new('gmail', 9, 14), t.next)
    assert_equal(Token.new('com', 15, 18), t.next)
    assert_equal(Token.new('is', 19, 21), t.next)
    assert_equal(Token.new('my', 22, 24), t.next)
    assert_equal(Token.new('e', 25, 26), t.next)
    assert_equal(Token.new('mail', 27, 31), t.next)
    assert_equal(Token.new('address', 40, 47), t.next)
    assert_equal(Token.new('áägç', 55, 62), t.next)
    assert_equal(Token.new('êëì', 64, 70), t.next)
    assert_equal(Token.new('úøã', 72, 78), t.next)
    assert_equal(Token.new('öîí', 80, 86), t.next)
    assert(! t.next())
  end
end if (/utf-8/i =~ Ferret.locale)
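
# The whitespace tokenizers split on whitespace only, so punctuation such
# as '@', '-' and a trailing '.' stays inside the tokens.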
class AsciiWhiteSpaceTokenizerTest < Test::Unit::TestCase
  include Ferret::Analysis

  def test_whitespace_tokenizer()
    input = 'DBalmain@gmail.com is My e-mail 52   #$ ADDRESS. 23#@$'
    t = AsciiWhiteSpaceTokenizer.new(input)
    assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
    assert_equal(Token.new('is', 19, 21), t.next)
    assert_equal(Token.new('My', 22, 24), t.next)
    assert_equal(Token.new('e-mail', 25, 31), t.next)
    assert_equal(Token.new('52', 32, 34), t.next)
    assert_equal(Token.new('#$', 37, 39), t.next)
    assert_equal(Token.new('ADDRESS.', 40, 48), t.next)
    assert_equal(Token.new('23#@$', 49, 54), t.next)
    assert(! t.next())
    t.text = "one_two three"
    assert_equal(Token.new("one_two", 0, 7), t.next())
    assert_equal(Token.new("three", 8, 13), t.next())
    assert(! t.next())
    t = AsciiLowerCaseFilter.new(AsciiWhiteSpaceTokenizer.new(input))
    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
    assert_equal(Token.new('is', 19, 21), t.next)
    assert_equal(Token.new('my', 22, 24), t.next)
    assert_equal(Token.new('e-mail', 25, 31), t.next)
    assert_equal(Token.new('52', 32, 34), t.next)
    assert_equal(Token.new('#$', 37, 39), t.next)
    assert_equal(Token.new('address.', 40, 48), t.next)
    assert_equal(Token.new('23#@$', 49, 54), t.next)
    assert(! t.next())
  end
end
class WhiteSpaceTokenizerTest < Test::Unit::TestCase
  include Ferret::Analysis

  def test_whitespace_tokenizer()
    input = 'DBalmän@gmail.com is My e-mail 52   #$ address. 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
    t = WhiteSpaceTokenizer.new(input)
    assert_equal(Token.new('DBalmän@gmail.com', 0, 18), t.next)
    assert_equal(Token.new('is', 19, 21), t.next)
    assert_equal(Token.new('My', 22, 24), t.next)
    assert_equal(Token.new('e-mail', 25, 31), t.next)
    assert_equal(Token.new('52', 32, 34), t.next)
    assert_equal(Token.new('#$', 37, 39), t.next)
    assert_equal(Token.new('address.', 40, 48), t.next)
    assert_equal(Token.new('23#@$', 49, 54), t.next)
    assert_equal(Token.new('ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ', 55, 86), t.next)
    assert(! t.next())
    t.text = "one_two three"
    assert_equal(Token.new("one_two", 0, 7), t.next())
    assert_equal(Token.new("three", 8, 13), t.next())
    assert(! t.next())
    t = LowerCaseFilter.new(WhiteSpaceTokenizer.new(input))
    assert_equal(Token.new('dbalmän@gmail.com', 0, 18), t.next)
    assert_equal(Token.new('is', 19, 21), t.next)
    assert_equal(Token.new('my', 22, 24), t.next)
    assert_equal(Token.new('e-mail', 25, 31), t.next)
    assert_equal(Token.new('52', 32, 34), t.next)
    assert_equal(Token.new('#$', 37, 39), t.next)
    assert_equal(Token.new('address.', 40, 48), t.next)
    assert_equal(Token.new('23#@$', 49, 54), t.next)
    assert_equal(Token.new('áägç®êëì¯úøã¬öîí', 55, 86), t.next)
    assert(! t.next())
    t = WhiteSpaceTokenizer.new(input, true)
    assert_equal(Token.new('dbalmän@gmail.com', 0, 18), t.next)
    assert_equal(Token.new('is', 19, 21), t.next)
    assert_equal(Token.new('my', 22, 24), t.next)
    assert_equal(Token.new('e-mail', 25, 31), t.next)
    assert_equal(Token.new('52', 32, 34), t.next)
    assert_equal(Token.new('#$', 37, 39), t.next)
    assert_equal(Token.new('address.', 40, 48), t.next)
    assert_equal(Token.new('23#@$', 49, 54), t.next)
    assert_equal(Token.new('áägç®êëì¯úøã¬öîí', 55, 86), t.next)
    assert(! t.next())
  end
end if (/utf-8/i =~ Ferret.locale)
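
# The standard tokenizers handle compound forms: e-mail addresses stay
# whole, acronyms such as T.N.T. collapse to TNT, URLs lose their protocol
# prefix and trailing slash, and serial numbers like 123-1235-ASD-1234
# survive intact.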
class AsciiStandardTokenizerTest < Test::Unit::TestCase
  include Ferret::Analysis

  def test_standard_tokenizer()
    input = 'DBalmain@gmail.com is My e-mail 52   #$ Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234'
    t = AsciiStandardTokenizer.new(input)
    assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
    assert_equal(Token.new('is', 19, 21), t.next)
    assert_equal(Token.new('My', 22, 24), t.next)
    assert_equal(Token.new('e-mail', 25, 31), t.next)
    assert_equal(Token.new('52', 32, 34), t.next)
    assert_equal(Token.new('Address', 40, 47), t.next)
    assert_equal(Token.new('23', 49, 51), t.next)
    assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
    assert_equal(Token.new('TNT', 86, 91), t.next)
    assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
    assert(! t.next())
    t.text = "one_two three"
    assert_equal(Token.new("one_two", 0, 7), t.next())
    assert_equal(Token.new("three", 8, 13), t.next())
    assert(! t.next())
    t = AsciiLowerCaseFilter.new(AsciiStandardTokenizer.new(input))
    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
    assert_equal(Token.new('is', 19, 21), t.next)
    assert_equal(Token.new('my', 22, 24), t.next)
    assert_equal(Token.new('e-mail', 25, 31), t.next)
    assert_equal(Token.new('52', 32, 34), t.next)
    assert_equal(Token.new('address', 40, 47), t.next)
    assert_equal(Token.new('23', 49, 51), t.next)
    assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
    assert_equal(Token.new('tnt', 86, 91), t.next)
    assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
    assert(! t.next())
  end
end
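
# StandardTokenizer adds multi-byte support on top of the ASCII version;
# the trailing UTF-8 letter runs tokenize just as in LetterTokenizerTest.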
class StandardTokenizerTest < Test::Unit::TestCase
  include Ferret::Analysis

  def test_standard_tokenizer()
    input = 'DBalmán@gmail.com is My e-mail 52   #$ Address. 23#@$ http://www.google.com/res_345/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
    t = StandardTokenizer.new(input)
    assert_equal(Token.new('DBalmán@gmail.com', 0, 18), t.next)
    assert_equal(Token.new('is', 19, 21), t.next)
    assert_equal(Token.new('My', 22, 24), t.next)
    assert_equal(Token.new('e-mail', 25, 31), t.next)
    assert_equal(Token.new('52', 32, 34), t.next)
    assert_equal(Token.new('Address', 40, 47), t.next)
    assert_equal(Token.new('23', 49, 51), t.next)
    assert_equal(Token.new('www.google.com/res_345', 55, 84), t.next)
    assert_equal(Token.new('TNT', 86, 91), t.next)
    assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
    assert_equal(Token.new('23', 111, 113), t.next)
    assert_equal(Token.new('ÁÄGÇ', 117, 124), t.next)
    assert_equal(Token.new('ÊËÌ', 126, 132), t.next)
    assert_equal(Token.new('ÚØÃ', 134, 140), t.next)
    assert_equal(Token.new('ÖÎÍ', 142, 148), t.next)
    assert(! t.next())
    t.text = "one_two three"
    assert_equal(Token.new("one_two", 0, 7), t.next())
    assert_equal(Token.new("three", 8, 13), t.next())
    assert(! t.next())
    t = LowerCaseFilter.new(StandardTokenizer.new(input))
    assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t.next)
    assert_equal(Token.new('is', 19, 21), t.next)
    assert_equal(Token.new('my', 22, 24), t.next)
    assert_equal(Token.new('e-mail', 25, 31), t.next)
    assert_equal(Token.new('52', 32, 34), t.next)
    assert_equal(Token.new('address', 40, 47), t.next)
    assert_equal(Token.new('23', 49, 51), t.next)
    assert_equal(Token.new('www.google.com/res_345', 55, 84), t.next)
    assert_equal(Token.new('tnt', 86, 91), t.next)
    assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
    assert_equal(Token.new('23', 111, 113), t.next)
    assert_equal(Token.new('áägç', 117, 124), t.next)
    assert_equal(Token.new('êëì', 126, 132), t.next)
    assert_equal(Token.new('úøã', 134, 140), t.next)
    assert_equal(Token.new('öîí', 142, 148), t.next)
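
    # HyphenFilter emits the joined form of a hyphenated word first, then
    # the individual parts at the same position: 'e' follows 'email' with a
    # position increment of 0 and 'mail' follows with an increment of 1.
    # Longer hyphenated runs such as 123-1235-asd-1234 pass through whole.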
    input = "e-mail 123-1235-asd-1234 http://www.davebalmain.com/trac-site/"
    t = HyphenFilter.new(StandardTokenizer.new(input))
    assert_equal(Token.new('email', 0, 6), t.next)
    assert_equal(Token.new('e', 0, 1, 0), t.next)
    assert_equal(Token.new('mail', 2, 6, 1), t.next)
    assert_equal(Token.new('123-1235-asd-1234', 7, 24), t.next)
    assert_equal(Token.new('www.davebalmain.com/trac-site', 25, 61), t.next)
    assert(! t.next())
  end
end if (/utf-8/i =~ Ferret.locale)
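
# RegExpTokenizer tokenizes with an arbitrary regular expression. The
# default pattern keeps e-mail addresses, URLs, acronyms and apostrophe
# forms whole (first batch of assertions); a custom pattern such as
# /\w{2,}/ can be supplied instead (second batch, via t2).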
class RegExpTokenizerTest < Test::Unit::TestCase
  include Ferret::Analysis

  ALPHA           = /[[:alpha:]_-]+/
  APOSTROPHE      = /#{ALPHA}('#{ALPHA})+/
  ACRONYM         = /#{ALPHA}\.(#{ALPHA}\.)+/
  ACRONYM_WORD    = /^#{ACRONYM}$/
  APOSTROPHE_WORD = /^#{APOSTROPHE}$/

  def test_reg_exp_tokenizer()
    input = 'DBalmain@gmail.com is My e-mail 52   #$ Address. 23#@$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
    t = RegExpTokenizer.new(input)
    assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
    assert_equal(Token.new('is', 19, 21), t.next)
    assert_equal(Token.new('My', 22, 24), t.next)
    assert_equal(Token.new('e-mail', 25, 31), t.next)
    assert_equal(Token.new('52', 32, 34), t.next)
    assert_equal(Token.new('Address', 40, 47), t.next)
    assert_equal(Token.new('23', 49, 51), t.next)
    assert_equal(Token.new('http://www.google.com/RESULT_3.html', 55, 90), t.next)
    assert_equal(Token.new('T.N.T.', 91, 97), t.next)
    assert_equal(Token.new('123-1235-ASD-1234', 98, 115), t.next)
    assert_equal(Token.new('23', 116, 118), t.next)
    assert_equal(Token.new('Rob\'s', 119, 124), t.next)
    assert(! t.next())
    t.text = "one_two three"
    assert_equal(Token.new("one_two", 0, 7), t.next())
    assert_equal(Token.new("three", 8, 13), t.next())
    assert(! t.next())
    t = LowerCaseFilter.new(RegExpTokenizer.new(input))
    t2 = LowerCaseFilter.new(RegExpTokenizer.new(input, /\w{2,}/))
    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
    assert_equal(Token.new('is', 19, 21), t.next)
    assert_equal(Token.new('my', 22, 24), t.next)
    assert_equal(Token.new('e-mail', 25, 31), t.next)
    assert_equal(Token.new('52', 32, 34), t.next)
    assert_equal(Token.new('address', 40, 47), t.next)
    assert_equal(Token.new('23', 49, 51), t.next)
    assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
    assert_equal(Token.new('t.n.t.', 91, 97), t.next)
    assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
    assert_equal(Token.new('23', 116, 118), t.next)
    assert_equal(Token.new('rob\'s', 119, 124), t.next)
    assert(! t.next())
    assert_equal(Token.new('dbalmain', 0, 8), t2.next)
    assert_equal(Token.new('gmail', 9, 14), t2.next)
    assert_equal(Token.new('com', 15, 18), t2.next)
    assert_equal(Token.new('is', 19, 21), t2.next)
    assert_equal(Token.new('my', 22, 24), t2.next)
    assert_equal(Token.new('mail', 27, 31), t2.next)
    assert_equal(Token.new('52', 32, 34), t2.next)
    assert_equal(Token.new('address', 40, 47), t2.next)
    assert_equal(Token.new('23', 49, 51), t2.next)
    assert_equal(Token.new('http', 55, 59), t2.next)
    assert_equal(Token.new('www', 62, 65), t2.next)
    assert_equal(Token.new('google', 66, 72), t2.next)
    assert_equal(Token.new('com', 73, 76), t2.next)
    assert_equal(Token.new('result_3', 77, 85), t2.next)
    assert_equal(Token.new('html', 86, 90), t2.next)
    assert_equal(Token.new('123', 98, 101), t2.next)
    assert_equal(Token.new('1235', 102, 106), t2.next)
    assert_equal(Token.new('asd', 107, 110), t2.next)
    assert_equal(Token.new('1234', 111, 115), t2.next)
    assert_equal(Token.new('23', 116, 118), t2.next)
    assert_equal(Token.new('rob', 119, 122), t2.next)
    assert(! t2.next())
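
    # A block passed to RegExpTokenizer post-processes each match before it
    # becomes a token: here acronyms lose their dots (T.N.T. -> TNT) and a
    # possessive 's is stripped (Rob's -> Rob).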
    t = RegExpTokenizer.new(input) do |str|
      if str =~ ACRONYM_WORD
        str.gsub!(/\./, '')
      elsif str =~ APOSTROPHE_WORD
        str.gsub!(/'[sS]$/, '')
      end
      str
    end
    t = LowerCaseFilter.new(t)
    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
    assert_equal(Token.new('is', 19, 21), t.next)
    assert_equal(Token.new('my', 22, 24), t.next)
    assert_equal(Token.new('e-mail', 25, 31), t.next)
    assert_equal(Token.new('52', 32, 34), t.next)
    assert_equal(Token.new('address', 40, 47), t.next)
    assert_equal(Token.new('23', 49, 51), t.next)
    assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
    assert_equal(Token.new('tnt', 91, 97), t.next)
    assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
    assert_equal(Token.new('23', 116, 118), t.next)
    assert_equal(Token.new('rob', 119, 124), t.next)
    assert(! t.next())
  end
end
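
# MappingFilter rewrites characters in each token according to the given
# hash; here it folds accented Latin letters to ASCII equivalents. The
# start/end offsets still describe the original bytes, so the shortened
# token text keeps the original byte spans.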
class MappingFilterTest < Test::Unit::TestCase
  include Ferret::Analysis

  def test_mapping_filter()
    mapping = {
      ['à','á','â','ã','ä','å','ā','ă'] => 'a',
      'æ' => 'ae',
      ['ď','đ'] => 'd',
      ['ç','ć','č','ĉ','ċ'] => 'c',
      ['è','é','ê','ë','ē','ę','ě','ĕ','ė'] => 'e',
      ['ƒ'] => 'f',
      ['ĝ','ğ','ġ','ģ'] => 'g',
      ['ĥ','ħ'] => 'h',
      ['ì','ì','í','î','ï','ī','ĩ','ĭ'] => 'i',
      ['į','ı','ij','ĵ'] => 'j',
      ['ķ','ĸ'] => 'k',
      ['ł','ľ','ĺ','ļ','ŀ'] => 'l',
      ['ñ','ń','ň','ņ','ʼn','ŋ'] => 'n',
      ['ò','ó','ô','õ','ö','ø','ō','ő','ŏ','ŏ'] => 'o',
      'œ' => 'oek',
      'ą' => 'q',
      ['ŕ','ř','ŗ'] => 'r',
      ['ś','š','ş','ŝ','ș'] => 's',
      ['ť','ţ','ŧ','ț'] => 't',
      ['ù','ú','û','ü','ū','ů','ű','ŭ','ũ','ų'] => 'u',
      'ŵ' => 'w',
      ['ý','ÿ','ŷ'] => 'y',
      ['ž','ż','ź'] => 'z'
    }
    input = <<END
aàáâãäåāăb cæd eďđf gçćčĉċh ièéêëēęěĕėj kƒl mĝğġģn oĥħp qììíîïīĩĭr sįıijĵt uķĸv
włľĺļŀx yñńňņʼnŋz aòóôõöøōőŏŏb cœd eąf gŕřŗh iśšşŝșj kťţŧțl mùúûüūůűŭũųn oŵp
qýÿŷr sžżźt
END
    t = MappingFilter.new(LetterTokenizer.new(input), mapping)
    assert_equal(Token.new('aaaaaaaaab', 0, 18), t.next)
    assert_equal(Token.new('caed', 19, 23), t.next)
    assert_equal(Token.new('eddf', 24, 30), t.next)
    assert_equal(Token.new('gccccch', 31, 43), t.next)
    assert_equal(Token.new('ieeeeeeeeej', 44, 64), t.next)
    assert_equal(Token.new('kfl', 65, 69), t.next)
    assert_equal(Token.new('mggggn', 70, 80), t.next)
    assert_equal(Token.new('ohhp', 81, 87), t.next)
    assert_equal(Token.new('qiiiiiiiir', 88, 106), t.next)
    assert_equal(Token.new('sjjjjt', 107, 117), t.next)
    assert_equal(Token.new('ukkv', 118, 124), t.next)
    assert_equal(Token.new('wlllllx', 125, 137), t.next)
    assert_equal(Token.new('ynnnnnnz', 138, 152), t.next)
    assert_equal(Token.new('aoooooooooob', 153, 175), t.next)
    assert_equal(Token.new('coekd', 176, 180), t.next)
    assert_equal(Token.new('eqf', 181, 185), t.next)
    assert_equal(Token.new('grrrh', 186, 194), t.next)
    assert_equal(Token.new('isssssj', 195, 207), t.next)
    assert_equal(Token.new('kttttl', 208, 218), t.next)
    assert_equal(Token.new('muuuuuuuuuun', 219, 241), t.next)
    assert_equal(Token.new('owp', 242, 246), t.next)
    assert_equal(Token.new('qyyyr', 247, 255), t.next)
    assert_equal(Token.new('szzzt', 256, 264), t.next)
    assert(! t.next())
  end
end if (/utf-8/i =~ Ferret.locale)
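
# StopFilter drops the listed words and records each gap in the following
# token's position increment: 'two' carries pos_inc 2 because 'one' was
# removed before it, and 'six' carries 3 after 'four' and 'five'.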
class StopFilterTest < Test::Unit::TestCase
  include Ferret::Analysis

  def test_stop_filter()
    words = ["one", "four", "five", "seven"]
    input = "one, two, three, four, five, six, seven, eight, nine, ten."
    t = StopFilter.new(AsciiLetterTokenizer.new(input), words)
    assert_equal(Token.new('two', 5, 8, 2), t.next)
    assert_equal(Token.new('three', 10, 15, 1), t.next)
    assert_equal(Token.new('six', 29, 32, 3), t.next)
    assert_equal(Token.new('eight', 41, 46, 2), t.next)
    assert_equal(Token.new('nine', 48, 52, 1), t.next)
    assert_equal(Token.new('ten', 54, 57, 1), t.next)
    assert(! t.next())
  end
end
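
# StemFilter reduces each token to its stem with the named stemming
# algorithm ('debates' -> 'debat'). Stemming is case-sensitive, so tokens
# that are not lower-cased first may come through unstemmed (DEBATED) or
# stemmed differently (DEBating -> DEBate).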
class StemFilterTest < Test::Unit::TestCase
  include Ferret::Analysis

  def test_stem_filter()
    input = "Debate Debates DEBATED DEBating Debater"
    t = StemFilter.new(AsciiLowerCaseFilter.new(AsciiLetterTokenizer.new(input)),
                       "english")
    assert_equal(Token.new("debat", 0, 6), t.next)
    assert_equal(Token.new("debat", 7, 14), t.next)
    assert_equal(Token.new("debat", 15, 22), t.next)
    assert_equal(Token.new("debat", 23, 31), t.next)
    assert_equal(Token.new("debat", 32, 39), t.next)
    assert(! t.next())
    t = StemFilter.new(AsciiLetterTokenizer.new(input), :english)
    assert_equal(Token.new("Debat", 0, 6), t.next)
    assert_equal(Token.new("Debat", 7, 14), t.next)
    assert_equal(Token.new("DEBATED", 15, 22), t.next)
    assert_equal(Token.new("DEBate", 23, 31), t.next)
    assert_equal(Token.new("Debat", 32, 39), t.next)
    if Ferret.locale and Ferret.locale.downcase.index("utf")
      input = "Dêbate dêbates DÊBATED DÊBATing dêbater"
      t = StemFilter.new(LowerCaseFilter.new(LetterTokenizer.new(input)), :english)
      assert_equal(Token.new("dêbate", 0, 7), t.next)
      assert_equal(Token.new("dêbate", 8, 16), t.next)
      assert_equal(Token.new("dêbate", 17, 25), t.next)
      assert_equal(Token.new("dêbate", 26, 35), t.next)
      assert_equal(Token.new("dêbater", 36, 44), t.next)
      t = StemFilter.new(LetterTokenizer.new(input), :english)
      assert_equal(Token.new("Dêbate", 0, 7), t.next)
      assert_equal(Token.new("dêbate", 8, 16), t.next)
      assert_equal(Token.new("DÊBATED", 17, 25), t.next)
      assert_equal(Token.new("DÊBATing", 26, 35), t.next)
      assert_equal(Token.new("dêbater", 36, 44), t.next)
      assert(! t.next())
    end
  end
end
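
# The remainder of the file demonstrates custom analysis classes.
# MyRegExpTokenizer builds a TokenStream on top of StringScanner; the
# token_re and normalize hooks let subclasses such as MyCSVTokenizer swap
# in a different token pattern and post-process each match.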
require 'strscan'

module Ferret::Analysis
  class MyRegExpTokenizer < TokenStream
    def initialize(input)
      @ss = StringScanner.new(input)
    end

    # Returns the next token in the stream, or nil at the end of the
    # stream.
    def next()
      if @ss.scan_until(token_re)
        term = @ss.matched
        term_end = @ss.pos
        term_start = term_end - term.size
      else
        return nil
      end
      return Token.new(normalize(term), term_start, term_end)
    end

    protected

    # Returns the regular expression used to find the next token.
    TOKEN_RE = /[[:alpha:]]+/
    def token_re
      TOKEN_RE
    end

    # Called on each token to normalize it before it is added to the
    # token stream. The default implementation does nothing. Subclasses
    # may use this to, e.g., lowercase tokens.
    def normalize(str) return str end
  end

  class MyCSVTokenizer < MyRegExpTokenizer
    protected

    # Returns the regular expression used to find the next token: here,
    # everything up to the next comma.
    TOKEN_RE = /[^,]+/
    def token_re
      TOKEN_RE
    end

    # Normalizes each token before it is added to the token stream;
    # this subclass upcases each CSV field.
    def normalize(str) return str.upcase end
  end
end
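
# MyCSVTokenizer splits the input on commas and upcases each field; note
# that padding spaces inside a field are preserved in the token text.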
class CustomTokenizerTest < Test::Unit::TestCase
  include Ferret::Analysis

  def test_custom_tokenizer()
    input = "First Field,2nd Field,  P a d d e d  F i e l d  "
    t = MyCSVTokenizer.new(input)
    assert_equal(Token.new("FIRST FIELD", 0, 11), t.next)
    assert_equal(Token.new("2ND FIELD", 12, 21), t.next)
    assert_equal(Token.new("  P A D D E D  F I E L D  ", 22, 48), t.next)
    assert(! t.next())
    t = AsciiLowerCaseFilter.new(MyCSVTokenizer.new(input))
    assert_equal(Token.new("first field", 0, 11), t.next)
    assert_equal(Token.new("2nd field", 12, 21), t.next)
    assert_equal(Token.new("  p a d d e d  f i e l d  ", 22, 48), t.next)
    assert(! t.next())
  end
end
module Ferret::Analysis
  class TokenFilter < TokenStream
    protected

    # Construct a token stream filtering the given input.
    def initialize(input)
      @input = input
    end
  end

  # Capitalizes the text of each token (first letter upper case, the
  # rest lower case).
  class CapitalizeFilter < TokenFilter
    def next()
      t = @input.next()
      return nil if (t.nil?)
      t.text = t.text.capitalize
      return t
    end
  end
end
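
# CapitalizeFilter (defined above) rewrites each token through
# String#capitalize. Chained with StemFilter below, "Capitalized" stems to
# "Capit" while the token keeps the original offsets 20..31.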
class CustomFilterTest < Test::Unit::TestCase
  include Ferret::Analysis

  def test_custom_filter()
    input = "This text SHOULD be capitalized ... I hope. :-S"
    t = CapitalizeFilter.new(AsciiLetterTokenizer.new(input))
    assert_equal(Token.new("This", 0, 4), t.next)
    assert_equal(Token.new("Text", 5, 9), t.next)
    assert_equal(Token.new("Should", 10, 16), t.next)
    assert_equal(Token.new("Be", 17, 19), t.next)
    assert_equal(Token.new("Capitalized", 20, 31), t.next)
    assert_equal(Token.new("I", 36, 37), t.next)
    assert_equal(Token.new("Hope", 38, 42), t.next)
    assert_equal(Token.new("S", 46, 47), t.next)
    assert(! t.next())
    t = StemFilter.new(CapitalizeFilter.new(AsciiLetterTokenizer.new(input)))
    assert_equal(Token.new("This", 0, 4), t.next)
    assert_equal(Token.new("Text", 5, 9), t.next)
    assert_equal(Token.new("Should", 10, 16), t.next)
    assert_equal(Token.new("Be", 17, 19), t.next)
    assert_equal(Token.new("Capit", 20, 31), t.next)
    assert_equal(Token.new("I", 36, 37), t.next)
    assert_equal(Token.new("Hope", 38, 42), t.next)
    assert_equal(Token.new("S", 46, 47), t.next)
    assert(! t.next())
  end
end