PageRenderTime 26ms CodeModel.GetById 0ms RepoModel.GetById 0ms app.codeStats 0ms

/vendor/bundle/ruby/1.9.1/gems/chronic-0.6.7/lib/chronic/numerizer.rb

https://bitbucket.org/mulligan/extractext
Ruby | 121 lines | 93 code | 21 blank | 7 comment | 3 complexity | f3cb149564f3326854d1c8137b2572c1 MD5 | raw file
Possible License(s): Apache-2.0, MIT, GPL-3.0, GPL-2.0, BSD-3-Clause, MPL-2.0-no-copyleft-exception, BSD-2-Clause, JSON
  1. require 'strscan'
  2. module Chronic
  3. class Numerizer
  4. DIRECT_NUMS = [
  5. ['eleven', '11'],
  6. ['twelve', '12'],
  7. ['thirteen', '13'],
  8. ['fourteen', '14'],
  9. ['fifteen', '15'],
  10. ['sixteen', '16'],
  11. ['seventeen', '17'],
  12. ['eighteen', '18'],
  13. ['nineteen', '19'],
  14. ['ninteen', '19'], # Common mis-spelling
  15. ['zero', '0'],
  16. ['one', '1'],
  17. ['two', '2'],
  18. ['three', '3'],
  19. ['four(\W|$)', '4\1'], # The weird regex is so that it matches four but not fourty
  20. ['five', '5'],
  21. ['six(\W|$)', '6\1'],
  22. ['seven(\W|$)', '7\1'],
  23. ['eight(\W|$)', '8\1'],
  24. ['nine(\W|$)', '9\1'],
  25. ['ten', '10'],
  26. ['\ba[\b^$]', '1'] # doesn't make sense for an 'a' at the end to be a 1
  27. ]
  28. ORDINALS = [
  29. ['first', '1'],
  30. ['third', '3'],
  31. ['fourth', '4'],
  32. ['fifth', '5'],
  33. ['sixth', '6'],
  34. ['seventh', '7'],
  35. ['eighth', '8'],
  36. ['ninth', '9'],
  37. ['tenth', '10']
  38. ]
  39. TEN_PREFIXES = [
  40. ['twenty', 20],
  41. ['thirty', 30],
  42. ['forty', 40],
  43. ['fourty', 40], # Common mis-spelling
  44. ['fifty', 50],
  45. ['sixty', 60],
  46. ['seventy', 70],
  47. ['eighty', 80],
  48. ['ninety', 90]
  49. ]
  50. BIG_PREFIXES = [
  51. ['hundred', 100],
  52. ['thousand', 1000],
  53. ['million', 1_000_000],
  54. ['billion', 1_000_000_000],
  55. ['trillion', 1_000_000_000_000],
  56. ]
  57. def self.numerize(string)
  58. string = string.dup
  59. # preprocess
  60. string.gsub!(/ +|([^\d])-([^\d])/, '\1 \2') # will mutilate hyphenated-words but shouldn't matter for date extraction
  61. string.gsub!(/a half/, 'haAlf') # take the 'a' out so it doesn't turn into a 1, save the half for the end
  62. # easy/direct replacements
  63. DIRECT_NUMS.each do |dn|
  64. string.gsub!(/#{dn[0]}/i, '<num>' + dn[1])
  65. end
  66. ORDINALS.each do |on|
  67. string.gsub!(/#{on[0]}/i, '<num>' + on[1] + on[0][-2, 2])
  68. end
  69. # ten, twenty, etc.
  70. TEN_PREFIXES.each do |tp|
  71. string.gsub!(/(?:#{tp[0]}) *<num>(\d(?=[^\d]|$))*/i) { '<num>' + (tp[1] + $1.to_i).to_s }
  72. end
  73. TEN_PREFIXES.each do |tp|
  74. string.gsub!(/#{tp[0]}/i) { '<num>' + tp[1].to_s }
  75. end
  76. # hundreds, thousands, millions, etc.
  77. BIG_PREFIXES.each do |bp|
  78. string.gsub!(/(?:<num>)?(\d*) *#{bp[0]}/i) { '<num>' + (bp[1] * $1.to_i).to_s}
  79. andition(string)
  80. end
  81. # fractional addition
  82. # I'm not combining this with the previous block as using float addition complicates the strings
  83. # (with extraneous .0's and such )
  84. string.gsub!(/(\d+)(?: | and |-)*haAlf/i) { ($1.to_f + 0.5).to_s }
  85. string.gsub(/<num>/, '')
  86. end
  87. class << self
  88. private
  89. def andition(string)
  90. sc = StringScanner.new(string)
  91. while sc.scan_until(/<num>(\d+)( | and )<num>(\d+)(?=[^\w]|$)/i)
  92. if sc[2] =~ /and/ || sc[1].size > sc[3].size
  93. string[(sc.pos - sc.matched_size)..(sc.pos-1)] = '<num>' + (sc[1].to_i + sc[3].to_i).to_s
  94. sc.reset
  95. end
  96. end
  97. end
  98. end
  99. end
  100. end