/script/teachme/extract1.rb
Ruby | 52 lines | 39 code | 5 blank | 8 comment | 10 complexity | aa2aa898d3ca9d7739e978a91174bafc MD5 | raw file
#!/usr/bin/env ruby
#
# Imports Spanish => English vocabulary from a glossary text file into
# SpanishToEnglish records, echoing every imported entry to stdout.
#
# A line is treated as an entry when it contains `=headword,=` or
# `=headword=,` followed by the definition text. Verbs and headwords that
# contain dash runs are skipped.

require File.dirname(__FILE__) + '/../../config/boot'

# Force the production environment before the Rails environment is loaded.
enviro = 'production'
ENV["RAILS_ENV"] = enviro
RAILS_ENV.replace(enviro) if defined?(RAILS_ENV)
require RAILS_ROOT + '/config/environment'

# Gender abbreviation inside a definition: a standalone "m" or "f" followed by
# a literal ".,". The dot is escaped and the letter anchored on a word boundary
# so ordinary words such as "time, hour" are not mistaken for a gender marker.
GENDER_MARKER = /\b([mf])\.,\s?/

# Abbreviated feminine endings and how to spell them out:
# [pattern, masculine ending, feminine ending].
FEMININE_EXPANSIONS = [
  [/^(.*)o,\s?-?-a$/,   'o',  'a'],   # o  => -a
  [/^(.*)os,\s?-as$/,   'os', 'as'],  # os => -as
  [/^(.*)or,\s?-ra$/,   'or', 'ora'], # or => -ra
  [/^(.*)es,\s?-esa$/,  'es', 'esa']  # es => -esa
].freeze

# Splits a raw line into [headword, definition]; returns nil when the line
# does not match the entry pattern.
def split_entry(line)
  return nil if line !~ /=(.*?)(,=|=,)\s?(.*)/m
  [$1, $3]
end

# Cleans up the definition text and extracts the gender marker.
# Returns [definition, gender]; gender is 'm', 'f', or '-' when none is found.
def clean_definition(raw)
  defn = raw.gsub(/_/, '').gsub(/\.$/, '')
  # Rewrite =word= references inside the definition as [word].
  defn = defn.gsub(/=(\w.+)=/) { "[#{$1}]" }

  gender = '-'
  if defn =~ GENDER_MARKER
    gender = $1
    defn = defn.gsub(GENDER_MARKER, '')
  end
  [defn, gender]
end

# True when the entry should not be imported.
def skip_headword?(sp, defn)
  # Verbs followed by a parenthesised note, e.g. "volar, (ue)".
  return true if sp =~ /(ar|er|ir|se),\s*\(/
  # Infinitive-looking headword whose definition contains "to ".
  return true if sp =~ /(ar|er|ir|se)$/ && defn =~ /to /
  # Headwords starting with a dash, or containing a run of dashes anywhere
  # (although these may be useful to store as phrases/idioms).
  return true if sp =~ /^-+/ || sp =~ /--+/
  false
end

# Expands an abbreviated feminine form, e.g. "xo, -a" => "xo,xa".
def expand_feminine(sp)
  FEMININE_EXPANSIONS.each do |pattern, masc, fem|
    sp = sp.gsub(pattern) { "#{$1}#{masc},#{$1}#{fem}" }
  end
  sp
end

#path = File.join(File.dirname(__FILE__), 'harvest1B.txt')
path = File.join(File.dirname(__FILE__), 'latin1.txt')

# Stream the file line by line instead of loading it all into memory.
IO.foreach(path) do |line|
  sp, raw_defn = split_entry(line.chomp)
  next unless sp

  defn, gender = clean_definition(raw_defn)
  next if skip_headword?(sp, defn)

  sp = expand_feminine(sp)

  puts "#{sp} || #{defn} || #{gender}"

  s2e = SpanishToEnglish.new
  s2e.spanish_word = sp
  s2e.english_word = defn
  s2e.gender = gender
  s2e.source = 'drupal s2e'
  # NOTE(review): assumes ActiveRecord-style #save, which returns false when
  # the record was not persisted - confirm against the model. Report the
  # failure and carry on rather than dropping the entry silently.
  warn "failed to save entry: #{sp} || #{defn}" unless s2e.save
end