PageRenderTime 50ms CodeModel.GetById 25ms RepoModel.GetById 1ms app.codeStats 0ms

/scripts/cwords.rb

https://github.com/andersjacobsen/cwords
Ruby | 446 lines | 323 code | 68 blank | 55 comment | 43 complexity | 72369cada88d5d0d4381c6080679423a MD5 | raw file
  1. #!/usr/bin/env jruby
  2. srcdir = File.dirname(__FILE__)
  3. basedir = srcdir + "/../"
  4. libdir = basedir + '/lib/'
  5. $LOAD_PATH << libdir
  6. require 'wordRS-lib.rb'
  7. require 'rubygems'
  8. require 'progressbar'
  9. require 'optparse'
  10. require 'java'
  11. require 'pp'
  12. require libdir + 'ushuffle.jar'
  13. java_import 'UShuffle'
  14. #default options
  15. options = Hash.new
  16. options[:wordsize] = [7]
  17. options[:split_words]=nil
  18. options[:dbdir] = basedir + "db/"
  19. options[:scoring_scheme] = 'pval'
  20. options[:permutations]=50
  21. options[:seqshuffles]=100
  22. options[:rankfile]=nil
  23. options[:seqfile]=nil
  24. options[:report_words]=nil
  25. options[:plot_words]=nil
  26. options[:onlyanno]=nil
  27. options[:dump]=nil
  28. options[:testing]=nil
  29. options[:rank_all]=true
  30. options[:rank_inverse]=nil
  31. options[:rank_split_median]=nil
  32. options[:rank_abs]=nil
  33. options[:bg]=1 #mononucleotide shuffling
  34. options[:threads]=1
  35. $coptions = OptionParser.new do |opts|
  36. opts.banner = "Usage: cwords [options]"
  37. # analysis settings
  38. opts.on("-c", "--scoring_scheme ARG", "scoring scheme") {|o| options[:scoring_scheme] = o}
  39. opts.on("-p", "--permutations ARG", "number of list permutations") {|o| options[:permutations] = o.to_i}
  40. opts.on("-q", "--shuffles ARG", "number of sequence shuffles for sequence bias correction") {|o| options[:seqshuffles] = o.to_i}
  41. opts.on("-w", "--wordsize ARG", "wordsize") { |o| options[:wordsize] = o.split(",").map{|x| x.to_i}}
  42. opts.on("-b", "--bg ARG", "background nucleotide model") {|o| options[:bg] = o.to_i}
  43. opts.on("-t", "--threads ARG", "use multiple threads to parallelize computations") {|o| options[:threads] = o.to_i}
  44. opts.on( "--split_words WORDS", "split sequence set based on occurrences of WORDS") {|o| options[:split_words] = o.split(",")}
  45. opts.on( "--onlyanno", "only process annotated (i.e. mirbase) words") {|o| options[:onlyanno] = true}
  46. # rank control
  47. opts.on("-x", "--rank_split", "analyze positive and neg. values separetely") {|o| options[:rank_all] = false}
  48. opts.on("-m", "--rank_split_median", "split ranked list at median") {|o| options[:rank_split_median] = true}
  49. opts.on("-i", "--rank_inverse", "inverse all ranked lists") {|o| options[:rank_inverse] = true}
  50. opts.on("-a", "--rank_abs", "rank by absolute value") {|o| options[:rank_abs] = true}
  51. # files and directories
  52. opts.on("-r", "--rankfile ARG", "rank file") {|o| options[:rankfile] = o}
  53. opts.on("-s", "--seqfile ARG", "sequence file") {|o| options[:seqfile] = o}
  54. opts.on("-d", "--db ARG", "word database") { |o| options[:db] = o}
  55. # output control
  56. opts.on("-u", "--dump ARG", "dump top words") { |o| options[:dump] = o.to_i}
  57. opts.on( "--report_words ARG", "report on words (comma separated)") {|o| options[:report_words] = o.split(',')}
  58. opts.on( "--plot_words ARG", "only make plot files for words (comma separated)") {|o| options[:plot_words] = o.split(',')}
  59. opts.on( "--testing", "testing mode") {|o| options[:testing] = true}
  60. end
  61. def show_help(msg="", code=0, io=STDOUT)
  62. io.puts "#{msg}\n#{$coptions}"
  63. exit(code)
  64. end
  65. begin
  66. $coptions.parse!(ARGV)
  67. rescue OptionParser::ParseError => error
  68. puts error.message
  69. puts $coptions
  70. exit
  71. end
  72. # mandatory parameters
  73. [:rankfile].each{|p| show_help("option '#{p}' mandatory") if options[p].nil?}
  74. show_help("db or seqfile required") if !(options[:db] or options[:seqfile])
  75. show_help("scoring scheme must be one of: obs,bin,pval") if !(['obs','bin','pval','pval2'].include?(options[:scoring_scheme]))
  76. testing = options[:testing]
  77. # get filename without directory
  78. rankfilename = File.basename(options[:rankfile])
  79. # hard-coded
  80. output_top = 10
  81. prankdir = basedir + "/db/" + options[:db] + "/" if options[:db]
  82. annofile = basedir + "/resources/" + "word_annotation.tsv" #annotation
  83. tidfile = basedir + "/resources/" + "genemap.tsv"
  84. seqshuffles = 5000 # currently hardcoded for database
  85. sequences = nil
  86. nwords = options[:wordsize].map{|x| 4**x}.to_statarray.sum
  87. bg=options[:bg] # TODO, make option
  88. threads=options[:threads]
  89. nperms=options[:permutations]
  90. ###
  91. ### Main program
  92. ###
  93. puts ">> Analysis date: " + Time.now.to_s
  94. puts ">> Parameters"
  95. options.each{|k,v| puts sprintf("%-20s: %s",k,v) if !v.nil?}
  96. # read in mirbase seed family
  97. word_annotation = Hash.new("") # seq => family
  98. IO.readlines(annofile).each{|l| word_annotation[l.split("\t")[0]] = l.split("\t")[1]}
# read optional sequences
# FASTA parsing: IO.readlines with ">" as the record separator yields chunks
# of the form "id\nseq...\n>"; the slice [1..-1] drops the (normally empty)
# text preceding the first ">" marker.
if options[:seqfile]
  puts "\n>> reading sequences ..."
  sequences = Hash.new # id => seq
  IO.readlines(options[:seqfile],">")[1..-1].each do |s|
    ff = s.split("\n").map{|x| x.chomp}
    id = ff.shift # header line: everything after ">" (full header is the id)
    # join all fields but the last (the last is the ">" of the NEXT record,
    # except in the final chunk); lowercase and convert RNA 'u' to DNA 't'
    seq = ff[0..-2].join('').downcase.gsub('u','t')
    seq += ff[-1] if ff[-1] != '>' # check if last field is ">"
    # next if not nucleotide sequence, i.e. "unavailable"
    next if (seq.split('').uniq - ['a','c','g','t']).size > 0
    next if seq.size < 50 # lower bound
    # hash ensures sequence ids are unique
    sequences[id] = seq
  end
  # with user-supplied sequences, the requested shuffle count replaces the
  # hardcoded database default of 5000
  seqshuffles = options[:seqshuffles]
end
  116. # initialize word id hash, word sequence => word id (0..nwords-1)
  117. wids = Hash.new
  118. begin
  119. wi = 0
  120. options[:wordsize].each{|ws| ['a','g','c','t'].rep_perm(ws) {|seqa| wids[seqa.join('')]=wi ; wi+=1 }}
  121. end
###
### ID mapping
###
# pre-computed word database:
# map ids given in rankfile to internal ids
# remove rankfile entries with no match to internal id
# sequence file:
# take intersection of rank and sequence IDs
puts "\n>> Mapping and filtering IDs ..."
all = [] # [[internal_id, collapsed_rank_value], ...]
begin
  idmap = Hash.new # external id => internal (transcript) id
  internal_ids = nil
  if sequences
    # sequence mode: the sequence hash itself is the set of valid internal ids
    internal_ids = sequences
  else
    # database mode: genemap.tsv lines look like "tid ext1,ext2,..."
    IO.readlines(tidfile).each do |l|
      tid = l.split(" ")[0]
      l.split(" ")[1].split(",").each{|extid| idmap[extid] = tid}
    end
    internal_ids = idmap.invert # allowed internal ids
  end
  # per-key mutable default: each new tid starts with its own empty array
  allh = Hash.new {|h,k| h[k] = []}
  filtered = 0
  IO.readlines(options[:rankfile]).each do |l|
    l = l.split("\t")
    #test if internal id or mapable external id
    tid = (internal_ids.key?(l[0]) ? l[0] : idmap[l[0]])
    tid.nil? ? filtered += 1 : allh[tid] << l[1].to_f
  end
  # filter unknown sequences
  sequences.keys.each{|id| sequences.delete(id) if !allh.key?(id)} if sequences
  # we currently mean-collapse ids, we could allow mean/min/max collapsing ...
  all = allh.to_a.map{|tid,values| [tid,values.to_statarray.mean]}
  puts "removed #{filtered} invalid transcript ids" if filtered > 0
end
allorder = Hash.new # tid => index in all
all.each_with_index{|x,i| allorder[x[0]] = i}
###
### Word enumeration (optional)
###
# Count occurrences of every word in every sequence. Under the 'pval' scheme
# the stored value is instead the number of shuffled sequences whose word
# count reaches the observed count (an empirical p-value numerator).
# NOTE(review): `bg` presumably selects the k-let size preserved by UShuffle
# (1 = mononucleotide) — confirm against the ushuffle documentation.
wordscores = []
if sequences
  puts "\n>> Enumerating words in sequences"
  wordscores = Array.new(all.size) {Array.new(wids.size,0)} # {Java::short[wids.size].new}
  pbar = ProgressBar.new("progress",sequences.size)
  # threach (wordRS-lib): threaded each. Each thread writes only its own
  # seqidx row of wordscores, so rows are not contended.
  all.threach(threads) do |seqid,val|
    us = UShuffle.new
    seq=sequences[seqid]
    seqidx=allorder[seqid]
    pbar.inc
    seqsize = seq.size
    observed = Array.new(wids.size,0)
    # slide a window of each requested size over the sequence
    options[:wordsize].each{|ws| (0..seqsize-ws).each{|i| wid = wids[seq[i, ws]]; observed[wid] += 1 if not wid.nil?}}
    case options[:scoring_scheme]
    when "bin" then wordscores[seqidx] = observed.map{|x| x > 0 ? 1 : -1}
    when "obs" then wordscores[seqidx] = observed
    else
      # pval, compute distribution of expected word occurrences
      us.init_shuffle(seq,bg)
      seqshuffles.times do |si|
        seqsh = us.shuffle
        expected = Array.new(wids.size,0)
        options[:wordsize].each{|ws| (0..seqsize-ws).each{|i| wid = wids[seqsh[i, ws]]; expected[wid] += 1 if !wid.nil?}}
        # count shuffles at least as word-rich as the real sequence
        observed.each_with_index{|x,widx| wordscores[seqidx][widx] += 1 if expected[widx]>=x}
      end
    end
  end
  pbar.finish
end
  192. ###
  193. ### Generate list ranking
  194. ###
  195. analyze = []
  196. if options[:rank_split_median]
  197. # we should perhaps use an :inverse option,
  198. # reversing the two pos and neg lists
  199. med = all.map{|x| x[1]}.to_statarray.median
  200. pos_set = all.select{|x| x[1] > med}.sort{|a,b| b[1] <=> a[1]}
  201. neg_set = all.select{|x| x[1] <= med}.sort{|a,b| a[1] <=> b[1]}
  202. analyze = [[pos_set,'med_positive'],[neg_set,'med_negative']]
  203. elsif options[:rank_all] # do not split positive and negative range
  204. pos_set = all.sort{|a,b| b[1] <=> a[1]}
  205. neg_set = all.sort{|a,b| a[1] <=> b[1]}
  206. analyze = [[pos_set,'all_positive'],[neg_set,'all_negative']]
  207. elsif options[:rank_abs] # rank by absolute values
  208. pos_set = all.map{|x| [x[0],x[1].abs]}.sort{|a,b| b[1] <=> a[1]}
  209. neg_set = pos_set.reverse
  210. analyze = [[pos_set,'abs_positive'],[neg_set,'abs_negative']]
  211. else
  212. pos_set = all.select{|x| x[1] > 0}.sort{|a,b| b[1] <=> a[1]}
  213. neg_set = all.select{|x| x[1] < 0}.sort{|a,b| a[1] <=> b[1]}
  214. analyze = [[pos_set,'positive'],[neg_set,'negative']]
  215. end
  216. # inverse lists
  217. analyze.map!{|set,nm| [set.reverse,nm+".inv"]} if options[:rank_inverse]
  218. # split sequence set when --split option is given
  219. if options[:split_words]
  220. seqs_with_words = Hash.new
  221. options[:split_words].each do |split_word|
  222. begin
  223. IO.readlines(prankdir + split_word.downcase + ".rnk").each do |x|
  224. l = x.split("\t")
  225. seqs_with_words[l[0]] = 1 if l[1].to_i > 0
  226. end
  227. rescue
  228. warn "could not split sequences on word #{split_word}: " + $!
  229. end
  230. end
  231. analyze_split = []
  232. analyze.each do |set,nm|
  233. analyze_split += set.partition{|x| seqs_with_words.key?(x[0])}.zip([nm+".split+"+options[:split_words].join(","),nm+".split-"+options[:split_words].join(",")])
  234. end
  235. analyze = analyze_split
  236. end
  237. ###
  238. ### Correlation analysis
  239. ###
  240. puts "\n>> Analyzing sequence sets: " + analyze.map{|x| x[1]}.join(", ")
  241. analyze.each do |set,nm|
  242. ngenes = set.size
  243. puts "\n>> Analyzing #{nm} set ...\nnumber of genes: #{ngenes}"
  244. next if ngenes == 0
  245. perms = []
  246. report = []
  247. pfdrz = []
  248. report = Array.new(nwords)
  249. pfdrz = Array.new(nwords*nperms)
  250. franks = Hash.new # tid => index in set
  251. set.each_with_index{|x,i| franks[x[0]] = i}
  252. puts "permuting #{options[:permutations]} times ...\n"
  253. nperms.times{|i| perms << (0..set.size-1).to_a.shuffle}
  254. pbar = ProgressBar.new("progress",nwords)
  255. wids.to_a.threach(threads) do |word,wid|
  256. pbar.inc
  257. next if options[:onlyanno] and not word_annotation.key?(word) #only process annotated words
  258. next if options[:plot_words] and !options[:plot_words].include?(word)
  259. plotfile = File.new(rankfilename + ".#{word}.#{nm}.csv","w") if options[:plot_words]
  260. score = Array.new(ngenes,0) # scores ordered by fold change
  261. if sequences
  262. score = set.map{|x| wordscores[allorder[x[0]]][wid]}
  263. score.map!{|x| -Math.log((x+1.0)/(seqshuffles+1))} if options[:scoring_scheme] == 'pval'
  264. else # use precomputed word database
  265. wordcounts = IO.readlines(prankdir + word + ".rnk").map{|x| x.split("\t")}.select{|x| franks.key?(x[0])}
  266. case options[:scoring_scheme]
  267. when "bin" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = obs.to_i == 0 ? -1 : 1}
  268. when "obs" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = obs.to_f}
  269. when "pval" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = -Math.log((gte_obs.to_f+1)/(seqshuffles+1.0))}
  270. when "pval2" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = gte_obs.to_f}
  271. end
  272. end
  273. # center scores
  274. smean = score.to_statarray.mean
  275. score.map!{|x| x-smean}
  276. maxrs = 0
  277. leading_edge = 0
  278. rs = 0 #running sum
  279. rsa = [0]
  280. score.each_with_index do |x,i|
  281. rs += x
  282. rsa << rs
  283. if rs.abs > maxrs.abs
  284. maxrs = rs
  285. leading_edge = i+1
  286. end
  287. end
  288. plotfile.puts(([word+".score"] + [0] + score.map{|x| x.to_e(2)}).join(",")) if options[:plot_words]
  289. plotfile.puts(([word+".rs"] + rsa).join(",")) if options[:plot_words]
  290. # we are only interested in pos. maxrs scores,
  291. # because we currently analyze up/down regulated seperately
  292. next if maxrs <= 0
  293. pmaxrs_pos = StatArray.new
  294. perms.each_with_index do |psa,pidx|
  295. prs = 0
  296. prsa = [0]
  297. pmaxrs = 0
  298. psa.each do |i|
  299. prs += score[i]
  300. prsa << prs
  301. pmaxrs = prs if prs.abs > pmaxrs.abs
  302. end
  303. # the permuted scores are approx. symmetric around 0
  304. pmaxrs_pos << pmaxrs.abs
  305. plotfile.puts(([word+".rs."+pidx.to_s] + prsa).join(",")) if options[:plot_words]
  306. end
  307. pmean = pmaxrs_pos.mean
  308. pstd = pmaxrs_pos.stddev
  309. #Because the word zscore distr. can be quite different,
  310. # we compute the deviation from the mean of the absolute dist.
  311. # The permuted maxRS should be normally distr. (sum of random numbers)
  312. poffset = wid*nperms
  313. pmaxrs_pos.map{|x| (x-pmean)/pstd}.each_with_index{|v,j| pfdrz[poffset+j] = v}
  314. zsc = (maxrs-pmean)/pstd
  315. plotfile.close if options[:plot_words]
  316. report[wid] = [wid,zsc,nil,leading_edge]
  317. end # wordsize
  318. pbar.finish
  319. ###
  320. ### FDR
  321. ###
  322. begin
  323. puts "\n>> Estimating FDR ..."
  324. report.compact! # remove nil entries
  325. pfdrz.compact!
  326. fdrrank = pfdrz.map{|x| [x,nil]} # [zscore,word_report_index]
  327. report.each_with_index{|x,idx| fdrrank << [x[1],idx]}
  328. fdrrank = fdrrank.sort_by{|x| x[0].to_f}.reverse # sort high zscore to low zscore
  329. nfp, ntp = pfdrz.size.to_f, report.size.to_f
  330. ifp, itp = 0, 0
  331. fdrrank.each do |zsc,idx|
  332. if idx.nil?
  333. ifp += 1
  334. else
  335. itp += 1
  336. fpr, tpr = ifp/nfp, itp/ntp
  337. report[idx][2] = fpr/tpr
  338. end
  339. end
  340. cutoff_fdr = [0.001,0.005,0.01,0.05,0.1,0.15,0.2,0.25,0.5]
  341. puts ""
  342. puts (["FDR <="] + cutoff_fdr.map{|x| x.to_s(3)} + ["total"]).join("\t")
  343. puts (["count"] + cutoff_fdr.map{|x| report.select{|y| y[2] <= x}.size} + [report.size]).join("\t")
  344. end
  345. ###
  346. ### Output summarization
  347. ###
  348. wids2 = wids.invert
  349. report = report.sort_by{|x| x[1]}.reverse
  350. puts "\nTop #{output_top} words"
  351. puts ['rank','word','z-score','fdr','ledge','annotation'].map{|x| sprintf("%-10s",x)}.join('')
  352. report[0,output_top].each_with_index do |r,i|
  353. wd = wids2[r[0]]
  354. s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_s,word_annotation[wd]]
  355. puts s.map{|x| sprintf("%-10s",x)}.join('')
  356. end
  357. if options[:report_words]
  358. puts "......"
  359. report.each_with_index do |r,i|
  360. if options[:report_words].include?(r[0])
  361. wd = wids2[r[0]]
  362. s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_s,word_annotation[wd]]
  363. puts s.map{|x| sprintf("%-10s",x)}.join('')
  364. end
  365. end
  366. end
  367. if options[:dump]
  368. fname = rankfilename + ".#{nm}." + options[:dump].to_s
  369. of = File.new(fname,"w")
  370. of.puts ['rank','word','z-score','fdr','ledge','annotation'].map{|x| sprintf("%-10s",x)}.join('')
  371. puts "dumping top #{options[:dump]} words in file: #{fname}"
  372. report[0..options[:dump]-1].each_with_index do |r,i|
  373. wd = wids2[r[0]]
  374. s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_s,word_annotation[wd]]
  375. of.puts s.map{|x| sprintf("%-10s",x)}.join('')
  376. end
  377. end
  378. end