PageRenderTime 50ms CodeModel.GetById 25ms RepoModel.GetById 1ms app.codeStats 0ms

/scripts/cwords.rb

https://github.com/andersjacobsen/cwords
Ruby | 446 lines | 323 code | 68 blank | 55 comment | 43 complexity | 72369cada88d5d0d4381c6080679423a MD5 | raw file
  1. #!/usr/bin/env jruby
  2. srcdir = File.dirname(__FILE__)
  3. basedir = srcdir + "/../"
  4. libdir = basedir + '/lib/'
  5. $LOAD_PATH << libdir
  6. require 'wordRS-lib.rb'
  7. require 'rubygems'
  8. require 'progressbar'
  9. require 'optparse'
  10. require 'java'
  11. require 'pp'
  12. require libdir + 'ushuffle.jar'
  13. java_import 'UShuffle'
  14. #default options
  15. options = Hash.new
  16. options[:wordsize] = [7]
  17. options[:split_words]=nil
  18. options[:dbdir] = basedir + "db/"
  19. options[:scoring_scheme] = 'pval'
  20. options[:permutations]=50
  21. options[:seqshuffles]=100
  22. options[:rankfile]=nil
  23. options[:seqfile]=nil
  24. options[:report_words]=nil
  25. options[:plot_words]=nil
  26. options[:onlyanno]=nil
  27. options[:dump]=nil
  28. options[:testing]=nil
  29. options[:rank_all]=true
  30. options[:rank_inverse]=nil
  31. options[:rank_split_median]=nil
  32. options[:rank_abs]=nil
  33. options[:bg]=1 #mononucleotide shuffling
  34. options[:threads]=1
  35. $coptions = OptionParser.new do |opts|
  36. opts.banner = "Usage: cwords [options]"
  37. # analysis settings
  38. opts.on("-c", "--scoring_scheme ARG", "scoring scheme") {|o| options[:scoring_scheme] = o}
  39. opts.on("-p", "--permutations ARG", "number of list permutations") {|o| options[:permutations] = o.to_i}
  40. opts.on("-q", "--shuffles ARG", "number of sequence shuffles for sequence bias correction") {|o| options[:seqshuffles] = o.to_i}
  41. opts.on("-w", "--wordsize ARG", "wordsize") { |o| options[:wordsize] = o.split(",").map{|x| x.to_i}}
  42. opts.on("-b", "--bg ARG", "background nucleotide model") {|o| options[:bg] = o.to_i}
  43. opts.on("-t", "--threads ARG", "use multiple threads to parallelize computations") {|o| options[:threads] = o.to_i}
  44. opts.on( "--split_words WORDS", "split sequence set based on occurrences of WORDS") {|o| options[:split_words] = o.split(",")}
  45. opts.on( "--onlyanno", "only process annotated (i.e. mirbase) words") {|o| options[:onlyanno] = true}
  46. # rank control
  47. opts.on("-x", "--rank_split", "analyze positive and neg. values separetely") {|o| options[:rank_all] = false}
  48. opts.on("-m", "--rank_split_median", "split ranked list at median") {|o| options[:rank_split_median] = true}
  49. opts.on("-i", "--rank_inverse", "inverse all ranked lists") {|o| options[:rank_inverse] = true}
  50. opts.on("-a", "--rank_abs", "rank by absolute value") {|o| options[:rank_abs] = true}
  51. # files and directories
  52. opts.on("-r", "--rankfile ARG", "rank file") {|o| options[:rankfile] = o}
  53. opts.on("-s", "--seqfile ARG", "sequence file") {|o| options[:seqfile] = o}
  54. opts.on("-d", "--db ARG", "word database") { |o| options[:db] = o}
  55. # output control
  56. opts.on("-u", "--dump ARG", "dump top words") { |o| options[:dump] = o.to_i}
  57. opts.on( "--report_words ARG", "report on words (comma separated)") {|o| options[:report_words] = o.split(',')}
  58. opts.on( "--plot_words ARG", "only make plot files for words (comma separated)") {|o| options[:plot_words] = o.split(',')}
  59. opts.on( "--testing", "testing mode") {|o| options[:testing] = true}
  60. end
  61. def show_help(msg="", code=0, io=STDOUT)
  62. io.puts "#{msg}\n#{$coptions}"
  63. exit(code)
  64. end
  65. begin
  66. $coptions.parse!(ARGV)
  67. rescue OptionParser::ParseError => error
  68. puts error.message
  69. puts $coptions
  70. exit
  71. end
  72. # mandatory parameters
  73. [:rankfile].each{|p| show_help("option '#{p}' mandatory") if options[p].nil?}
  74. show_help("db or seqfile required") if !(options[:db] or options[:seqfile])
  75. show_help("scoring scheme must be one of: obs,bin,pval") if !(['obs','bin','pval','pval2'].include?(options[:scoring_scheme]))
  76. testing = options[:testing]
  77. # get filename without directory
  78. rankfilename = File.basename(options[:rankfile])
  79. # hard-coded
  80. output_top = 10
  81. prankdir = basedir + "/db/" + options[:db] + "/" if options[:db]
  82. annofile = basedir + "/resources/" + "word_annotation.tsv" #annotation
  83. tidfile = basedir + "/resources/" + "genemap.tsv"
  84. seqshuffles = 5000 # currently hardcoded for database
  85. sequences = nil
  86. nwords = options[:wordsize].map{|x| 4**x}.to_statarray.sum
  87. bg=options[:bg] # TODO, make option
  88. threads=options[:threads]
  89. nperms=options[:permutations]
  90. ###
  91. ### Main program
  92. ###
  93. puts ">> Analysis date: " + Time.now.to_s
  94. puts ">> Parameters"
  95. options.each{|k,v| puts sprintf("%-20s: %s",k,v) if !v.nil?}
  96. # read in mirbase seed family
  97. word_annotation = Hash.new("") # seq => family
  98. IO.readlines(annofile).each{|l| word_annotation[l.split("\t")[0]] = l.split("\t")[1]}
# read optional sequences
# FASTA parsing: IO.readlines with ">" as the record separator yields chunks
# of the form "id\nseq...\n>"; the slice [1..-1] drops the (normally empty)
# text preceding the first ">" marker.
if options[:seqfile]
  puts "\n>> reading sequences ..."
  sequences = Hash.new # id => seq
  IO.readlines(options[:seqfile],">")[1..-1].each do |s|
    ff = s.split("\n").map{|x| x.chomp}
    id = ff.shift # header line: everything after ">" (full header is the id)
    # join all fields but the last (the last is the ">" of the NEXT record,
    # except in the final chunk); lowercase and convert RNA 'u' to DNA 't'
    seq = ff[0..-2].join('').downcase.gsub('u','t')
    seq += ff[-1] if ff[-1] != '>' # check if last field is ">"
    # next if not nucleotide sequence, i.e. "unavailable"
    next if (seq.split('').uniq - ['a','c','g','t']).size > 0
    next if seq.size < 50 # lower bound
    # hash ensures sequence ids are unique
    sequences[id] = seq
  end
  # with user-supplied sequences, the requested shuffle count replaces the
  # hardcoded database default of 5000
  seqshuffles = options[:seqshuffles]
end
  116. # initialize word id hash, word sequence => word id (0..nwords-1)
  117. wids = Hash.new
  118. begin
  119. wi = 0
  120. options[:wordsize].each{|ws| ['a','g','c','t'].rep_perm(ws) {|seqa| wids[seqa.join('')]=wi ; wi+=1 }}
  121. end
###
### ID mapping
###
# pre-computed word database:
# map ids given in rankfile to internal ids
# remove rankfile entries with no match to internal id
# sequence file:
# take intersection of rank and sequence IDs
puts "\n>> Mapping and filtering IDs ..."
all = [] # [[internal_id, collapsed_rank_value], ...]
begin
  idmap = Hash.new # external id => internal (transcript) id
  internal_ids = nil
  if sequences
    # sequence mode: the sequence hash itself is the set of valid internal ids
    internal_ids = sequences
  else
    # database mode: genemap.tsv lines look like "tid ext1,ext2,..."
    IO.readlines(tidfile).each do |l|
      tid = l.split(" ")[0]
      l.split(" ")[1].split(",").each{|extid| idmap[extid] = tid}
    end
    internal_ids = idmap.invert # allowed internal ids
  end
  # per-key mutable default: each new tid starts with its own empty array
  allh = Hash.new {|h,k| h[k] = []}
  filtered = 0
  IO.readlines(options[:rankfile]).each do |l|
    l = l.split("\t")
    #test if internal id or mapable external id
    tid = (internal_ids.key?(l[0]) ? l[0] : idmap[l[0]])
    tid.nil? ? filtered += 1 : allh[tid] << l[1].to_f
  end
  # filter unknown sequences
  sequences.keys.each{|id| sequences.delete(id) if !allh.key?(id)} if sequences
  # we currently mean-collapse ids, we could allow mean/min/max collapsing ...
  all = allh.to_a.map{|tid,values| [tid,values.to_statarray.mean]}
  puts "removed #{filtered} invalid transcript ids" if filtered > 0
end
allorder = Hash.new # tid => index in all
all.each_with_index{|x,i| allorder[x[0]] = i}
###
### Word enumeration (optional)
###
# Count occurrences of every word in every sequence. Under the 'pval' scheme
# the stored value is instead the number of shuffled sequences whose word
# count reaches the observed count (an empirical p-value numerator).
# NOTE(review): `bg` presumably selects the k-let size preserved by UShuffle
# (1 = mononucleotide) — confirm against the ushuffle documentation.
wordscores = []
if sequences
  puts "\n>> Enumerating words in sequences"
  wordscores = Array.new(all.size) {Array.new(wids.size,0)} # {Java::short[wids.size].new}
  pbar = ProgressBar.new("progress",sequences.size)
  # threach (wordRS-lib): threaded each. Each thread writes only its own
  # seqidx row of wordscores, so rows are not contended.
  all.threach(threads) do |seqid,val|
    us = UShuffle.new
    seq=sequences[seqid]
    seqidx=allorder[seqid]
    pbar.inc
    seqsize = seq.size
    observed = Array.new(wids.size,0)
    # slide a window of each requested size over the sequence
    options[:wordsize].each{|ws| (0..seqsize-ws).each{|i| wid = wids[seq[i, ws]]; observed[wid] += 1 if not wid.nil?}}
    case options[:scoring_scheme]
    when "bin" then wordscores[seqidx] = observed.map{|x| x > 0 ? 1 : -1}
    when "obs" then wordscores[seqidx] = observed
    else
      # pval, compute distribution of expected word occurrences
      us.init_shuffle(seq,bg)
      seqshuffles.times do |si|
        seqsh = us.shuffle
        expected = Array.new(wids.size,0)
        options[:wordsize].each{|ws| (0..seqsize-ws).each{|i| wid = wids[seqsh[i, ws]]; expected[wid] += 1 if !wid.nil?}}
        # count shuffles at least as word-rich as the real sequence
        observed.each_with_index{|x,widx| wordscores[seqidx][widx] += 1 if expected[widx]>=x}
      end
    end
  end
  pbar.finish
end
  192. ###
  193. ### Generate list ranking
  194. ###
  195. analyze = []
  196. if options[:rank_split_median]
  197. # we should perhaps use an :inverse option,
  198. # reversing the two pos and neg lists
  199. med = all.map{|x| x[1]}.to_statarray.median
  200. pos_set = all.select{|x| x[1] > med}.sort{|a,b| b[1] <=> a[1]}
  201. neg_set = all.select{|x| x[1] <= med}.sort{|a,b| a[1] <=> b[1]}
  202. analyze = [[pos_set,'med_positive'],[neg_set,'med_negative']]
  203. elsif options[:rank_all] # do not split positive and negative range
  204. pos_set = all.sort{|a,b| b[1] <=> a[1]}
  205. neg_set = all.sort{|a,b| a[1] <=> b[1]}
  206. analyze = [[pos_set,'all_positive'],[neg_set,'all_negative']]
  207. elsif options[:rank_abs] # rank by absolute values
  208. pos_set = all.map{|x| [x[0],x[1].abs]}.sort{|a,b| b[1] <=> a[1]}
  209. neg_set = pos_set.reverse
  210. analyze = [[pos_set,'abs_positive'],[neg_set,'abs_negative']]
  211. else
  212. pos_set = all.select{|x| x[1] > 0}.sort{|a,b| b[1] <=> a[1]}
  213. neg_set = all.select{|x| x[1] < 0}.sort{|a,b| a[1] <=> b[1]}
  214. analyze = [[pos_set,'positive'],[neg_set,'negative']]
  215. end
  216. # inverse lists
  217. analyze.map!{|set,nm| [set.reverse,nm+".inv"]} if options[:rank_inverse]
  218. # split sequence set when --split option is given
  219. if options[:split_words]
  220. seqs_with_words = Hash.new
  221. options[:split_words].each do |split_word|
  222. begin
  223. IO.readlines(prankdir + split_word.downcase + ".rnk").each do |x|
  224. l = x.split("\t")
  225. seqs_with_words[l[0]] = 1 if l[1].to_i > 0
  226. end
  227. rescue
  228. warn "could not split sequences on word #{split_word}: " + $!
  229. end
  230. end
  231. analyze_split = []
  232. analyze.each do |set,nm|
  233. analyze_split += set.partition{|x| seqs_with_words.key?(x[0])}.zip([nm+".split+"+options[:split_words].join(","),nm+".split-"+options[:split_words].join(",")])
  234. end
  235. analyze = analyze_split
  236. end
  237. ###
  238. ### Correlation analysis
  239. ###
  240. puts "\n>> Analyzing sequence sets: " + analyze.map{|x| x[1]}.join(", ")
  241. analyze.each do |set,nm|
  242. ngenes = set.size
  243. puts "\n>> Analyzing #{nm} set ...\nnumber of genes: #{ngenes}"
  244. next if ngenes == 0
  245. perms = []
  246. report = []
  247. pfdrz = []
  248. report = Array.new(nwords)
  249. pfdrz = Array.new(nwords*nperms)
  250. franks = Hash.new # tid => index in set
  251. set.each_with_index{|x,i| franks[x[0]] = i}
  252. puts "permuting #{options[:permutations]} times ...\n"
  253. nperms.times{|i| perms << (0..set.size-1).to_a.shuffle}
  254. pbar = ProgressBar.new("progress",nwords)
  255. wids.to_a.threach(threads) do |word,wid|
  256. pbar.inc
  257. next if options[:onlyanno] and not word_annotation.key?(word) #only process annotated words
  258. next if options[:plot_words] and !options[:plot_words].include?(word)
  259. plotfile = File.new(rankfilename + ".#{word}.#{nm}.csv","w") if options[:plot_words]
  260. score = Array.new(ngenes,0) # scores ordered by fold change
  261. if sequences
  262. score = set.map{|x| wordscores[allorder[x[0]]][wid]}
  263. score.map!{|x| -Math.log((x+1.0)/(seqshuffles+1))} if options[:scoring_scheme] == 'pval'
  264. else # use precomputed word database
  265. wordcounts = IO.readlines(prankdir + word + ".rnk").map{|x| x.split("\t")}.select{|x| franks.key?(x[0])}
  266. case options[:scoring_scheme]
  267. when "bin" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = obs.to_i == 0 ? -1 : 1}
  268. when "obs" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = obs.to_f}
  269. when "pval" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = -Math.log((gte_obs.to_f+1)/(seqshuffles+1.0))}
  270. when "pval2" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = gte_obs.to_f}
  271. end
  272. end
  273. # center scores
  274. smean = score.to_statarray.mean
  275. score.map!{|x| x-smean}
  276. maxrs = 0
  277. leading_edge = 0
  278. rs = 0 #running sum
  279. rsa = [0]
  280. score.each_with_index do |x,i|
  281. rs += x
  282. rsa << rs
  283. if rs.abs > maxrs.abs
  284. maxrs = rs
  285. leading_edge = i+1
  286. end
  287. end
  288. plotfile.puts(([word+".score"] + [0] + score.map{|x| x.to_e(2)}).join(",")) if options[:plot_words]
  289. plotfile.puts(([word+".rs"] + rsa).join(",")) if options[:plot_words]
  290. # we are only interested in pos. maxrs scores,
  291. # because we currently analyze up/down regulated seperately
  292. next if maxrs <= 0
  293. pmaxrs_pos = StatArray.new
  294. perms.each_with_index do |psa,pidx|
  295. prs = 0
  296. prsa = [0]
  297. pmaxrs = 0
  298. psa.each do |i|
  299. prs += score[i]
  300. prsa << prs
  301. pmaxrs = prs if prs.abs > pmaxrs.abs
  302. end
  303. # the permuted scores are approx. symmetric around 0
  304. pmaxrs_pos << pmaxrs.abs
  305. plotfile.puts(([word+".rs."+pidx.to_s] + prsa).join(",")) if options[:plot_words]
  306. end
  307. pmean = pmaxrs_pos.mean
  308. pstd = pmaxrs_pos.stddev
  309. #Because the word zscore distr. can be quite different,
  310. # we compute the deviation from the mean of the absolute dist.
  311. # The permuted maxRS should be normally distr. (sum of random numbers)
  312. poffset = wid*nperms
  313. pmaxrs_pos.map{|x| (x-pmean)/pstd}.each_with_index{|v,j| pfdrz[poffset+j] = v}
  314. zsc = (maxrs-pmean)/pstd
  315. plotfile.close if options[:plot_words]
  316. report[wid] = [wid,zsc,nil,leading_edge]
  317. end # wordsize
  318. pbar.finish
  319. ###
  320. ### FDR
  321. ###
  322. begin
  323. puts "\n>> Estimating FDR ..."
  324. report.compact! # remove nil entries
  325. pfdrz.compact!
  326. fdrrank = pfdrz.map{|x| [x,nil]} # [zscore,word_report_index]
  327. report.each_with_index{|x,idx| fdrrank << [x[1],idx]}
  328. fdrrank = fdrrank.sort_by{|x| x[0].to_f}.reverse # sort high zscore to low zscore
  329. nfp, ntp = pfdrz.size.to_f, report.size.to_f
  330. ifp, itp = 0, 0
  331. fdrrank.each do |zsc,idx|
  332. if idx.nil?
  333. ifp += 1
  334. else
  335. itp += 1
  336. fpr, tpr = ifp/nfp, itp/ntp
  337. report[idx][2] = fpr/tpr
  338. end
  339. end
  340. cutoff_fdr = [0.001,0.005,0.01,0.05,0.1,0.15,0.2,0.25,0.5]
  341. puts ""
  342. puts (["FDR <="] + cutoff_fdr.map{|x| x.to_s(3)} + ["total"]).join("\t")
  343. puts (["count"] + cutoff_fdr.map{|x| report.select{|y| y[2] <= x}.size} + [report.size]).join("\t")
  344. end
  345. ###
  346. ### Output summarization
  347. ###
  348. wids2 = wids.invert
  349. report = report.sort_by{|x| x[1]}.reverse
  350. puts "\nTop #{output_top} words"
  351. puts ['rank','word','z-score','fdr','ledge','annotation'].map{|x| sprintf("%-10s",x)}.join('')
  352. report[0,output_top].each_with_index do |r,i|
  353. wd = wids2[r[0]]
  354. s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_s,word_annotation[wd]]
  355. puts s.map{|x| sprintf("%-10s",x)}.join('')
  356. end
  357. if options[:report_words]
  358. puts "......"
  359. report.each_with_index do |r,i|
  360. if options[:report_words].include?(r[0])
  361. wd = wids2[r[0]]
  362. s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_s,word_annotation[wd]]
  363. puts s.map{|x| sprintf("%-10s",x)}.join('')
  364. end
  365. end
  366. end
  367. if options[:dump]
  368. fname = rankfilename + ".#{nm}." + options[:dump].to_s
  369. of = File.new(fname,"w")
  370. of.puts ['rank','word','z-score','fdr','ledge','annotation'].map{|x| sprintf("%-10s",x)}.join('')
  371. puts "dumping top #{options[:dump]} words in file: #{fname}"
  372. report[0..options[:dump]-1].each_with_index do |r,i|
  373. wd = wids2[r[0]]
  374. s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_s,word_annotation[wd]]
  375. of.puts s.map{|x| sprintf("%-10s",x)}.join('')
  376. end
  377. end
  378. end