PageRenderTime 21ms CodeModel.GetById 13ms app.highlight 5ms RepoModel.GetById 2ms app.codeStats 0ms

/tools/discreteWavelet/execute_dwt_var_perFeature.pl

https://bitbucket.org/cistrome/cistrome-harvard/
Perl | 199 lines | 134 code | 34 blank | 31 comment | 15 complexity | 582bf9025cc66d3b4454a11ca0abf2a6 MD5 | raw file
  1#!/usr/bin/perl -w
  2# Author: Erika Kvikstad
  3
  4use warnings;
  5use IO::Handle;
  6use POSIX qw(floor ceil);
  7
  8$usage = "execute_dwt_var_perFeature.pl [TABULAR.in] [FEATURE] [ALPHA] [TABULAR.out] [PDF.out] \n";
  9die $usage unless @ARGV == 5;
 10
 11#get the input arguments
 12my $inputFile = $ARGV[0];
 13my @features = split(/,/,$ARGV[1]);
 14my $features_count = scalar(@features);
 15my $alpha = $ARGV[2];
 16my $outFile1 = $ARGV[3];
 17my $outFile2 = $ARGV[4];
 18
 19open (INPUT, "<", $inputFile) || die("Could not open file $inputFile \n");
 20open (OUTPUT2, ">", $outFile1) || die("Could not open file $outFile1 \n");
 21open (OUTPUT3, ">", $outFile2) || die("Could not open file $outFile2 \n");
 22#open (ERROR,  ">", "error.txt")  or die ("Could not open file error.txt \n");
 23
 24# choosing meaningful names for the output files
 25$pvalue = $outFile1; 
 26$pdf = $outFile2; 
 27
 28# write R script
 29$r_script = "get_dwt_varPermut.r"; 
 30
 31open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \n\n";
 32
 33print Rcmd "
 34	######################################################################
 35	# plot multiscale wavelet variance 
 36	# create null bands by permuting the original data series
 37	# generate plots and table of wavelet variance including p-values
 38	######################################################################
 39	options(echo = FALSE)
 40	#library(\"Rwave\");
 41	#library(\"wavethresh\");
 42	#library(\"waveslim\");
 43	# turn off diagnostics for de-bugging only, turn back on for functional tests on test
 44	suppressMessages(require(\"Rwave\",quietly=TRUE,warn.conflicts = FALSE));
 45	suppressMessages(require(\"wavethresh\",quietly=TRUE,warn.conflicts = FALSE));
 46	suppressMessages(require(\"waveslim\",quietly=TRUE,warn.conflicts = FALSE));
 47	suppressMessages(require(\"bitops\",quietly=TRUE,warn.conflicts = FALSE));
 48
 49	# to determine if data is properly formatted 2^N observations
 50	is.power2<- function(x){x && !(bitAnd(x,x - 1));}
 51
 52	# dwt : discrete wavelet transform using Haar wavelet filter, simplest wavelet function but later can modify to let user-define the wavelet filter function
 53	dwt_var_permut_getMax <- function(data, names, alpha, filter = 1,family=\"DaubExPhase\", bc = \"symmetric\", method = \"kendall\", wf = \"haar\", boundary = \"reflection\") {
 54		max_var = NULL;
 55    		matrix = NULL;
 56		title = NULL;
 57    		final_pvalue = NULL;
 58		J = NULL;
 59		scale = NULL;
 60		out = NULL;
 61	
 62	print(class(data));	
 63    	print(names);
 64	print(alpha);
 65    	
 66	par(mar=c(5,4,4,3),oma = c(4, 4, 3, 2), xaxt = \"s\", cex = 1, las = 1);
 67   
 68	title<-c(\"Wavelet\",\"Variance\",\"Pvalue\",\"Test\");
 69	print(title);
 70
 71    	for(i in 1:length(names)){
 72		temp = NULL;
 73		results = NULL;
 74		wave1.dwt = NULL;
 75	
 76		# if data fails formatting check, do something
 77				
 78		print(is.numeric(as.matrix(data)[, i]));
 79		if(!is.numeric(as.matrix(data)[, i]))
 80			stop(\"data must be a numeric vector\");
 81		
 82		print(length(as.matrix(data)[, i]));
 83		print(is.power2(length(as.matrix(data)[, i])));
 84		if(!is.power2(length(as.matrix(data)[, i])))	
 85			stop(\"data length must be a power of two\");
 86
 87
 88    		J <- wd(as.matrix(data)[, i], filter.number = filter, family=family, bc = bc)\$nlevels;
 89		print(J);
 90            	temp <- vector(length = J);
 91               	wave1.dwt <- dwt(as.matrix(data)[, i], wf = wf, J, boundary = boundary); 
 92		#print(wave1.dwt);
 93                		
 94                temp <- wave.variance(wave1.dwt)[-(J+1), 1];
 95		print(temp);
 96
 97                #permutations code :
 98                feature1 = NULL;
 99		null = NULL;
100		var_lower=limit_lower=NULL;
101		var_upper=limit_upper=NULL;
102		med = NULL;
103
104		limit_lower = alpha/2*1000;
105		print(limit_lower);
106		limit_upper = (1-alpha/2)*1000;
107		print(limit_upper);
108		
109		feature1 = as.matrix(data)[,i];
110                for (k in 1:1000) {
111			nk_1 = NULL;
112			null.levels = NULL;
113			var = NULL;
114			null_wave1 = NULL;
115
116                       	nk_1 = sample(feature1, length(feature1), replace = FALSE);
117                       	null.levels <- wd(nk_1, filter.number = filter,family=family ,bc = bc)\$nlevels;
118                       	var <- vector(length = length(null.levels));
119                       	null_wave1 <- dwt(nk_1, wf = wf, J, boundary = boundary);
120                       	var<- wave.variance(null_wave1)[-(null.levels+1), 1];
121                       	null= rbind(null, var);
122               }
123               null <- apply(null, 2, sort, na.last = TRUE);
124               var_lower <- null[limit_lower, ];
125               var_upper <- null[limit_upper, ];
126               med <- (apply(null, 2, median, na.rm = TRUE));
127
128               # plot
129               results <- cbind(temp, var_lower, var_upper);
130		print(results);
131                matplot(results, type = \"b\", pch = \"*\", lty = 1, col = c(1, 2, 2),xaxt='n',xlab=\"Wavelet Scale\",ylab=\"Wavelet variance\" );
132		mtext(names[i], side = 3, line = 0.5, cex = 1);
133		axis(1, at = 1:J , labels=c(2^(0:(J-1))), las = 3, cex.axis = 1);
134
135                # get pvalues by comparison to null distribution
136		#out <- (names[i]);
137                for (m in 1:length(temp)){
138                    	print(paste(\"scale\", m, sep = \" \"));
139                       	print(paste(\"var\", temp[m], sep = \" \"));
140                       	print(paste(\"med\", med[m], sep = \" \"));
141                       	pv = tail =scale = NULL;
142			scale=2^(m-1);
143			#out <- c(out, format(temp[m], digits = 3));	
144                       	if (temp[m] >= med[m]){
145                       		# R tail test
146                           	print(\"R\");
147	                       	tail <- \"R\";
148                            	pv <- (length(which(null[, m] >= temp[m])))/(length(na.exclude(null[, m])));
149
150                       	} else {
151                       		if (temp[m] < med[m]){
152                               		# L tail test
153                               		print(\"L\");
154	                            	tail <- \"L\";
155                                	pv <- (length(which(null[, m] <= temp[m])))/(length(na.exclude(null[, m])));
156                        	}
157			}
158			print(pv);
159			out<-rbind(out,c(paste(\"Scale\", scale, sep=\"_\"),format(temp[m], digits = 3),pv,tail));
160                }
161		final_pvalue <-rbind(final_pvalue, out);
162  	}
163	colnames(final_pvalue) <- title;
164    	return(final_pvalue);
165}\n";
166
167print Rcmd "
168# execute
169# read in data 
170data_test = final = NULL;
171sub = sub_names = NULL;
172data_test <- read.delim(\"$inputFile\",header=FALSE);
173pdf(file = \"$pdf\", width = 11, height = 8)\n";
174
175for ($x=0;$x<$features_count;$x++){	
176	$feature=$features[$x];
177print Rcmd "
178	if ($feature > ncol(data_test))
179		stop(\"column $feature doesn't exist\");	
180	sub<-data_test[,$feature];
181	#sub_names <- colnames(data_test);
182	sub_names<-colnames(data_test)[$feature];
183	final <- rbind(final,dwt_var_permut_getMax(sub, sub_names,$alpha));\n";
184}
185
186print Rcmd "
187
188	dev.off();
189	write.table(final, file = \"$pvalue\", sep = \"\\t\", quote = FALSE, row.names = FALSE);
190
191#eof\n";
192
193close Rcmd;
194system("R --no-restore --no-save --no-readline < $r_script > $r_script.out");
195
196#close the input and output and error files
197close(OUTPUT3);
198close(OUTPUT2);
199close(INPUT);