/tools/discreteWavelet/execute_dwt_var_perFeature.pl

https://bitbucket.org/cistrome/cistrome-harvard/ · Perl · 199 lines · 179 code · 13 blank · 7 comment · 5 complexity · 582bf9025cc66d3b4454a11ca0abf2a6 MD5 · raw file

  1. #!/usr/bin/perl -w
  2. # Author: Erika Kvikstad
  3. use warnings;
  4. use IO::Handle;
  5. use POSIX qw(floor ceil);
  6. $usage = "execute_dwt_var_perFeature.pl [TABULAR.in] [FEATURE] [ALPHA] [TABULAR.out] [PDF.out] \n";
  7. die $usage unless @ARGV == 5;
  8. #get the input arguments
  9. my $inputFile = $ARGV[0];
  10. my @features = split(/,/,$ARGV[1]);
  11. my $features_count = scalar(@features);
  12. my $alpha = $ARGV[2];
  13. my $outFile1 = $ARGV[3];
  14. my $outFile2 = $ARGV[4];
  15. open (INPUT, "<", $inputFile) || die("Could not open file $inputFile \n");
  16. open (OUTPUT2, ">", $outFile1) || die("Could not open file $outFile1 \n");
  17. open (OUTPUT3, ">", $outFile2) || die("Could not open file $outFile2 \n");
  18. #open (ERROR, ">", "error.txt") or die ("Could not open file error.txt \n");
  19. # choosing meaningful names for the output files
  20. $pvalue = $outFile1;
  21. $pdf = $outFile2;
  22. # write R script
  23. $r_script = "get_dwt_varPermut.r";
  24. open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \n\n";
  25. print Rcmd "
  26. ######################################################################
  27. # plot multiscale wavelet variance
  28. # create null bands by permuting the original data series
  29. # generate plots and table of wavelet variance including p-values
  30. ######################################################################
  31. options(echo = FALSE)
  32. #library(\"Rwave\");
  33. #library(\"wavethresh\");
  34. #library(\"waveslim\");
  35. # turn off diagnostics for de-bugging only, turn back on for functional tests on test
  36. suppressMessages(require(\"Rwave\",quietly=TRUE,warn.conflicts = FALSE));
  37. suppressMessages(require(\"wavethresh\",quietly=TRUE,warn.conflicts = FALSE));
  38. suppressMessages(require(\"waveslim\",quietly=TRUE,warn.conflicts = FALSE));
  39. suppressMessages(require(\"bitops\",quietly=TRUE,warn.conflicts = FALSE));
  40. # to determine if data is properly formatted 2^N observations
  41. is.power2<- function(x){x && !(bitAnd(x,x - 1));}
  42. # dwt : discrete wavelet transform using Haar wavelet filter, simplest wavelet function but later can modify to let user-define the wavelet filter function
  43. dwt_var_permut_getMax <- function(data, names, alpha, filter = 1,family=\"DaubExPhase\", bc = \"symmetric\", method = \"kendall\", wf = \"haar\", boundary = \"reflection\") {
  44. max_var = NULL;
  45. matrix = NULL;
  46. title = NULL;
  47. final_pvalue = NULL;
  48. J = NULL;
  49. scale = NULL;
  50. out = NULL;
  51. print(class(data));
  52. print(names);
  53. print(alpha);
  54. par(mar=c(5,4,4,3),oma = c(4, 4, 3, 2), xaxt = \"s\", cex = 1, las = 1);
  55. title<-c(\"Wavelet\",\"Variance\",\"Pvalue\",\"Test\");
  56. print(title);
  57. for(i in 1:length(names)){
  58. temp = NULL;
  59. results = NULL;
  60. wave1.dwt = NULL;
  61. # if data fails formatting check, do something
  62. print(is.numeric(as.matrix(data)[, i]));
  63. if(!is.numeric(as.matrix(data)[, i]))
  64. stop(\"data must be a numeric vector\");
  65. print(length(as.matrix(data)[, i]));
  66. print(is.power2(length(as.matrix(data)[, i])));
  67. if(!is.power2(length(as.matrix(data)[, i])))
  68. stop(\"data length must be a power of two\");
  69. J <- wd(as.matrix(data)[, i], filter.number = filter, family=family, bc = bc)\$nlevels;
  70. print(J);
  71. temp <- vector(length = J);
  72. wave1.dwt <- dwt(as.matrix(data)[, i], wf = wf, J, boundary = boundary);
  73. #print(wave1.dwt);
  74. temp <- wave.variance(wave1.dwt)[-(J+1), 1];
  75. print(temp);
  76. #permutations code :
  77. feature1 = NULL;
  78. null = NULL;
  79. var_lower=limit_lower=NULL;
  80. var_upper=limit_upper=NULL;
  81. med = NULL;
  82. limit_lower = alpha/2*1000;
  83. print(limit_lower);
  84. limit_upper = (1-alpha/2)*1000;
  85. print(limit_upper);
  86. feature1 = as.matrix(data)[,i];
  87. for (k in 1:1000) {
  88. nk_1 = NULL;
  89. null.levels = NULL;
  90. var = NULL;
  91. null_wave1 = NULL;
  92. nk_1 = sample(feature1, length(feature1), replace = FALSE);
  93. null.levels <- wd(nk_1, filter.number = filter,family=family ,bc = bc)\$nlevels;
  94. var <- vector(length = length(null.levels));
  95. null_wave1 <- dwt(nk_1, wf = wf, J, boundary = boundary);
  96. var<- wave.variance(null_wave1)[-(null.levels+1), 1];
  97. null= rbind(null, var);
  98. }
  99. null <- apply(null, 2, sort, na.last = TRUE);
  100. var_lower <- null[limit_lower, ];
  101. var_upper <- null[limit_upper, ];
  102. med <- (apply(null, 2, median, na.rm = TRUE));
  103. # plot
  104. results <- cbind(temp, var_lower, var_upper);
  105. print(results);
  106. matplot(results, type = \"b\", pch = \"*\", lty = 1, col = c(1, 2, 2),xaxt='n',xlab=\"Wavelet Scale\",ylab=\"Wavelet variance\" );
  107. mtext(names[i], side = 3, line = 0.5, cex = 1);
  108. axis(1, at = 1:J , labels=c(2^(0:(J-1))), las = 3, cex.axis = 1);
  109. # get pvalues by comparison to null distribution
  110. #out <- (names[i]);
  111. for (m in 1:length(temp)){
  112. print(paste(\"scale\", m, sep = \" \"));
  113. print(paste(\"var\", temp[m], sep = \" \"));
  114. print(paste(\"med\", med[m], sep = \" \"));
  115. pv = tail =scale = NULL;
  116. scale=2^(m-1);
  117. #out <- c(out, format(temp[m], digits = 3));
  118. if (temp[m] >= med[m]){
  119. # R tail test
  120. print(\"R\");
  121. tail <- \"R\";
  122. pv <- (length(which(null[, m] >= temp[m])))/(length(na.exclude(null[, m])));
  123. } else {
  124. if (temp[m] < med[m]){
  125. # L tail test
  126. print(\"L\");
  127. tail <- \"L\";
  128. pv <- (length(which(null[, m] <= temp[m])))/(length(na.exclude(null[, m])));
  129. }
  130. }
  131. print(pv);
  132. out<-rbind(out,c(paste(\"Scale\", scale, sep=\"_\"),format(temp[m], digits = 3),pv,tail));
  133. }
  134. final_pvalue <-rbind(final_pvalue, out);
  135. }
  136. colnames(final_pvalue) <- title;
  137. return(final_pvalue);
  138. }\n";
  139. print Rcmd "
  140. # execute
  141. # read in data
  142. data_test = final = NULL;
  143. sub = sub_names = NULL;
  144. data_test <- read.delim(\"$inputFile\",header=FALSE);
  145. pdf(file = \"$pdf\", width = 11, height = 8)\n";
  146. for ($x=0;$x<$features_count;$x++){
  147. $feature=$features[$x];
  148. print Rcmd "
  149. if ($feature > ncol(data_test))
  150. stop(\"column $feature doesn't exist\");
  151. sub<-data_test[,$feature];
  152. #sub_names <- colnames(data_test);
  153. sub_names<-colnames(data_test)[$feature];
  154. final <- rbind(final,dwt_var_permut_getMax(sub, sub_names,$alpha));\n";
  155. }
  156. print Rcmd "
  157. dev.off();
  158. write.table(final, file = \"$pvalue\", sep = \"\\t\", quote = FALSE, row.names = FALSE);
  159. #eof\n";
  160. close Rcmd;
  161. system("R --no-restore --no-save --no-readline < $r_script > $r_script.out");
  162. #close the input and output and error files
  163. close(OUTPUT3);
  164. close(OUTPUT2);
  165. close(INPUT);