/tools/stats/lda_analy.xml

https://bitbucket.org/cistrome/cistrome-harvard/ · XML · 285 lines · 243 code · 42 blank · 0 comment · 0 complexity · cf3a5045b446556783fe5560bfa37c91 MD5 · raw file

  1. <tool id="lda_analy1" name="Perform LDA" version="1.0.1">
  2. <description>Linear Discriminant Analysis</description>
  3. <command interpreter="sh">r_wrapper.sh $script_file</command>
  4. <inputs>
  5. <param format="tabular" name="input" type="data" label="Source file"/>
  6. <param name="cond" size="30" type="integer" value="3" label="Number of principal components" help="See TIP below">
  7. <validator type="empty_field" message="Enter a valid number of principal components, see syntax below for examples"/>
  8. </param>
  9. </inputs>
  10. <outputs>
  11. <data format="txt" name="output" />
  12. </outputs>
  13. <tests>
  14. <test>
  15. <param name="input" value="matrix_generator_for_pc_and_lda_output.tabular"/>
  16. <output name="output" file="lda_analy_output.txt"/>
  17. <param name="cond" value="2"/>
  18. </test>
  19. </tests>
  20. <configfiles>
  21. <configfile name="script_file">
  22. rm(list = objects() )
# ^ start from an empty workspace so no stale objects leak into this run.
  23. ############# FORMAT X DATA #########################
  24. format&lt;-function(data) {
# Drop every column whose entry in the LAST row is NA and return the
# pruned data.  The last row appears to carry the class labels (it is
# what get_resp reads after the transpose in the main script) — TODO
# confirm against a real input file.
# NOTE(review): this definition masks base::format within the script.
  25. ind=NULL
  26. for(i in 1 : ncol(data)){
# Collect indices of columns with a missing value in the final row.
  27. if (is.na(data[nrow(data),i])) {
  28. ind&lt;-c(ind,i)
  29. }
  30. }
  31. #print(is.null(ind))
# Remove all flagged columns at once; skip when none were flagged.
  32. if (!is.null(ind)) {
  33. data&lt;-data[,-c(ind)]
  34. }
  35. data
  36. }
  37. ########GET RESPONSES ###############################
  38. get_resp&lt;- function(data) {
# Convert the label column (last column of `data`) into a 0/1 numeric
# response vector: "Y " maps to 0 and "X " maps to 1.
# NOTE(review): both label strings carry a trailing space, and entries
# matching neither string silently keep the default 0 — confirm the
# exact label spelling against the actual input files.
  39. resp1&lt;-as.vector(data[,ncol(data)])
  40. resp=numeric(length(resp1))
  41. for (i in 1:length(resp1)) {
  42. if (resp1[i]=="Y ") {
  43. resp[i] = 0
  44. }
  45. if (resp1[i]=="X ") {
  46. resp[i] = 1
  47. }
  48. }
  49. return(resp)
  50. }
  51. ######## CHARS TO NUMBERS ###########################
  52. f_to_numbers&lt;- function(F) {
# Convert F cell-by-cell with as.integer and return a plain numeric
# matrix of the same dimensions.  `ind` is assigned but never used.
  53. ind&lt;-NULL
  54. G&lt;-matrix(0,nrow(F), ncol(F))
  55. for (i in 1:nrow(F)) {
  56. for (j in 1:ncol(F)) {
  57. G[i,j]&lt;-as.integer(F[i,j])
  58. }
  59. }
  60. return(G)
  61. }
  62. ###################NORMALIZING#########################
  63. norm &lt;- function(M, a=NULL, b=NULL) {
# Z-score each column of M (subtract the column mean, divide by the
# sd); constant columns (sd == 0) are left unchanged to avoid dividing
# by zero.  `a`, `b`, `C` and `ind` are accepted/initialised but never
# used.
# NOTE(review): this definition masks base::norm within the script.
  64. C&lt;-NULL
  65. ind&lt;-NULL
  66. for (i in 1: ncol(M)) {
  67. if (sd(M[,i])!=0) {
  68. M[,i]&lt;-(M[,i]-mean(M[,i]))/sd(M[,i])
  69. }
  70. # else {print(mean(M[,i]))}
  71. }
  72. return(M)
  73. }
  74. ##### LDA DIRECTIONS #################################
  75. lda_dec &lt;- function(data, k){
# Compute LDA discriminant directions for k classes.
# `data` must be a data.frame whose FIRST column is named `group` and
# holds integer class codes 1..k; the remaining columns are features.
# Returns list(directions, d): the discriminant vectors (as columns)
# and the singular values of the whitened between-class scatter B*.
  76. priors=numeric(k)
  77. grandmean&lt;-numeric(ncol(data)-1)
  78. means=matrix(0,k,ncol(data)-1)
  79. B = matrix(0, ncol(data)-1, ncol(data)-1)
  80. N=nrow(data)
  81. for (i in 1:k){
# Class prior = relative frequency of class i in the group column.
  82. priors[i]=sum(data[,1]==i)/N
  83. grp=subset(data,data\$group==i)
# NOTE(review): mean() on a data.frame relies on a long-deprecated
# method (removed in modern R); colMeans(grp[,2:ncol(data)]) is the
# modern equivalent — confirm the target R version before changing.
  84. means[i,]=mean(grp[,2:ncol(data)])
  85. #print(means[i,])
  86. #print(priors[i])
  87. #print(priors[i]*means[i,])
# Accumulate the prior-weighted grand mean of the class means.
  88. grandmean = priors[i]*means[i,] + grandmean
  89. }
  90. for (i in 1:k) {
# Between-class scatter: prior-weighted outer products of the centred
# class means.
  91. B= B + priors[i]*((means[i,]-grandmean)%*%t(means[i,]-grandmean))
  92. }
# W is the covariance over ALL rows (total covariance), used here in
# place of a pooled within-class covariance.
  93. W = var(data[,2:ncol(data)])
# Form W^(-1/2) from the SVD of W, whiten B with it, and decompose the
# whitened scatter; mapping its right-singular vectors back through
# W^(-1/2) gives the discriminant directions.
  94. svdW = svd(W)
  95. inv_sqrtW =solve(svdW\$v %*% diag(sqrt(svdW\$d)) %*% t(svdW\$v))
  96. B_star= t(inv_sqrtW)%*%B%*%inv_sqrtW
  97. B_star_decomp = svd(B_star)
  98. directions = inv_sqrtW%*%B_star_decomp\$v
  99. return( list(directions, B_star_decomp\$d) )
  100. }
  101. ################ NAIVE BAYES FOR 1D SIR OR LDA ##############
  102. naive_bayes_classifier &lt;- function(resp, tr_data, test_data, k=2, tau) {
# Classify 1-D projected points with a single threshold.
# resp      : training class codes (1..k), aligned with tr_data.
# tr_data   : 1-D training projections (one value per training row).
# test_data : 1-D projections to classify.
# tau       : mixing weight; cutoff = tau*mean(class 1) +
#             (1-tau)*mean(class 2), so sweeping tau traces the ROC.
# Returns the vector of predicted class codes for test_data.
# NOTE(review): only means[1] and means[2] feed the cutoff, so despite
# the k argument this effectively supports k=2 only — confirm before
# reusing with more classes.
  103. tr_data=data.frame(resp=resp, dir=tr_data)
  104. means=numeric(k)
  105. #print(k)
  106. cl=numeric(k)
  107. predclass=numeric(length(test_data))
  108. for (i in 1:k) {
# Per-class mean of the projected training points.
  109. grp = subset(tr_data, resp==i)
  110. means[i] = mean(grp\$dir)
  111. #print(i, means[i])
  112. }
  113. cutoff = tau*means[1]+(1-tau)*means[2]
  114. #print(tau)
  115. #print(means)
  116. #print(cutoff)
# Decide which side of the cutoff maps to which class: the side that
# contains means[1] is labelled class 1.
  117. if (cutoff&gt;means[1]) {
  118. cl[1]=1
  119. cl[2]=2
  120. }
  121. else {
  122. cl[1]=2
  123. cl[2]=1
  124. }
# Threshold every test point against the cutoff.
  125. for (i in 1:length(test_data)) {
  126. if (test_data[i] &lt;= cutoff) {
  127. predclass[i] = cl[1]
  128. }
  129. else {
  130. predclass[i] = cl[2]
  131. }
  132. }
  133. #print(means)
  134. #print(mean(means))
  135. #X11()
  136. #plot(test_data,pch=predclass, col=resp)
  137. predclass
  138. }
  139. ################# EXTENDED ERROR RATES #################
  140. ext_error_rate &lt;- function(predclass, actualclass,msg=c("you forgot the message"), pr=1) {
# Compare predicted vs. actual class codes and return success
# percentages as c(overall, class 1 ("escapes"), class 2 ("subjects")).
# `msg` and `pr` only affected the print statements below, which are
# commented out, so both arguments are currently inert.
  141. er=sum(predclass != actualclass)/length(predclass)
  142. matr&lt;-data.frame(predclass=predclass,actualclass=actualclass)
# Split by the TRUE class to get per-class error rates.
  143. escapes = subset(matr, actualclass==1)
  144. subjects = subset(matr, actualclass==2)
  145. er_esc=sum(escapes\$predclass != escapes\$actualclass)/length(escapes\$predclass)
  146. er_subj=sum(subjects\$predclass != subjects\$actualclass)/length(subjects\$predclass)
  147. if (pr==1) {
  148. # print(paste(c(msg, 'overall : ', (1-er)*100, "%."),collapse=" "))
  149. # print(paste(c(msg, 'within escapes : ', (1-er_esc)*100, "%."),collapse=" "))
  150. # print(paste(c(msg, 'within subjects: ', (1-er_subj)*100, "%."),collapse=" "))
  151. }
# Convert error rates to success percentages.
  152. return(c((1-er)*100, (1-er_esc)*100, (1-er_subj)*100))
  153. }
  154. ## Main Function ##
# Driver script.  Galaxy/Cheetah substitutes ${input}, ${cond} and
# ${output} at tool-run time; "\$" below escapes literal dollar signs
# so Cheetah does not treat them as template variables.
  155. files&lt;-matrix("${input}", 1,1, byrow=T)
# NOTE(review): d arrives as a STRING; the later `1:d` and `Z[,1:d]`
# work only because `:` coerces character to numeric — confirm this is
# intended rather than as.integer("${cond}").
  156. d&lt;-"${cond}" # Number of PC
# Grid of tuning parameters for the classifier cutoff (ROC sweep).
  157. tau&lt;-seq(0,1, by=0.005)
  158. #tau&lt;-seq(0,1, by=0.1)
  159. for_curve=matrix(-10, 3,length(tau))
  160. ##############################################################
  161. test_data_whole_X &lt;-read.delim(files[1,1], row.names=1)
  162. #### FORMAT TRAINING DATA ####################################
  163. # get only necessary columns
  164. test_data_whole_X&lt;-format(test_data_whole_X)
# Peel off the label column, remember the feature names, then transpose
# so rows become observations.
  165. oligo_labels&lt;-test_data_whole_X[1:(nrow(test_data_whole_X)-1),ncol(test_data_whole_X)]
  166. test_data_whole_X&lt;-test_data_whole_X[,1:(ncol(test_data_whole_X)-1)]
  167. X_names&lt;-colnames(test_data_whole_X)[1:ncol(test_data_whole_X)]
  168. test_data_whole_X&lt;-t(test_data_whole_X)
# 0/1 responses from the label row; the LDA code uses classes 1/2.
  169. resp&lt;-get_resp(test_data_whole_X)
  170. ldaqda_resp = resp + 1
  171. a&lt;-sum(resp) # Number of Subject
  172. b&lt;-length(resp) - a # Number of Escape
  173. ## FREQUENCIES #################################################
# Numeric feature matrix -> z-scored copy -> SVD (PCA on the columns).
  174. F&lt;-test_data_whole_X[,1:(ncol(test_data_whole_X)-1)]
  175. F&lt;-f_to_numbers(F)
  176. FN&lt;-norm(F, a, b)
  177. ss&lt;-svd(FN)
  178. eigvar&lt;-NULL
  179. eig&lt;-ss\$d^2
# Cumulative proportion of variance explained by the leading PCs.
  180. for ( i in 1:length(ss\$d)) {
  181. eigvar[i]&lt;-sum(eig[1:i])/sum(eig)
  182. }
  183. #print(paste(c("Variance explained : ", eigvar[d]*100, "%"), collapse=""))
# Project the (unnormalised) features onto the PCs and fit LDA on the
# first d components.
  184. Z&lt;-F%*%ss\$v
  185. ldaqda_data &lt;- data.frame(group=ldaqda_resp,Z[,1:d])
  186. lda_dir&lt;-lda_dec(ldaqda_data,2)
  187. train_lda_pred &lt;-Z[,1:d]%*%lda_dir[[1]]
  188. ############# NAIVE BAYES CROSS-VALIDATION #############
  189. ### LDA #####
# Leave-one-out CV: for each held-out row, redo PCA+LDA on the rest,
# project the held-out point, and classify it over the whole tau grid.
  190. y&lt;-ldaqda_resp
  191. X&lt;-F
  192. cv&lt;-matrix(c(rep('NA',nrow(test_data_whole_X))), nrow(test_data_whole_X), length(tau))
  193. for (i in 1:nrow(test_data_whole_X)) {
  194. # print(i)
  195. resp&lt;-y[-i]
  196. p&lt;-matrix(X[-i,], dim(X)[1]-1, dim(X)[2])
  197. testdata&lt;-matrix(X[i,],1,dim(X)[2])
  198. p1&lt;-norm(p)
  199. sss&lt;-svd(p1)
  200. pred&lt;-(p%*%sss\$v)[,1:d]
  201. test&lt;- (testdata%*%sss\$v)[,1:d]
  202. lda &lt;- lda_dec(data.frame(group=resp,pred),2)
# Project training and test points onto the FIRST discriminant only.
  203. pred &lt;- pred[,1:d]%*%lda[[1]][,1]
  204. test &lt;- test%*%lda[[1]][,1]
  205. test&lt;-matrix(test, 1, length(test))
  206. for (t in 1:length(tau)) {
  207. cv[i, t] &lt;- naive_bayes_classifier (resp, pred, test,k=2, tau[t])
  208. }
  209. }
# Success rates (overall / escapes / subjects) for each tau -> ROC data.
  210. for (t in 1:length(tau)) {
  211. tr_err&lt;-ext_error_rate(cv[,t], ldaqda_resp , c("CV"), 1)
  212. for_curve[1:3,t]&lt;-tr_err
  213. }
# Serialise the 3 x length(tau) matrix for the downstream ROC tool.
  214. dput(for_curve, file="${output}")
  215. </configfile>
  216. </configfiles>
  217. <help>
  218. .. class:: infomark
  219. **TIP:** If you want to perform Principal Component Analysis (PCA) on the given numeric input data (which corresponds to the "Source file" input of the "Generate A Matrix" tool), please use *Multivariate Analysis/Principal Component Analysis*
  220. -----
  221. .. class:: infomark
  222. **What it does**
  223. This tool performs the Linear Discriminant Analysis described in Carrel et al., 2006 (PMID: 17009873)
  224. *Carrel L, Park C, Tyekucheva S, Dunn J, Chiaromonte F, et al. (2006) Genomic Environment Predicts Expression Patterns on the Human Inactive X Chromosome. PLoS Genet 2(9): e151. doi:10.1371/journal.pgen.0020151*
  225. -----
  226. .. class:: warningmark
  227. **Note**
  228. - Output from "Generate A Matrix" tool is used as input file for this tool
  229. - Output of this tool contains LDA classification success rates for different values of the tuning parameter tau (from 0 to 1 with a 0.005 interval). This output file is used to build the ROC plot, from which you can obtain more detailed information.
  230. </help>
  231. </tool>