/tools/stats/lda_analy.xml
<tool id="lda_analy1" name="Perform LDA" version="1.0.1">
  <description>Linear Discriminant Analysis</description>
  <command interpreter="sh">r_wrapper.sh $script_file</command>
  <inputs>
    <param format="tabular" name="input" type="data" label="Source file"/>
    <param name="cond" size="30" type="integer" value="3" label="Number of principal components" help="See TIP below">
      <validator type="empty_field" message="Enter a valid number of principal components"/>
    </param>
  </inputs>
  <outputs>
    <data format="txt" name="output" />
  </outputs>
  <tests>
    <test>
      <param name="input" value="matrix_generator_for_pc_and_lda_output.tabular"/>
      <param name="cond" value="2"/>
      <output name="output" file="lda_analy_output.txt"/>
    </test>
  </tests>
  <configfiles>
    <configfile name="script_file">
rm(list = objects())

############# FORMAT X DATA #########################
# Drop every column whose last row is NA.
format <- function(data) {
  ind <- NULL
  for (i in 1:ncol(data)) {
    if (is.na(data[nrow(data), i])) {
      ind <- c(ind, i)
    }
  }
  #print(is.null(ind))
  if (!is.null(ind)) {
    data <- data[, -c(ind)]
  }
  data
}

######## GET RESPONSES ###############################
# Recode the class labels in the last column ("Y " / "X ") as 0 / 1.
get_resp <- function(data) {
  resp1 <- as.vector(data[, ncol(data)])
  resp <- numeric(length(resp1))
  for (i in 1:length(resp1)) {
    if (resp1[i] == "Y ") {
      resp[i] <- 0
    }
    if (resp1[i] == "X ") {
      resp[i] <- 1
    }
  }
  return(resp)
}

######## CHARS TO NUMBERS ###########################
# Coerce a character matrix to an integer matrix of the same dimensions.
f_to_numbers <- function(F) {
  ind <- NULL
  G <- matrix(0, nrow(F), ncol(F))
  for (i in 1:nrow(F)) {
    for (j in 1:ncol(F)) {
      G[i, j] <- as.integer(F[i, j])
    }
  }
  return(G)
}
###################NORMALIZING#########################
# Center and scale each column with non-zero standard deviation
# (arguments a and b are accepted for compatibility but not used).
norm <- function(M, a = NULL, b = NULL) {
  C <- NULL
  ind <- NULL
  for (i in 1:ncol(M)) {
    if (sd(M[, i]) != 0) {
      M[, i] <- (M[, i] - mean(M[, i])) / sd(M[, i])
    }
    # else {print(mean(M[,i]))}
  }
  return(M)
}
##### LDA DIRECTIONS #################################
# Compute discriminant directions: whiten with W^(-1/2), then take the
# singular vectors of the whitened between-class scatter matrix B.
lda_dec <- function(data, k) {
  priors <- numeric(k)
  grandmean <- numeric(ncol(data) - 1)
  means <- matrix(0, k, ncol(data) - 1)
  B <- matrix(0, ncol(data) - 1, ncol(data) - 1)
  N <- nrow(data)
  for (i in 1:k) {
    priors[i] <- sum(data[, 1] == i) / N
    grp <- subset(data, data\$group == i)
    means[i, ] <- colMeans(grp[, 2:ncol(data), drop = FALSE])
    #print(means[i,])
    #print(priors[i])
    #print(priors[i]*means[i,])
    grandmean <- priors[i] * means[i, ] + grandmean
  }
  for (i in 1:k) {
    B <- B + priors[i] * ((means[i, ] - grandmean) %*% t(means[i, ] - grandmean))
  }

  W <- var(data[, 2:ncol(data)])
  svdW <- svd(W)
  inv_sqrtW <- solve(svdW\$v %*% diag(sqrt(svdW\$d)) %*% t(svdW\$v))
  B_star <- t(inv_sqrtW) %*% B %*% inv_sqrtW
  B_star_decomp <- svd(B_star)
  directions <- inv_sqrtW %*% B_star_decomp\$v
  return(list(directions, B_star_decomp\$d))
}
################ NAIVE BAYES FOR 1D SIR OR LDA ##############
# Classify 1-D projected test points by a cutoff placed between the two
# training class means; tau in [0,1] moves the cutoff from one mean to the other.
naive_bayes_classifier <- function(resp, tr_data, test_data, k = 2, tau) {
  tr_data <- data.frame(resp = resp, dir = tr_data)
  means <- numeric(k)
  #print(k)
  cl <- numeric(k)
  predclass <- numeric(length(test_data))
  for (i in 1:k) {
    grp <- subset(tr_data, resp == i)
    means[i] <- mean(grp\$dir)
    #print(i, means[i])
  }
  cutoff <- tau * means[1] + (1 - tau) * means[2]
  #print(tau)
  #print(means)
  #print(cutoff)
  if (cutoff > means[1]) {
    cl[1] <- 1
    cl[2] <- 2
  } else {
    cl[1] <- 2
    cl[2] <- 1
  }
  for (i in 1:length(test_data)) {
    if (test_data[i] <= cutoff) {
      predclass[i] <- cl[1]
    } else {
      predclass[i] <- cl[2]
    }
  }
  #print(means)
  #print(mean(means))
  #X11()
  #plot(test_data,pch=predclass, col=resp)
  predclass
}
################# EXTENDED ERROR RATES #################
# Return classification success rates (%) overall, within class 1 ("escape"),
# and within class 2 ("subject"), in that order.
ext_error_rate <- function(predclass, actualclass, msg = c("you forgot the message"), pr = 1) {
  er <- sum(predclass != actualclass) / length(predclass)
  matr <- data.frame(predclass = predclass, actualclass = actualclass)
  escapes <- subset(matr, actualclass == 1)
  subjects <- subset(matr, actualclass == 2)
  er_esc <- sum(escapes\$predclass != escapes\$actualclass) / length(escapes\$predclass)
  er_subj <- sum(subjects\$predclass != subjects\$actualclass) / length(subjects\$predclass)
  if (pr == 1) {
    # print(paste(c(msg, 'overall : ', (1-er)*100, "%."),collapse=" "))
    # print(paste(c(msg, 'within escapes : ', (1-er_esc)*100, "%."),collapse=" "))
    # print(paste(c(msg, 'within subjects: ', (1-er_subj)*100, "%."),collapse=" "))
  }
  return(c((1 - er) * 100, (1 - er_esc) * 100, (1 - er_subj) * 100))
}
## Main Function ##
files <- matrix("${input}", 1, 1, byrow = T)
d <- as.integer("${cond}")   # number of principal components
tau <- seq(0, 1, by = 0.005)
#tau <- seq(0, 1, by = 0.1)
for_curve <- matrix(-10, 3, length(tau))

##############################################################
test_data_whole_X <- read.delim(files[1, 1], row.names = 1)

#### FORMAT TRAINING DATA ####################################
# keep only the necessary columns
test_data_whole_X <- format(test_data_whole_X)
oligo_labels <- test_data_whole_X[1:(nrow(test_data_whole_X) - 1), ncol(test_data_whole_X)]
test_data_whole_X <- test_data_whole_X[, 1:(ncol(test_data_whole_X) - 1)]
X_names <- colnames(test_data_whole_X)[1:ncol(test_data_whole_X)]
test_data_whole_X <- t(test_data_whole_X)
resp <- get_resp(test_data_whole_X)
ldaqda_resp <- resp + 1
a <- sum(resp)          # number of subjects ("X " labels)
b <- length(resp) - a   # number of escapes ("Y " labels)

## FREQUENCIES #################################################
F <- test_data_whole_X[, 1:(ncol(test_data_whole_X) - 1)]
F <- f_to_numbers(F)
FN <- norm(F, a, b)
ss <- svd(FN)
eigvar <- NULL
eig <- ss\$d^2
# cumulative proportion of variance explained by the first i components
for (i in 1:length(ss\$d)) {
  eigvar[i] <- sum(eig[1:i]) / sum(eig)
}
#print(paste(c("Variance explained : ", eigvar[d]*100, "%"), collapse=""))

# project onto the principal components and compute the LDA direction
Z <- F %*% ss\$v
ldaqda_data <- data.frame(group = ldaqda_resp, Z[, 1:d])
lda_dir <- lda_dec(ldaqda_data, 2)
train_lda_pred <- Z[, 1:d] %*% lda_dir[[1]]

############# NAIVE BAYES CROSS-VALIDATION #############
### LDA #####
y <- ldaqda_resp
X <- F
cv <- matrix(NA, nrow(test_data_whole_X), length(tau))
# leave-one-out cross-validation: for each sample, refit PCA + LDA on the
# remaining samples, project the held-out sample, and classify it for every tau
for (i in 1:nrow(test_data_whole_X)) {
  # print(i)
  resp <- y[-i]
  p <- matrix(X[-i, ], dim(X)[1] - 1, dim(X)[2])
  testdata <- matrix(X[i, ], 1, dim(X)[2])
  p1 <- norm(p)
  sss <- svd(p1)
  pred <- (p %*% sss\$v)[, 1:d]
  test <- (testdata %*% sss\$v)[, 1:d]
  lda <- lda_dec(data.frame(group = resp, pred), 2)
  pred <- pred[, 1:d] %*% lda[[1]][, 1]
  test <- test %*% lda[[1]][, 1]
  test <- matrix(test, 1, length(test))
  for (t in 1:length(tau)) {
    cv[i, t] <- naive_bayes_classifier(resp, pred, test, k = 2, tau[t])
  }
}

# success rates (overall / escapes / subjects) for each value of tau
for (t in 1:length(tau)) {
  tr_err <- ext_error_rate(cv[, t], ldaqda_resp, c("CV"), 1)
  for_curve[1:3, t] <- tr_err
}

dput(for_curve, file = "${output}")
    </configfile>
  </configfiles>

  <help>
.. class:: infomark

**TIP:** To perform Principal Component Analysis (PCA) on the given numeric input data (which corresponds to the "Source file" input of the "Generate A Matrix" tool), please use *Multivariate Analysis/Principal Component Analysis*.

-----

.. class:: infomark

**What it does**

This tool performs Linear Discriminant Analysis (LDA) as described in Carrel et al., 2006 (PMID: 17009873):

*Carrel L, Park C, Tyekucheva S, Dunn J, Chiaromonte F, et al. (2006) Genomic Environment Predicts Expression Patterns on the Human Inactive X Chromosome. PLoS Genet 2(9): e151. doi:10.1371/journal.pgen.0020151*
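Internally, the tool projects the data onto the first *d* principal components and computes a discriminant direction by whitening with the covariance matrix and decomposing the between-class scatter. The following is a rough, illustrative R sketch of that idea on a made-up two-class matrix (all names here are examples only, not the tool's code)::

  set.seed(1)
  dat    = rbind(matrix(rnorm(40, mean = 0), 20, 2),
                 matrix(rnorm(40, mean = 2), 20, 2))
  grp    = rep(1:2, each = 20)
  cmeans = rowsum(dat, grp) / as.vector(table(grp))      # class means
  gm     = colMeans(dat)                                 # grand mean
  B      = crossprod(sweep(cmeans, 2, gm))               # between-class scatter (unweighted)
  W      = var(dat)                                      # covariance, as in the script
  dir1   = Re(eigen(solve(W) %*% B)[["vectors"]][, 1])   # leading discriminant direction
  scores = dat %*% dir1                                  # 1-D projection used for classification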
-----

.. class:: warningmark

**Note**

- The output of the "Generate A Matrix" tool is used as the input file for this tool.
- The output of this tool contains the LDA classification success rates for different values of the tuning parameter tau (from 0 to 1 in steps of 0.005). This file is used to build the ROC plot, from which you can obtain more detailed information; see the sketch after this list for one way to read and plot it.
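The output is written with R's ``dput()``, so it can be read back with ``dget()``. A minimal plotting sketch (the file name below is just an example for a downloaded output)::

  rates = dget("lda_analy_output.txt")   # 3 x 201 matrix: overall / escapes / subjects
  tau   = seq(0, 1, by = 0.005)
  plot(tau, rates[1, ], type = "l", ylim = c(0, 100),
       xlab = "tau", ylab = "classification success (%)")
  lines(tau, rates[2, ], lty = 2)        # success within "escape" samples
  lines(tau, rates[3, ], lty = 3)        # success within "subject" samples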
  </help>
</tool>