PageRenderTime 182ms CodeModel.GetById 171ms app.highlight 8ms RepoModel.GetById 1ms app.codeStats 0ms

/fingerprint/R/misc.R

http://github.com/rajarshi/cdkr
R | 233 lines | 192 code | 32 blank | 9 comment | 109 complexity | b635834ec05b7b42fb0634b8045201d2 MD5 | raw file
  1
  2setGeneric("fold", function(fp) standardGeneric("fold"))
  3setMethod("fold", "fingerprint",
  4          function(fp) {
  5            size <- fp@nbit
  6            if (size %% 2 != 0) {
  7              stop('Need to supply a fingerprint of even numbered length')
  8            }
  9            bfp <- rep(FALSE, size)
 10            bfp[fp@bits] <- TRUE
 11
 12            subfplen <- size/2
 13            
 14            b1 <- which(bfp[1:subfplen])
 15            b2 <- which(bfp[(subfplen+1):size])
 16            
 17            subfp1 <- new("fingerprint",
 18                          nbit=subfplen,
 19                          bits=b1,
 20                          provider="R");
 21            
 22            subfp2 <- new("fingerprint",
 23                          nbit=subfplen,
 24                          bits=b2,
 25                          provider="R")
 26            foldedfp <- subfp1 | subfp2
 27            foldedfp@folded <- TRUE
 28            return(foldedfp)
 29          })
 30
 31setGeneric("euc.vector", function(fp) standardGeneric("euc.vector"))
 32setMethod("euc.vector", "fingerprint",
 33          function(fp) {
 34            coord <- rep(0,length(fp))
 35            coord[fp@bits] <- 1.0 / sqrt(length(fp))
 36            coord
 37          })
 38
 39
 40setGeneric("distance", function(fp1,fp2,method,a,b) standardGeneric("distance"))
 41setMethod("distance", c("featvec", "featvec", "missing", "missing", "missing"),
 42          function(fp1, fp2) {
 43            distance(fp1, fp2, "tanimoto" )
 44          })
 45setMethod("distance", c("featvec", "featvec", "character", "missing", "missing"),
 46          function(fp1, fp2, method=c("tanimoto", "dice", "robust")) {
 47            method <- match.arg(method)
 48            n1 <- length(fp1)
 49            n2 <- length(fp2)
 50            ## extract the feature strings, ignoring counts for now
 51            f1 <- sapply(fp1@features, function(x) x@feature)
 52            f2 <- sapply(fp2@features, function(x) x@feature)
 53            
 54            n12 <- length(intersect(f1,f2))
 55            if (method == 'tanimoto') {
 56              return(n12/(n1+n2-n12))
 57            } else if (method == "robust") {
 58              return(0.5 + 0.5 * n12 * n12 / (n1*n2))
 59            } else if (method == "dice") {
 60              return(2.0 * n12 / (n1+n2))
 61            }
 62          })
 63
 64setMethod("distance", c("fingerprint", "fingerprint", "missing", "missing", "missing"),
 65          function(fp1,fp2) {
 66            distance(fp1,fp2,"tanimoto")
 67          })
 68
 69setMethod("distance", c("fingerprint", "fingerprint", "character", "numeric", "numeric"),
 70          function(fp1, fp2, method="tversky", a, b) {
 71            if (!is.null(method) && !is.na(method) && method != "tversky") distance(fp1, fp2, method)
 72            if ( length(fp1) != length(fp2))
 73              stop("Fingerprints must of the same bit length")
 74            if (a < 0 || b < 0) stop("a and b must be positive")
 75
 76            tmp <- fp1 & fp2
 77            xiy <- length(tmp@bits)
 78
 79            tmp <- fp1 | fp2
 80            xuy <- length(tmp@bits)
 81
 82            x <- length(fp1@bits)
 83            y <- length(fp2@bits)
 84            return( xiy / (a*x + b*y + (1-a-b)*xiy ) )
 85          })
 86setMethod("distance", c("fingerprint", "fingerprint", "character", "missing", "missing"),
 87          function(fp1,fp2, method=c('tanimoto', 'euclidean', 'mt',
 88                              'simple', 'jaccard', 'dice',
 89                              'russelrao', 'rodgerstanimoto','cosine',
 90                              'achiai', 'carbo', 'baroniurbanibuser',
 91                              'kulczynski2',
 92                              
 93                              'hamming', 'meanHamming', 'soergel',
 94                              'patternDifference', 'variance', 'size', 'shape',
 95
 96                              'hamann', 'yule', 'pearson', 'dispersion',
 97                              'mcconnaughey', 'stiles',
 98
 99                              'simpson', 'petke',
100                              'stanimoto', 'seuclidean'
101                              )) {
102
103            if (method == 'tversky')
104              stop("If Tversky metric is desired, must specify a and b")
105            
106            if ( length(fp1) != length(fp2))
107              stop("Fingerprints must of the same bit length")
108            
109            method <- match.arg(method)
110            n <- length(fp1)
111
112            if (method == 'tanimoto') {
113              f1 <- numeric(n)
114              f2 <- numeric(n)
115              f1[fp1@bits] <- 1
116              f2[fp2@bits] <- 1
117              sim <- 0.0
118              ret <-  .C("fpdistance", as.double(f1), as.double(f2),
119                         as.integer(n), as.integer(1),
120                         as.double(sim),
121                         PACKAGE="fingerprint")
122              return (ret[[5]])
123            } else if (method == 'euclidean') {
124              f1 <- numeric(n)
125              f2 <- numeric(n)
126              f1[fp1@bits] <- 1
127              f2[fp2@bits] <- 1
128              sim <- 0.0
129              ret <-  .C("fpdistance", as.double(f1), as.double(f1),
130                         as.integer(n), as.integer(2),
131                         as.double(sim),
132                         PACKAGE="fingerprint")
133              return (ret[[5]])
134            }
135
136            size <- n
137
138            ## in A & B
139            tmp <- fp1 & fp2
140            c <- length(tmp@bits)
141
142            ## in A not in B
143            tmp <- (fp1 | fp2) & !fp2
144            a <- length(tmp@bits)
145
146            ## in B not in A
147            tmp <- (fp1 | fp2) & !fp1
148            b <- length(tmp@bits)
149
150            ## not in A, not in B
151            tmp <- !(fp1 | fp2)
152            d <- length(tmp@bits)
153
154            dist <- NULL
155
156            ## Simlarity
157            if (method == 'stanimoto') {
158              dist <- c / (a+b+c)
159            } else if (method == 'seuclidean') {
160              dist <- sqrt((d+c) / (a+b+c+d))
161            } else if (method == 'dice') {
162              dist <- c / (.5*a + .5*b + c)
163            } else if (method == 'mt') {
164              t1 <- c/(size-d)
165              t0 <- d/(size-c)
166              phat <- ((size-d) + c)/(2*size)
167              dist <- (2-phat)*t1/3 + (1+phat)*t0/3
168            } else if (method == 'simple') {
169              dist <- (c+d)/n
170            } else if (method == 'jaccard') {
171              dist <- c/(a+b+c)
172            } else if (method == 'russelrao') {
173              dist <- c/size
174            } else if (method == 'rodgerstanimoto') {
175              dist <- (c+d)/(2*a+2*b+c+d)
176            } else if (method == 'cosine' || method == 'achiai' || method == 'carbo') {
177              dist <- c/sqrt((a+c)*(b+c))
178            } else if (method == 'baroniurbanibuser') {
179              dist <- (sqrt(c*d)+c)/(sqrt(c*d)+a+b+c)
180            } else if (method == 'kulczynski2') {
181              dist <- .5*(c/(a+c)+c/(b+c))              
182            }
183            ## Dissimilarity
184            else if (method == 'hamming') {
185              dist <- a+b
186            } else if (method == 'meanHamming') {
187              dist <- (a+b)/(a+b+c+d)
188            }else if (method == 'soergel') {
189              dist <- (a+b)/(a+b+c)
190            } else if (method == 'patternDifference') {
191              dist <- (a*b)/(a+b+c+d)^2
192            } else if (method == 'variance') {
193              dist <- (a+b)/(4*n)
194            } else if (method == 'size') {
195              dist <-  (a-b)^2/n^2
196            } else if (method == 'shape') {
197              dist <- (a+b)/n-((a-b)/(n))^2
198            }
199
200            ## Composite
201            else if (method == 'hamann') {
202              dist <- (c+d-a-b)/(a+b+c+d)
203            } else if (method == 'yule') {
204              dist <-  (c*d-a*b)/(c*d+a*b)
205            } else if (method == 'pearson') {
206              dist <- (c*d-a*b)/sqrt((a+c)*(b+c)*(a+d)*(b+d))
207            } else if (method == 'dispersion') {
208              dist <- (c*d-a*b)/n^2
209            } else if (method == 'mcconaughey') {
210              dist <- (c^2-a*b)/((a+c)*(b+c))
211            } else if (method == 'stiles') {
212              dist <- log10(n*(abs(c*d-a*b)-n/2)^2/((a+c)*(b+c)*(a+d)*(b+d)))
213            }
214
215            ## Asymmetric
216            else if (method == 'simpson') {
217              dist <- c/min((a+c),(b+c))
218            } else if (method == 'petke') {
219              dist <- c/max((a+c),(b+c))
220            }
221            
222            dist
223          })
224
225setGeneric("random.fingerprint",
226           function(nbit, on) standardGeneric("random.fingerprint"))
227setMethod("random.fingerprint", c("numeric", "numeric"),
228          function(nbit, on) {
229            if (nbit <= 0) stop("Bit length must be positive integer")
230            if (on <= 0) stop("Number of bits to be set to 1 must be positive integer")            
231            bits <- sample(1:nbit, size=on)
232            new("fingerprint", nbit=nbit, bits=bits, provider="R", folded=FALSE)
233          })