/fingerprint/R/misc.R

http://github.com/rajarshi/cdkr · R · 233 lines · 192 code · 32 blank · 9 comment · 109 complexity · b635834ec05b7b42fb0634b8045201d2 MD5 · raw file

  1. setGeneric("fold", function(fp) standardGeneric("fold"))
  2. setMethod("fold", "fingerprint",
  3. function(fp) {
  4. size <- fp@nbit
  5. if (size %% 2 != 0) {
  6. stop('Need to supply a fingerprint of even numbered length')
  7. }
  8. bfp <- rep(FALSE, size)
  9. bfp[fp@bits] <- TRUE
  10. subfplen <- size/2
  11. b1 <- which(bfp[1:subfplen])
  12. b2 <- which(bfp[(subfplen+1):size])
  13. subfp1 <- new("fingerprint",
  14. nbit=subfplen,
  15. bits=b1,
  16. provider="R");
  17. subfp2 <- new("fingerprint",
  18. nbit=subfplen,
  19. bits=b2,
  20. provider="R")
  21. foldedfp <- subfp1 | subfp2
  22. foldedfp@folded <- TRUE
  23. return(foldedfp)
  24. })
  25. setGeneric("euc.vector", function(fp) standardGeneric("euc.vector"))
  26. setMethod("euc.vector", "fingerprint",
  27. function(fp) {
  28. coord <- rep(0,length(fp))
  29. coord[fp@bits] <- 1.0 / sqrt(length(fp))
  30. coord
  31. })
  32. setGeneric("distance", function(fp1,fp2,method,a,b) standardGeneric("distance"))
  33. setMethod("distance", c("featvec", "featvec", "missing", "missing", "missing"),
  34. function(fp1, fp2) {
  35. distance(fp1, fp2, "tanimoto" )
  36. })
  37. setMethod("distance", c("featvec", "featvec", "character", "missing", "missing"),
  38. function(fp1, fp2, method=c("tanimoto", "dice", "robust")) {
  39. method <- match.arg(method)
  40. n1 <- length(fp1)
  41. n2 <- length(fp2)
  42. ## extract the feature strings, ignoring counts for now
  43. f1 <- sapply(fp1@features, function(x) x@feature)
  44. f2 <- sapply(fp2@features, function(x) x@feature)
  45. n12 <- length(intersect(f1,f2))
  46. if (method == 'tanimoto') {
  47. return(n12/(n1+n2-n12))
  48. } else if (method == "robust") {
  49. return(0.5 + 0.5 * n12 * n12 / (n1*n2))
  50. } else if (method == "dice") {
  51. return(2.0 * n12 / (n1+n2))
  52. }
  53. })
  54. setMethod("distance", c("fingerprint", "fingerprint", "missing", "missing", "missing"),
  55. function(fp1,fp2) {
  56. distance(fp1,fp2,"tanimoto")
  57. })
  58. setMethod("distance", c("fingerprint", "fingerprint", "character", "numeric", "numeric"),
  59. function(fp1, fp2, method="tversky", a, b) {
  60. if (!is.null(method) && !is.na(method) && method != "tversky") distance(fp1, fp2, method)
  61. if ( length(fp1) != length(fp2))
  62. stop("Fingerprints must of the same bit length")
  63. if (a < 0 || b < 0) stop("a and b must be positive")
  64. tmp <- fp1 & fp2
  65. xiy <- length(tmp@bits)
  66. tmp <- fp1 | fp2
  67. xuy <- length(tmp@bits)
  68. x <- length(fp1@bits)
  69. y <- length(fp2@bits)
  70. return( xiy / (a*x + b*y + (1-a-b)*xiy ) )
  71. })
  72. setMethod("distance", c("fingerprint", "fingerprint", "character", "missing", "missing"),
  73. function(fp1,fp2, method=c('tanimoto', 'euclidean', 'mt',
  74. 'simple', 'jaccard', 'dice',
  75. 'russelrao', 'rodgerstanimoto','cosine',
  76. 'achiai', 'carbo', 'baroniurbanibuser',
  77. 'kulczynski2',
  78. 'hamming', 'meanHamming', 'soergel',
  79. 'patternDifference', 'variance', 'size', 'shape',
  80. 'hamann', 'yule', 'pearson', 'dispersion',
  81. 'mcconnaughey', 'stiles',
  82. 'simpson', 'petke',
  83. 'stanimoto', 'seuclidean'
  84. )) {
  85. if (method == 'tversky')
  86. stop("If Tversky metric is desired, must specify a and b")
  87. if ( length(fp1) != length(fp2))
  88. stop("Fingerprints must of the same bit length")
  89. method <- match.arg(method)
  90. n <- length(fp1)
  91. if (method == 'tanimoto') {
  92. f1 <- numeric(n)
  93. f2 <- numeric(n)
  94. f1[fp1@bits] <- 1
  95. f2[fp2@bits] <- 1
  96. sim <- 0.0
  97. ret <- .C("fpdistance", as.double(f1), as.double(f2),
  98. as.integer(n), as.integer(1),
  99. as.double(sim),
  100. PACKAGE="fingerprint")
  101. return (ret[[5]])
  102. } else if (method == 'euclidean') {
  103. f1 <- numeric(n)
  104. f2 <- numeric(n)
  105. f1[fp1@bits] <- 1
  106. f2[fp2@bits] <- 1
  107. sim <- 0.0
  108. ret <- .C("fpdistance", as.double(f1), as.double(f1),
  109. as.integer(n), as.integer(2),
  110. as.double(sim),
  111. PACKAGE="fingerprint")
  112. return (ret[[5]])
  113. }
  114. size <- n
  115. ## in A & B
  116. tmp <- fp1 & fp2
  117. c <- length(tmp@bits)
  118. ## in A not in B
  119. tmp <- (fp1 | fp2) & !fp2
  120. a <- length(tmp@bits)
  121. ## in B not in A
  122. tmp <- (fp1 | fp2) & !fp1
  123. b <- length(tmp@bits)
  124. ## not in A, not in B
  125. tmp <- !(fp1 | fp2)
  126. d <- length(tmp@bits)
  127. dist <- NULL
  128. ## Simlarity
  129. if (method == 'stanimoto') {
  130. dist <- c / (a+b+c)
  131. } else if (method == 'seuclidean') {
  132. dist <- sqrt((d+c) / (a+b+c+d))
  133. } else if (method == 'dice') {
  134. dist <- c / (.5*a + .5*b + c)
  135. } else if (method == 'mt') {
  136. t1 <- c/(size-d)
  137. t0 <- d/(size-c)
  138. phat <- ((size-d) + c)/(2*size)
  139. dist <- (2-phat)*t1/3 + (1+phat)*t0/3
  140. } else if (method == 'simple') {
  141. dist <- (c+d)/n
  142. } else if (method == 'jaccard') {
  143. dist <- c/(a+b+c)
  144. } else if (method == 'russelrao') {
  145. dist <- c/size
  146. } else if (method == 'rodgerstanimoto') {
  147. dist <- (c+d)/(2*a+2*b+c+d)
  148. } else if (method == 'cosine' || method == 'achiai' || method == 'carbo') {
  149. dist <- c/sqrt((a+c)*(b+c))
  150. } else if (method == 'baroniurbanibuser') {
  151. dist <- (sqrt(c*d)+c)/(sqrt(c*d)+a+b+c)
  152. } else if (method == 'kulczynski2') {
  153. dist <- .5*(c/(a+c)+c/(b+c))
  154. }
  155. ## Dissimilarity
  156. else if (method == 'hamming') {
  157. dist <- a+b
  158. } else if (method == 'meanHamming') {
  159. dist <- (a+b)/(a+b+c+d)
  160. }else if (method == 'soergel') {
  161. dist <- (a+b)/(a+b+c)
  162. } else if (method == 'patternDifference') {
  163. dist <- (a*b)/(a+b+c+d)^2
  164. } else if (method == 'variance') {
  165. dist <- (a+b)/(4*n)
  166. } else if (method == 'size') {
  167. dist <- (a-b)^2/n^2
  168. } else if (method == 'shape') {
  169. dist <- (a+b)/n-((a-b)/(n))^2
  170. }
  171. ## Composite
  172. else if (method == 'hamann') {
  173. dist <- (c+d-a-b)/(a+b+c+d)
  174. } else if (method == 'yule') {
  175. dist <- (c*d-a*b)/(c*d+a*b)
  176. } else if (method == 'pearson') {
  177. dist <- (c*d-a*b)/sqrt((a+c)*(b+c)*(a+d)*(b+d))
  178. } else if (method == 'dispersion') {
  179. dist <- (c*d-a*b)/n^2
  180. } else if (method == 'mcconaughey') {
  181. dist <- (c^2-a*b)/((a+c)*(b+c))
  182. } else if (method == 'stiles') {
  183. dist <- log10(n*(abs(c*d-a*b)-n/2)^2/((a+c)*(b+c)*(a+d)*(b+d)))
  184. }
  185. ## Asymmetric
  186. else if (method == 'simpson') {
  187. dist <- c/min((a+c),(b+c))
  188. } else if (method == 'petke') {
  189. dist <- c/max((a+c),(b+c))
  190. }
  191. dist
  192. })
  193. setGeneric("random.fingerprint",
  194. function(nbit, on) standardGeneric("random.fingerprint"))
  195. setMethod("random.fingerprint", c("numeric", "numeric"),
  196. function(nbit, on) {
  197. if (nbit <= 0) stop("Bit length must be positive integer")
  198. if (on <= 0) stop("Number of bits to be set to 1 must be positive integer")
  199. bits <- sample(1:nbit, size=on)
  200. new("fingerprint", nbit=nbit, bits=bits, provider="R", folded=FALSE)
  201. })