PageRenderTime 42ms CodeModel.GetById 36ms app.highlight 4ms RepoModel.GetById 0ms app.codeStats 1ms

/fingerprint/R/read.R

http://github.com/rajarshi/cdkr
R | 115 lines | 99 code | 10 blank | 6 comment | 24 complexity | 64fa3fde01189ac33b8ba55b7483a567 MD5 | raw file
  1jchem.binary.lf <- function(line) {
  2  molid <- strsplit(line, "\t")[[1]][1]
  3  bitpos <- .Call("parse_jchem_binary", as.character(line), as.integer(nchar(line)) )
  4  if (is.null(bitpos)) return(NULL)
  5  list(molid, bitpos+1, list()) ## we add 1, since C does bit positions from 0  
  6}
  7
  8fps.lf <- function(line) {
  9  toks <- strsplit(line, "\\s")[[1]];
 10  bitpos <- .Call("parse_hex", as.character(toks[1]), as.integer(nchar(toks[1])))
 11  if (is.null(bitpos)) return(NULL)
 12  if (length(toks) > 2) {
 13    misc <- list(toks[-c(1,2)])
 14  } else { misc <- list() }
 15  list(toks[2], bitpos+1, misc) ## we add 1, since C does bit positions from 0
 16}
 17
 18cdk.lf <- function(line) {
 19  p <- regexpr("{([0-9,\\s]*)}",line,perl=T)
 20  s <- gsub(',','',substr(line, p+1, p+attr(p,"match.length")-2))
 21  s <- lapply( strsplit(s,' '), as.numeric )
 22  molid <- gsub("\\s+","", strsplit(line, "\\{")[[1]][1])
 23  list(molid, s[[1]], list())
 24}
 25
 26moe.lf <- function(line) {
 27  p <- regexpr("\"([0-9\\s]*)\"",line, perl=T)
 28  s <- substr(line, p+1, p+attr(p,"match.length")-2)
 29  s <- lapply( strsplit(s,' '), as.numeric )
 30  list(NA, s[[1]], list())
 31}
 32
 33bci.lf <- function(line) {
 34  tokens <- strsplit(line, '\\s')[[1]]
 35  name <- tokens[1]
 36  tokens <- tokens[-c(1, length(tokens), length(tokens)-1)]
 37  list(name, as.numeric(tokens), list())
 38}
 39
 40ecfp.lf <- function(line) {
 41  tokens <- strsplit(line, '\\s')[[1]]
 42  name <- tokens[1]
 43  tokens <- tokens[-1]
 44  list(name, tokens, list())
 45}
 46
 47## TODO we should be iterating over lines and not reading
 48## them all in
 49fp.read <- function(f='fingerprint.txt', size=1024, lf=cdk.lf, header=FALSE, binary=TRUE) {
 50  lf.name <- deparse(substitute(lf))
 51  
 52  provider <- lf.name
 53  
 54  fplist <- list()
 55  fcon <- file(description=f,open='r')
 56  lines = readLines(fcon,n=-1)
 57  if (header && lf.name != 'fps.lf') lines = lines[-1]
 58  if (lf.name == 'fps.lf') {
 59    binary <- TRUE
 60    size <- NULL
 61    ## process the header block
 62    nheaderline = 0
 63    for (line in lines) {
 64      if (substr(line,1,1) != '#') break
 65      nheaderline <- nheaderline + 1
 66      if (nheaderline == 1 && length(grep("#FPS1", line)) != 1) stop("Invalid FPS format")
 67      if (length(grep("#num_bits", line)) == 1) size <- as.numeric(strsplit(line, '=')[[1]][2])
 68      if (length(grep("#software", line)) == 1) provider <- as.character(strsplit(line, '=')[[1]][2])      
 69    }
 70    lines <- lines[ (nheaderline+1):length(lines) ]
 71    if (is.null(size)) { # num_bit
 72      size <- nchar(strsplit(line, '\\s')[[1]][1]) * 4
 73    }
 74  }
 75  c = 1
 76  for (line in lines) {
 77    dat <- lf(line)
 78    if (is.null(dat)) {
 79      warning(sprintf("Couldn't parse: %s", line))
 80      next
 81    }
 82    if (is.na(dat[[1]])) name <- ""
 83    else name <- dat[[1]]
 84
 85    misc <- dat[[3]] ## usually empty
 86    if (binary) {
 87      fplist[[c]] <- new("fingerprint",
 88                         nbit=size,
 89                         bits=as.numeric(dat[[2]]),
 90                         folded=FALSE,
 91                         provider=provider,
 92                         name=name,
 93                         misc=misc)
 94    } else {
 95      ## convert the features to 'feature' objects
 96      feats <- lapply(dat[[2]], function(x) new("feature", feature=x))
 97      fplist[[c]] <- new("featvec",
 98                         features=feats,
 99                         provider=provider,
100                         name=name,
101                         misc=misc)
102    }
103    c <- c+1
104  }
105  close(fcon)
106  fplist
107}
108
109## Need to supply the length of the bit string since fp.read does
110## not provide that information
111fp.read.to.matrix <- function(f='fingerprint.txt', size=1024, lf=cdk.lf, header=FALSE) {
112  fplist <- fp.read(f, size, lf, header)
113  fpmat <- fp.to.matrix(fplist)
114  fpmat
115}