PageRenderTime 190ms CodeModel.GetById 180ms app.highlight 7ms RepoModel.GetById 1ms app.codeStats 0ms

/rcdk/R/io.R

http://github.com/rajarshi/cdkr
R | 221 lines | 103 code | 16 blank | 102 comment | 34 complexity | c96b52b7625aa66130195900755d43df MD5 | raw file
  1.packageName <- "rcdk"
  2
#' Write molecules to disk.
#'
#' This function writes one or more molecules to an SD file on disk,
#' which can be of the single- or multi-molecule variety. In
#' addition, if the molecule has keyed properties, they can also be
#' written out as SD tags.
#'
#' @details In case individual SD files are desired the
#' \code{together} argument can be set to \code{FALSE}. In this case, the
#' value of \code{filename} is used as a prefix, to which a numeric
#' identifier and the suffix ".sdf" are appended.
#'
#' @param mols A `list` of `jobjRef` objects representing `IAtomContainer` objects
#' @param filename The name of the SD file to write. Note that if
#' `together` is `FALSE` then this argument is taken as a prefix for
#' the name of the individual files
#' @param together If `TRUE` then all the molecules are written to a
#' single SD file. If `FALSE` each molecule is written to an
#' individual file
#' @param write.props If `TRUE`, keyed properties are included in the SD file output
#' @seealso \code{\link{load.molecules}}, \code{\link{parse.smiles}}, \code{\link{iload.molecules}}
#' @export
#' @author Rajarshi Guha (\email{rajarshi.guha@@gmail.com})
 write.molecules <- function(mols, filename, together=TRUE, write.props=FALSE) {
   # Both Java helpers are called with the same arguments (molecule array,
   # file name, int flag); only the method name depends on `together`.
   writer <- if (together) "writeMoleculesInOneFile" else "writeMolecules"

   mol.array <- .jarray(
     mols,
     contents.class = "org/openscience/cdk/interfaces/IAtomContainer"
   )

   # The flag is handed to Java as an int: TRUE becomes 1, FALSE becomes 0.
   props.flag <- as.integer(as.logical(write.props))

   .jcall("org/guha/rcdk/util/Misc", "V", writer,
          mol.array, as.character(filename), props.flag)

   # Called purely for its effect on disk; nothing useful to return.
   invisible(NULL)
 }
 39
#' Load molecular structures from disk or URL
#'
#' The CDK can read a variety of molecular structure formats. This function
#' encapsulates the calls to the CDK API to load a structure given its filename
#' or a URL to a structure file.
#'
#' @details
#' Note that this method will load all molecules into memory. For files containing
#' tens of thousands of molecules this may lead to out of memory errors. In such
#' situations consider using the iterating file readers.
#'
#' Note that if molecules are read in from formats that do not have rules for
#' handling implicit hydrogens (such as MDL MOL), the molecule will not have
#' implicit or explicit hydrogens. To add explicit hydrogens, make sure that the molecule
#' has been typed (this is `TRUE` by default for this function) and then call
#' \code{\link{convert.implicit.to.explicit}}. On the other hand for a format
#' such as SMILES, implicit or explicit hydrogens will be present.
#'
#' @param molfiles A `character` vector of filenames. Note that the full
#' path to the files should be provided. URLs can also be used as
#' paths. In such a case, the URL should start with "http://" or "https://"
#' @param aromaticity If `TRUE` then aromaticity detection is
#' performed on all loaded molecules. If this fails for a given
#' molecule, then the molecule is set to `NA` in the return list
#' @param typing If `TRUE` then atom typing is
#' performed on all loaded molecules. The assigned types will be CDK
#' internal types. If this fails for a given molecule, then the molecule
#' is set to `NA` in the return list
#' @param isotopes If `TRUE` then atoms are configured with isotopic masses
#' @param verbose If `TRUE`, output (such as file download progress) will
#' be bountiful
#' @return A `list` of CDK `IAtomContainer` objects, represented as `jobjRef` objects
#' in R, which can be used in other `rcdk` functions
#' @seealso \code{\link{write.molecules}}, \code{\link{parse.smiles}}, \code{\link{iload.molecules}}
#' @importFrom utils download.file
#' @export
#' @author Rajarshi Guha (\email{rajarshi.guha@@gmail.com})
#' @examples
#' \dontrun{
#' sdffile <- system.file("molfiles/dhfr00008.sdf", package="rcdk")
#' mols <- load.molecules(c('mol1.sdf', 'mol2.smi', sdffile))
#' }
 load.molecules <- function(molfiles=NA, aromaticity = TRUE,
                            typing = TRUE, isotopes = TRUE,
                            verbose=FALSE) {
   if (length(molfiles) == 0 || anyNA(molfiles)) {
     stop("Must supply a vector of file names")
   }

   # Only entries that *start* with an http(s) scheme are remote. A local path
   # that merely contains "http" must be treated as a file, not downloaded.
   is.url <- grepl("^https?://", molfiles)
   not.found <- molfiles[!is.url & !file.exists(molfiles)]
   if (length(not.found) > 0) {
     stop(paste0(not.found[1], ": Does not exist"))
   }

   # Download remote entries and substitute the temp file names for the URLs.
   url.idx <- which(is.url)
   tmpfiles <- character(length(url.idx))
   # The result is returned as in-memory objects, so the downloaded copies are
   # not needed once this function exits.
   on.exit(unlink(tmpfiles[nzchar(tmpfiles)]), add = TRUE)
   for (k in seq_along(url.idx)) {
     idx <- url.idx[k]
     url <- molfiles[idx]
     tmpdest <- tempfile(pattern = "xxx")
     tmpfiles[k] <- tmpdest
     # Show the underlying error text only when the caller asked for verbosity.
     status <- try(download.file(url, destfile = tmpdest,
                                 method = "curl",
                                 mode = "wb", quiet = !verbose),
                   silent = !verbose)
     # download.file() reports failure either by an error or by a non-zero
     # status code; both mean there is nothing usable at tmpdest.
     if (inherits(status, "try-error") || !identical(as.integer(status), 0L)) {
       molfiles[idx] <- NA
       cat("Can't get ", url, '\n')
     } else {
       molfiles[idx] <- tmpdest
     }
   }

   # Entries whose download failed were marked NA above; drop them.
   molfiles <- molfiles[!is.na(molfiles)]
   farr <- .jarray(molfiles, contents.class = 'S')
   # check=FALSE so that a Java exception can be picked up and re-raised as an
   # R error carrying the exception's own text.
   molecules <- .jcall('org/guha/rcdk/util/Misc',
                       '[Lorg/openscience/cdk/interfaces/IAtomContainer;',
                       'loadMolecules', farr, aromaticity, typing, isotopes,
                       check = FALSE)
   exception <- .jgetEx(clear = TRUE)
   if (!is.null(exception)) {
     stop(exception$toString())
   }

   if (is.jnull(molecules)) {
     return(NA)
   }
   if (length(molecules) == 0) {
     return(molecules)
   }

   # Java nulls are replaced by NA so callers can test entries with is.na().
   nulls <- which(vapply(molecules, is.jnull, logical(1)))
   if (length(nulls) > 0) {
     molecules[nulls] <- NA
   }
   molecules
 }
136
#' @importFrom itertools hasNext
hasNext <- function(obj, ...) {
  # S3 generic: dispatch on the class of `obj`.
  UseMethod("hasNext")
}
#' @export
hasNext.iload.molecules <- function(obj, ...) {
  # The iterator is a list of closures; call the one stored as `hasNext`.
  has.more <- obj$hasNext
  has.more()
}
141
#' Load molecules using an iterator.
#'
#' The CDK can read a variety of molecular structure formats. Some file
#' formats support multiple molecules in a single file. If read using
#' \code{\link{load.molecules}}, all are read into memory. For very large
#' structure files, this can lead to out of memory errors. Instead it is
#' recommended to use the iterating version of the loader so that only a
#' single molecule is read at a time.
#'
#' Note that the iterating loader only supports SDF and SMILES file formats.
#'
#' @param molfile A string containing the filename to load. Must be a local file
#' @param type Indicates whether the input file is SMILES or SDF. Valid values are
#' `"smi"` or `"sdf"`
#' @param skip If `TRUE`, then the reader will continue reading even when
#' faced with an invalid molecule. If `FALSE`, the reader will stop at
#' the first invalid molecule
#' @param aromaticity If `TRUE` then aromaticity detection is
#' performed on all loaded molecules. If this fails for a given
#' molecule, then the molecule is set to `NA` in the return list
#' @param typing If `TRUE` then atom typing is
#' performed on all loaded molecules. The assigned types will be CDK
#' internal types. If this fails for a given molecule, then the molecule
#' is set to `NA` in the return list
#' @param isotopes If `TRUE` then atoms are configured with isotopic masses
#' @seealso \code{\link{write.molecules}}, \code{\link{load.molecules}}, \code{\link{parse.smiles}}
#' @export
#' @author Rajarshi Guha (\email{rajarshi.guha@@gmail.com})
#' @examples
#' \dontrun{
#' moliter <- iload.molecules("big.sdf", type="sdf")
#' while(hasNext(moliter)) {
#'   mol <- nextElem(moliter)
#'   print(get.property(mol, "cdk:Title"))
#' }
#' }
iload.molecules<- function(molfile, type = 'smi',
                           aromaticity = TRUE, typing = TRUE, isotopes = TRUE,
                           skip=TRUE) {

  # The file is opened with java.io.FileReader, which only reads local files,
  # so anything that is not an existing path (URLs included) is rejected here.
  if (!file.exists(molfile)) {
    stop(paste0(molfile, ": Does not exist"))
  }

  # Fail early on an unsupported type; otherwise no reader would be created
  # and the iterator would only break on its first use.
  type.ok <- is.character(type) && length(type) == 1 &&
    type %in% c("smi", "sdf")
  if (!type.ok) {
    stop("'type' must be either \"smi\" or \"sdf\"")
  }

  fr <- .jnew("java/io/FileReader", as.character(molfile))
  dcob <- get.chem.object.builder()
  reader.class <- switch(
    type,
    smi = "org/openscience/cdk/io/iterator/IteratingSMILESReader",
    sdf = "org/openscience/cdk/io/iterator/IteratingSDFReader"
  )
  sreader <- .jnew(reader.class, .jcast(fr, "java/io/Reader"), dcob)
  if (type == "sdf") {
    # `skip` is only forwarded to the SDF reader.
    .jcall(sreader, "V", "setSkip", skip)
  }

  # Set once the underlying reader has been closed, so it is never touched
  # again afterwards.
  closed <- FALSE

  hasNx <- function() {
    if (closed) {
      return(FALSE)
    }
    more <- .jcall(sreader, "Z", "hasNext")
    if (!more) {
      # Input exhausted: release the reader.
      .jcall(sreader, "V", "close")
      closed <<- TRUE
    }
    more
  }

  nextEl <- function() {
    if (closed) {
      # Same signal the iterators package uses for an exhausted iterator.
      stop("StopIteration", call. = FALSE)
    }
    mol <- .jcall(sreader, "Ljava/lang/Object;", "next")
    mol <- .jcast(mol, "org/openscience/cdk/interfaces/IAtomContainer")
    # Optional per-molecule configuration, applied to the Java object.
    if (aromaticity) do.aromaticity(mol)
    if (typing) do.typing(mol)
    if (isotopes) do.isotopes(mol)
    mol
  }

  obj <- list(nextElem = nextEl, hasNext = hasNx)
  class(obj) <- c("iload.molecules", "abstractiter", "iter")
  obj
}
221