/rcdk/R/io.R
R | 221 lines | 103 code | 16 blank | 102 comment | 34 complexity | c96b52b7625aa66130195900755d43df MD5 | raw file
.packageName <- "rcdk"

#' Write molecules to disk.
#'
#' This function writes one or more molecules to an SD file on disk,
#' which can be of the single- or multi-molecule variety. In
#' addition, if the molecule has keyed properties, they can also be
#' written out as SD tags.
#'
#' @details In case individual SD files are desired the
#' \code{together} argument can be set to \code{FALSE}. In this case, the
#' value of \code{filename} is used as a prefix, to which a numeric
#' identifier and the suffix of ".sdf" is appended.
#'
#' @param mols A `list` of `jobjRef` objects representing `IAtomContainer` objects
#' @param filename The name of the SD file to write. Note that if
#' `together` is `FALSE` then this argument is taken as a prefix for
#' the name of the individual files
#' @param together If `TRUE` then all the molecules are written to a
#' single SD file. If `FALSE` each molecule is written to an
#' individual file
#' @param write.props If `TRUE`, keyed properties are included in the SD file output
#' @seealso \code{\link{load.molecules}}, \code{\link{parse.smiles}}, \code{\link{iload.molecules}}
#' @export
#' @author Rajarshi Guha (\email{rajarshi.guha@@gmail.com})
write.molecules <- function(mols, filename, together=TRUE, write.props=FALSE) {
  # Both Java helpers are called with the same arguments; only the method
  # name differs between the single-file and one-file-per-molecule modes.
  method <- if (together) 'writeMoleculesInOneFile' else 'writeMolecules'
  value <- .jcall('org/guha/rcdk/util/Misc', 'V', method,
                  .jarray(mols,
                          contents.class = "org/openscience/cdk/interfaces/IAtomContainer"),
                  as.character(filename), as.integer(ifelse(write.props,1,0)))
  invisible(value)
}

#' Load molecular structures from disk or URL
#'
#' The CDK can read a variety of molecular structure formats. This function
#' encapsulates the calls to the CDK API to load a structure given its filename
#' or a URL to a structure file.
#'
#' @details
#' Note that this method will load all molecules into memory. For files containing
#' tens of thousands of molecules this may lead to out of memory errors. In such
#' situations consider using the iterating file readers.
#'
#' Note that if molecules are read in from formats that do not have rules for
#' handling implicit hydrogens (such as MDL MOL), the molecule will not have
#' implicit or explicit hydrogens. To add explicit hydrogens, make sure that the molecule
#' has been typed (this is `TRUE` by default for this function) and then call
#' \code{\link{convert.implicit.to.explicit}}. On the other hand for a format
#' such as SMILES, implicit or explicit hydrogens will be present.
#'
#' @param molfiles A `character` vector of filenames. Note that the full
#' path to the files should be provided. URL's can also be used as
#' paths. In such a case, the URL should start with "http://" or "https://"
#' @param aromaticity If `TRUE` then aromaticity detection is
#' performed on all loaded molecules. If this fails for a given
#' molecule, then the molecule is set to `NA` in the return list
#' @param typing If `TRUE` then atom typing is
#' performed on all loaded molecules. The assigned types will be CDK
#' internal types. If this fails for a given molecule, then the molecule
#' is set to `NA` in the return list
#' @param isotopes If `TRUE` then atoms are configured with isotopic masses
#' @param verbose If `TRUE`, output (such as file download progress) will
#' be bountiful
#' @return A `list` of CDK `IAtomContainer` objects, represented as `jobjRef` objects
#' in R, which can be used in other `rcdk` functions
#' @seealso \code{\link{write.molecules}}, \code{\link{parse.smiles}}, \code{\link{iload.molecules}}
#' @importFrom utils download.file
#' @export
#' @author Rajarshi Guha (\email{rajarshi.guha@@gmail.com})
#' @examples
#' \dontrun{
#' sdffile <- system.file("molfiles/dhfr00008.sdf", package="rcdk")
#' mols <- load.molecules(c('mol1.sdf', 'mol2.smi', sdffile))
#' }
load.molecules <- function(molfiles=NA, aromaticity = TRUE,
                           typing = TRUE, isotopes = TRUE,
                           verbose=FALSE) {
  if (any(is.na(molfiles))) {
    stop("Must supply a vector of file names")
  }
  if (length(molfiles) == 0) {
    stop("Must supply a vector of file names")
  }

  # Anchor the pattern so that a local file which merely contains "http"
  # somewhere in its name is not mistaken for a URL and sent to download.file.
  is.url <- grepl('^https?://', molfiles)

  for (f in molfiles[!is.url]) {
    if (!file.exists(f))
      stop(paste0(f, ": Does not exist"))
  }

  # Downloaded copies are only needed while the Java loader reads them,
  # so remove them when the function exits (normally or via an error).
  tmpfiles <- character(0)
  on.exit(unlink(tmpfiles), add = TRUE)

  ## download the files and replace the URL's with the temp names
  for (idx in which(is.url)) {
    url <- molfiles[idx]
    tmpdest <- tempfile(pattern='xxx')
    tmpfiles <- c(tmpfiles, tmpdest)
    # Only show the error text from a failed download when verbose output
    # was requested.
    status <- try(download.file(url, destfile=tmpdest,
                                method='curl',
                                mode='wb', quiet=!verbose),
                  silent=!verbose)
    # download.file signals some failures only through a non-zero return
    # code, so check that as well as the try-error case.
    failed <- inherits(status, 'try-error') || !isTRUE(status == 0)
    if (failed) {
      molfiles[idx] <- NA
      cat("Can't get ", url, '\n')
    } else {
      molfiles[idx] <- tmpdest
    }
  }

  molfiles <- molfiles[ !is.na(molfiles) ]
  farr <- .jarray(molfiles, contents.class = 'S')
  # check=FALSE so that a Java exception can be fetched and re-raised as an
  # R error carrying the exception's own message.
  molecules <- .jcall('org/guha/rcdk/util/Misc', '[Lorg/openscience/cdk/interfaces/IAtomContainer;',
                      'loadMolecules', farr, aromaticity, typing, isotopes,
                      check=FALSE)
  exception <- .jgetEx(clear = TRUE)
  if (!is.null(exception)) {
    stop(exception$toString())
  }

  if (is.jnull(molecules)) {
    return(NA)
  }
  if (length(molecules) == 0) {
    return(molecules)
  }

  # Replace Java null references with NA in the returned list.
  nulls <- which(vapply(molecules, is.jnull, logical(1)))
  if (length(nulls) > 0) molecules[nulls] <- NA
  molecules
}

#' @importFrom itertools hasNext
hasNext <- function(obj, ...) { UseMethod("hasNext") }
#' @export
hasNext.iload.molecules <- function(obj, ...) obj$hasNext()

#' Load molecules using an iterator.
#'
#' The CDK can read a variety of molecular structure formats. Some file
#' formats support multiple molecules in a single file. If read using
#' \code{\link{load.molecules}}, all are read into memory. For very large
#' structure files, this can lead to out of memory errors. Instead it is
#' recommended to use the iterating version of the loader so that only a
#' single molecule is read at a time.
#'
#' Note that the iterating loader only supports SDF and SMILES file formats.
#'
#' @param molfile A string containing the filename to load. Must be a local file
#' @param type Indicates whether the input file is SMILES or SDF. Valid values are
#' `"smi"` or `"sdf"`
#' @param skip If `TRUE`, then the reader will continue reading even when
#' faced with an invalid molecule. If `FALSE`, the reader will stop at
#' the first invalid molecule
#' @param aromaticity If `TRUE` then aromaticity detection is
#' performed on all loaded molecules. If this fails for a given
#' molecule, then the molecule is set to `NA` in the return list
#' @param typing If `TRUE` then atom typing is
#' performed on all loaded molecules. The assigned types will be CDK
#' internal types. If this fails for a given molecule, then the molecule
#' is set to `NA` in the return list
#' @param isotopes If `TRUE` then atoms are configured with isotopic masses
#' @seealso \code{\link{write.molecules}}, \code{\link{load.molecules}}, \code{\link{parse.smiles}}
#' @export
#' @author Rajarshi Guha (\email{rajarshi.guha@@gmail.com})
#' @examples
#' \dontrun{
#' moliter <- iload.molecules("big.sdf", type="sdf")
#' while(hasNext(moliter)) {
#' mol <- nextElem(moliter)
#' print(get.property(mol, "cdk:Title"))
#' }
#' }
iload.molecules<- function(molfile, type = 'smi',
                           aromaticity = TRUE, typing = TRUE, isotopes = TRUE,
                           skip=TRUE) {

  # The file is opened with java.io.FileReader, so only local paths work;
  # reject anything that is not an existing file before touching Java.
  if (!file.exists(molfile))
    stop(paste0(molfile, ": Does not exist"))

  # Fail early on an unsupported type; otherwise no reader would be created
  # and the iterator would fail later with an unrelated error.
  if (length(type) != 1 || !(type %in% c('smi', 'sdf')))
    stop("Invalid type: must be 'smi' or 'sdf'")

  fr <- .jnew("java/io/FileReader", as.character(molfile))
  dcob <- get.chem.object.builder()
  if (type == 'smi') {
    sreader <- .jnew("org/openscience/cdk/io/iterator/IteratingSMILESReader",.jcast(fr, "java/io/Reader"), dcob)
  } else {
    sreader <- .jnew("org/openscience/cdk/io/iterator/IteratingSDFReader",.jcast(fr, "java/io/Reader"), dcob)
    .jcall(sreader, "V", "setSkip", skip)
  }

  # Iterator state shared by the two closures below.
  hasNext <- NA
  mol <- NA
  molr <- NA

  # Asks the Java reader whether another molecule is available and closes
  # the reader once it reports that there are none left.
  hasNx <- function() {
    hasNext <<- .jcall(sreader, "Z", "hasNext")
    if (!hasNext) {
      .jcall(sreader, "V", "close")
      mol <<- NA
    }
    return(hasNext)
  }

  # Fetches the next molecule and applies the requested post-processing.
  nextEl <- function() {
    mol <<- .jcall(sreader, "Ljava/lang/Object;", "next")
    mol <<- .jcast(mol, "org/openscience/cdk/interfaces/IAtomContainer")
    if (aromaticity) do.aromaticity(mol)
    if (typing) do.typing(mol)
    if (isotopes) do.isotopes(mol)

    hasNext <<- NA
    return(mol)
  }

  obj <- list(nextElem = nextEl, hasNext = hasNx)
  class(obj) <- c("iload.molecules", "abstractiter", "iter")
  obj
}