PageRenderTime 66ms CodeModel.GetById 1ms RepoModel.GetById 0ms app.codeStats 0ms

/rcdk/R/io.R

http://github.com/rajarshi/cdkr
R | 221 lines | 103 code | 16 blank | 102 comment | 34 complexity | c96b52b7625aa66130195900755d43df MD5 | raw file
  # Stores the package name ("rcdk") in the hidden variable `.packageName`.
  # NOTE(review): presumably a legacy marker left over from how the package
  # sources were assembled -- confirm nothing reads it before removing.
  1. .packageName <- "rcdk"
  2. #' Write molecules to disk.
  3. #'
  4. #' This function writes one or more molecules to an SD file on disk,
  5. #' which can be of the single- or multi-molecule variety. In
  6. #' addition, if the molecule has keyed properties, they can also be
  7. #' written out as SD tags.
  8. #'
  9. #' @details In case individual SD files are desired the
  10. #' \code{together} argument can be set to \code{FALSE}. In this case, the
  11. #' value of \code{filename} is used as a prefix, to which a numeric
  12. #' identifier and the suffix of ".sdf" is appended.
  13. #'
  14. #' @param mols A `list` of `jobjRef` objects representing `IAtomContainer` objects
  15. #' @param filename The name of the SD file to write. Note that if
  16. #' `together` is `FALSE` then this argument is taken as a prefix for
  17. #' the name of the individual files
  18. #' @param together If `TRUE` then all the molecules are written to a
  19. #' single SD file. If `FALSE` each molecule is written to an
  20. #' individual file
  21. #' @param write.props If `TRUE`, keyed properties are included in the SD file output
  22. #' @seealso \code{\link{load.molecules}}, \code{\link{parse.smiles}}, \code{\link{iload.molecules}}
  23. #' @export
  24. #' @author Rajarshi Guha (\email{rajarshi.guha@@gmail.com})
  write.molecules <- function(mols, filename, together=TRUE, write.props=FALSE) {
    # Purpose: hand the molecules to the Java helper class
    # org.guha.rcdk.util.Misc, which performs the actual SD file output.
    #
    # mols        list of Java references, converted to an IAtomContainer[]
    # filename    output file name (or file name prefix when together = FALSE)
    # together    selects the single-file or the one-file-per-molecule writer
    # write.props forwarded to Java as an int flag (1 = on, 0 = off)
    #
    # Returns NULL invisibly; the Java methods are void.

    # Both writers are called with identical arguments, so only the method
    # name depends on `together`.
    java_method <- if (together) "writeMoleculesInOneFile" else "writeMolecules"

    # Scalar flag: a plain if/else is the right tool here (ifelse() is meant
    # for vectors).
    props_flag <- if (write.props) 1L else 0L

    mol_array <- .jarray(
      mols,
      contents.class = "org/openscience/cdk/interfaces/IAtomContainer"
    )

    .jcall('org/guha/rcdk/util/Misc', 'V', java_method,
           mol_array, as.character(filename), props_flag)

    invisible(NULL)
  }
  38. #' Load molecular structures from disk or URL
  39. #'
  40. #' The CDK can read a variety of molecular structure formats. This function
  41. #' encapsulates the calls to the CDK API to load a structure given its filename
  42. #' or a URL to a structure file.
  43. #'
  44. #' @details
  45. #' Note that this method will load all molecules into memory. For files containing
  46. #' tens of thousands of molecules this may lead to out of memory errors. In such
  47. #' situations consider using the iterating file readers.
  48. #'
  49. #' Note that if molecules are read in from formats that do not have rules for
  50. #' handling implicit hydrogens (such as MDL MOL), the molecule will not have
  51. #' implicit or explicit hydrogens. To add explicit hydrogens, make sure that the molecule
  52. #' has been typed (this is `TRUE` by default for this function) and then call
  53. #' \code{\link{convert.implicit.to.explicit}}. On the other hand for a format
  54. #' such as SMILES, implicit or explicit hydrogens will be present.
  55. #' @param molfiles A `character` vector of filenames. Note that the full
  56. #' path to the files should be provided. URLs can also be used as
  57. #' paths. In such a case, the URL should start with "http://" or "https://"
  58. #' @param aromaticity If `TRUE` then aromaticity detection is
  59. #' performed on all loaded molecules. If this fails for a given
  60. #' molecule, then the molecule is set to `NA` in the return list
  61. #' @param typing If `TRUE` then atom typing is
  62. #' performed on all loaded molecules. The assigned types will be CDK
  63. #' internal types. If this fails for a given molecule, then the molecule
  64. #' is set to `NA` in the return list
  65. #' @param isotopes If `TRUE` then atoms are configured with isotopic masses
  66. #' @param verbose If `TRUE`, output (such as file download progress) will
  67. #' be bountiful
  68. #' @return A `list` of CDK `IAtomContainer` objects, represented as `jobjRef` objects
  69. #' in R, which can be used in other `rcdk` functions
  70. #' @seealso \code{\link{write.molecules}}, \code{\link{parse.smiles}}, \code{\link{iload.molecules}}
  71. #' @importFrom utils download.file
  72. #' @export
  73. #' @author Rajarshi Guha (\email{rajarshi.guha@@gmail.com})
  74. #' @examples
  75. #' \dontrun{
  76. #' sdffile <- system.file("molfiles/dhfr00008.sdf", package="rcdk")
  77. #' mols <- load.molecules(c('mol1.sdf', 'mol2.smi', sdffile))
  78. #' }
  load.molecules <- function(molfiles=NA, aromaticity = TRUE,
                             typing = TRUE, isotopes = TRUE,
                             verbose=FALSE) {
    # Purpose: validate the requested paths, fetch any URLs into temporary
    # files, and pass the resulting local paths to the Java loader
    # (org.guha.rcdk.util.Misc.loadMolecules).
    #
    # Returns the list produced by the Java call with Java nulls replaced by
    # NA, or a single NA when the Java call itself returns null.
    # Stops when no file names are given, when a local file is missing, or
    # when the Java side raised an exception.

    if (length(molfiles) == 0 || any(is.na(molfiles))) {
      stop("Must supply a vector of file names")
    }

    # Only an entry that *starts* with a web scheme is a URL. An unanchored
    # match would also treat local paths that merely contain "http" as URLs.
    is_url <- grepl('^https?://', molfiles)

    missing_files <- molfiles[!is_url & !file.exists(molfiles)]
    if (length(missing_files) > 0) {
      stop(paste0(missing_files[1], ": Does not exist"))
    }

    # Temporary download targets, indexed like `molfiles`. They are removed
    # when the function exits; by then the Java loader has returned.
    tmp_paths <- rep(NA_character_, length(molfiles))
    on.exit(unlink(tmp_paths[!is.na(tmp_paths)]), add = TRUE)

    ## download the files and replace the URL's with the temp names
    for (idx in which(is_url)) {
      url <- molfiles[idx]
      tmpdest <- tempfile(pattern='xxx')
      tmp_paths[idx] <- tmpdest
      # Error chatter follows `verbose`, exactly like download progress.
      status <- try(download.file(url, destfile=tmpdest,
                                  method='curl',
                                  mode='wb', quiet=!verbose),
                    silent=!verbose)
      # A failed download can surface either as an R error or as a
      # non-zero return code from download.file().
      fetched <- !inherits(status, 'try-error') &&
        identical(as.integer(status), 0L)
      if (fetched) {
        molfiles[idx] <- tmpdest
      } else {
        molfiles[idx] <- NA
        cat("Can't get ", url, '\n')
      }
    }

    # Entries whose download failed are dropped rather than aborting the load.
    molfiles <- molfiles[ !is.na(molfiles) ]

    farr <- .jarray(molfiles, contents.class = 'S')
    # check=FALSE: the Java exception is fetched and re-raised manually below.
    molecules <- .jcall('org/guha/rcdk/util/Misc',
                        '[Lorg/openscience/cdk/interfaces/IAtomContainer;',
                        'loadMolecules', farr, aromaticity, typing, isotopes,
                        check=FALSE)
    exception <- .jgetEx(clear = TRUE)
    if (!is.null(exception)) {
      stop(exception$toString())
    }

    if (is.jnull(molecules)) {
      return(NA)
    }

    if (length(molecules) > 0) {
      nulls <- which(vapply(molecules, is.jnull, logical(1)))
      if (length(nulls) > 0) molecules[nulls] <- NA
    }
    molecules
  }
  129. #' @importFrom itertools hasNext
  hasNext <- function(obj, ...) {
    # S3 generic: dispatch on the class of `obj`.
    UseMethod("hasNext")
  }
  131. #' @export
  hasNext.iload.molecules <- function(obj, ...) {
    # The iterator object carries its own `hasNext` closure; call it.
    more_available <- obj$hasNext
    more_available()
  }
  133. #' Load molecules using an iterator.
  134. #'
  135. #' The CDK can read a variety of molecular structure formats. Some file
  136. #' formats support multiple molecules in a single file. If read using
  137. #' \code{\link{load.molecules}}, all are read into memory. For very large
  138. #' structure files, this can lead to out of memory errors. Instead it is
  139. #' recommended to use the iterating version of the loader so that only a
  140. #' single molecule is read at a time.
  141. #'
  142. #' Note that the iterating loader only supports SDF and SMILES file formats.
  143. #'
  144. #' @param molfile A string containing the filename to load. Must be a local file
  145. #' @param type Indicates whether the input file is SMILES or SDF. Valid values are
  146. #' `"smi"` or `"sdf"`
  147. #' @param skip If `TRUE`, then the reader will continue reading even when
  148. #' faced with an invalid molecule. If `FALSE`, the reader will stop at
  149. #' the first invalid molecule
  150. #' @param aromaticity If `TRUE` then aromaticity detection is
  151. #' performed on all loaded molecules. If this fails for a given
  152. #' molecule, then the molecule is set to `NA` in the return list
  153. #' @param typing If `TRUE` then atom typing is
  154. #' performed on all loaded molecules. The assigned types will be CDK
  155. #' internal types. If this fails for a given molecule, then the molecule
  156. #' is set to `NA` in the return list
  157. #' @param isotopes If `TRUE` then atoms are configured with isotopic masses
  158. #' @seealso \code{\link{write.molecules}}, \code{\link{load.molecules}}, \code{\link{parse.smiles}}
  159. #' @export
  160. #' @author Rajarshi Guha (\email{rajarshi.guha@@gmail.com})
  161. #' @examples
  162. #' \dontrun{
  163. #' moliter <- iload.molecules("big.sdf", type="sdf")
  164. #' while(hasNext(moliter)) {
  165. #' mol <- nextElem(moliter)
  166. #' print(get.property(mol, "cdk:Title"))
  167. #' }
  168. #' }
  iload.molecules<- function(molfile, type = 'smi',
                             aromaticity = TRUE, typing = TRUE, isotopes = TRUE,
                             skip=TRUE) {
    # Purpose: open `molfile` with a CDK iterating reader and return an
    # iterator object (classes "iload.molecules", "abstractiter", "iter")
    # exposing two closures:
    #   hasNext()  - TRUE while the reader has another molecule; closes the
    #                reader once it reports FALSE
    #   nextElem() - the next molecule as an IAtomContainer reference, after
    #                the requested aromaticity / typing / isotope steps
    #
    # Stops when `type` is not 'smi' or 'sdf', or when `molfile` does not
    # exist.

    # Reject unsupported types up front, before any file handle is opened.
    # Otherwise no reader would be created and the failure would only show
    # up on the first hasNext() call.
    type_ok <- is.character(type) && length(type) == 1L &&
      type %in% c('smi', 'sdf')
    if (!type_ok) {
      stop("type must be either 'smi' or 'sdf'")
    }

    # The file is opened with java.io.FileReader, so it has to be a local
    # path; URLs get no exemption from the existence check.
    if (!file.exists(molfile)) {
      stop(paste0(molfile, ": Does not exist"))
    }

    fr <- .jnew("java/io/FileReader", as.character(molfile))
    dcob <- get.chem.object.builder()

    if (type == 'smi') {
      sreader <- .jnew("org/openscience/cdk/io/iterator/IteratingSMILESReader",
                       .jcast(fr, "java/io/Reader"), dcob)
    } else {
      sreader <- .jnew("org/openscience/cdk/io/iterator/IteratingSDFReader",
                       .jcast(fr, "java/io/Reader"), dcob)
      # `skip` is only forwarded to the SDF reader.
      .jcall(sreader, "V", "setSkip", skip)
    }

    hasNx <- function() {
      more <- .jcall(sreader, "Z", "hasNext")
      if (!more) {
        # Exhausted: release the underlying reader.
        .jcall(sreader, "V", "close")
      }
      more
    }

    nextEl <- function() {
      mol <- .jcall(sreader, "Ljava/lang/Object;", "next")
      mol <- .jcast(mol, "org/openscience/cdk/interfaces/IAtomContainer")
      if (aromaticity) do.aromaticity(mol)
      if (typing) do.typing(mol)
      if (isotopes) do.isotopes(mol)
      mol
    }

    obj <- list(nextElem = nextEl, hasNext = hasNx)
    class(obj) <- c("iload.molecules", "abstractiter", "iter")
    obj
  }