/pkg/R/tableplot.R

https://code.google.com/ · R · 258 lines · 127 code · 43 blank · 88 comment · 45 complexity · 4b1437d83ceceab592b9a8bd7feee4d4 MD5 · raw file

  1. #' Create a tableplot
  2. #'
  3. #' A tableplot is a visualisation of (large) multivariate datasets. Each column represents a variable and each row bin is an aggregate of a certain number of records. For numeric variables, a bar chart of the mean values is depicted. For categorical variables, a stacked bar chart is depicted of the proportions of categories. Missing values are taken into account. Also supports large ffdf datasets from the ff package.
  4. #'
  5. #' @param dat a \code{\link{data.frame}}, \code{\link{data.table}}, or an \code{\link[ff:ffdf]{ffdf}} object (required)
  6. #' @param colNames character vector containing the names of the columns of \code{dat} that are visualized in the tablelplot. If omitted, all columns are visualized. All selected columns should be of class: numeric, integer, factor, or logical.
  7. #' @param sortCol columns that are sorted. \code{sortCol} is either a vector of column names of a vector of indices of \code{colNames}
  8. #' @param decreasing determines whether the columns are sorted decreasingly (TRUE) of increasingly (FALSE). \code{decreasing} can be either a single value that applies to all sorted columns, or a vector of the same length as \code{sortCol}.
  9. #' @param nBins number of row bins
  10. #' @param from percentage from which the data is shown
  11. #' @param to percentage to which the data is shown
  12. #' @param filter filter condition to subset the observations in \code{dat}, either a character or an expression. It is also possible to give the name of a categorical variable: then, a tableplot for each category is generated.
  13. #' @param scales determines the horizontal axes of the numeric variables in \code{colNames}, options: "lin", "log", and "auto" for automatic detection. If necessary, \code{scales} is recycled.
  14. #' @param pals list of color palettes. Each list item is on of the following:
  15. #' \itemize{
  16. #' \item a palette name in \code{\link{tablePalettes}}, optionally with the starting color between brackets.
  17. #' \item a palette vector
  18. #' }
  19. #' The items of \code{pals} are applied to the categorical variables of \code{colNames}. If necessary, \code{pals} is recycled.
  20. #' @param colorNA color for missing values
  21. #' @param numPals name(s) of the palette(s) that is(are) used for numeric variables ("Blues", "Greys", or "Greens"). Recycled if necessary.
  22. #' @param bias_brokenX parameter between 0 en 1 that determines when the x-axis of a numeric variable is broken. If minimum value is at least \code{bias_brokenX} times the maximum value, then X axis is broken. To turn off broken x-axes, set \code{bias_brokenX=1}.
  23. #' @param IQR_bias parameter that determines when a logarithmic scale is used when \code{scales} is set to "auto". The argument \code{IQR_bias} is multiplied by the interquartile range as a test.
  24. #' @param plot boolean, to plot or not to plot a tableplot
  25. #' @param ... arguments passed to \code{\link{plot.tabplot}}
  26. #' @return \link{tabplot-object} (silent output)
  27. #' @export
  28. #' @keywords visualization
  29. #' @example ../examples/tableplot.R
  30. tableplot <- function(dat, colNames=names(dat), sortCol=1, decreasing=TRUE, nBins=100, from=0, to=100, filter=NULL, scales="auto", pals=list("Set1", "Set2", "Set3", "Set4"), colorNA = "#FF1414", numPals = "Blues", bias_brokenX=0.8, IQR_bias=5, plot=TRUE, ...) {
  31. datName <- deparse(substitute(dat))
  32. if (class(dat)[1]=="data.frame") dat <- data.table(dat)
  33. #####################################
  34. ## Filter data
  35. #####################################
  36. if (!is.null(filter)) {
  37. if (!(class(filter)[1] %in% c("character", "expression"))) stop("<filter> is not an expression nor a character")
  38. # split by one variable
  39. if (filter %in% names(dat)) {
  40. filter <- as.character(filter)
  41. lvls <- levels(dat[[filter]])
  42. if ((class(dat[[filter]])[1]=="logical") || (class(dat)[1]=="ffdf" && vmode(dat[[filter]]) %in% c("boolean", "logical"))) {
  43. isLogical <- TRUE
  44. lvls <- c("TRUE", "FALSE")
  45. } else {
  46. isLogical <- FALSE
  47. }
  48. if (is.null(lvls)) stop("filter variable is not categorical")
  49. exprChar <- paste(filter, " == ", ifelse(isLogical, "", "\""), lvls, ifelse(isLogical, "", "\""), sep="")
  50. expr <- lapply(exprChar, FUN=function(x)parse(text=x))
  51. tabs <- lapply(expr, FUN=function(e){
  52. tab <- tableplot(dat, colNames=colNames, sortCol=sortCol, decreasing=decreasing, scales=scales, pals=pals, nBins=nBins, from=from, to=to, filter=e, bias_brokenX=bias_brokenX, IQR_bias=IQR_bias, plot=plot, ...)
  53. tab
  54. })
  55. return(tabs)
  56. }
  57. # other filters
  58. if (class(filter)[1]=="character") filter <- parse(text=filter)
  59. if (class(dat)[1]=="ffdf") {
  60. sel <- bit(nrow(dat))
  61. for (i in chunk(dat)) {
  62. sel[i] <- eval(filter, dat[i,])
  63. }
  64. dat <- subset(dat, sel)
  65. } else {
  66. sel <- eval(filter, dat)
  67. dat <- dat[sel,]
  68. }
  69. }
  70. #####################################
  71. ## Check arguments and cast dat-columns to numeric or factor
  72. #####################################
  73. ## Check dat
  74. if (nrow(dat)==0) stop("<dat> doesn't have any rows")
  75. if (nrow(dat)==1) stop("<dat> has only one row")
  76. ## Check colNames
  77. if (class(colNames)[1]!="character") stop("<colNames> is not a character(vector)")
  78. if (!all(colNames %in% names(dat))) stop("<colNames> contains column names that are not found in <dat>")
  79. ## Only select the columns of colNames
  80. if (class(dat)[1]=="data.table") {
  81. dat <- dat[, colNames, with=FALSE]
  82. } else {
  83. dat <- dat[colNames]
  84. }
  85. n <- length(colNames)
  86. ## Check sortCol, and (if necessary) cast it to indices
  87. sortCol <- tableplot_checkCols(sortCol, colNames)
  88. ## Check decreasing vector
  89. decreasing <- tableplot_checkDecreasing(decreasing, sortCol)
  90. ## Check scales
  91. scales <- tableplot_checkScales(scales)
  92. ## Check palet indices
  93. pals <- tableplot_checkPals(pals)
  94. ## Check colorNA
  95. if (class(try(col2rgb(colorNA), silent=TRUE))=="try-error") {
  96. stop("<colorNA> is not correct")
  97. }
  98. ## Check numPals
  99. if ((class(numPals)!="character") || !all(numPals %in% c("Blues", "Greens", "Greys"))) stop("<numPals> is not correct")
  100. ## Check nBins
  101. nBins <- tableplot_checkBins(nBins, nrow(dat))
  102. ## Check from and to
  103. tableplot_checkFromTo(from, to)
  104. ## Check filter variables
  105. # if (!is.null(filter)) filter <- tableplot_checkCols(filter, colNames)
  106. ######## TO DO: implement filter variable(s)
  107. ##########################
  108. #### Preprocess
  109. ##########################
  110. tab <- preprocess(dat, datName, as.character(filter), colNames, sortCol, decreasing, scales, pals, colorNA, numPals, nBins, from,to)
  111. # delete cloned ffdf (those with filter)
  112. if (!is.null(filter) && class(dat)[1]=="ffdf") delete(dat)
  113. isNumber <- tab$isNumber
  114. ###########################
  115. ##### Function to determine logarithmic scale
  116. ###########################
  117. getLog <- function(x) {
  118. logx <- numeric(length(x))
  119. neg <- x < 0
  120. logx[!neg] <- log10(x[!neg]+1)
  121. logx[neg] <- -log10(abs(x[neg])+1)
  122. return(logx)
  123. }
  124. #####################################
  125. #####################################
  126. ## Grammar of Graphics: Scales
  127. ##
  128. ## Scale operations
  129. #####################################
  130. #####################################
  131. ## Determine scales of numeric variables in case they are set to "auto". IQR is used.
  132. for (i in which(isNumber)) {
  133. if (tab$columns[[i]]$scale_init=="auto") {
  134. quant <- quantile(tab$columns[[i]]$mean, na.rm=TRUE)
  135. IQR <- quant[4] - quant[2]
  136. ## Simple test to determine whether scale is lin or log
  137. if ((quant[5] > quant[4] + IQR_bias * IQR) ||
  138. (quant[1] < quant[2] - IQR_bias * IQR)) {
  139. tab$columns[[i]]$scale_final <- "log"
  140. } else {
  141. tab$columns[[i]]$scale_final <- "lin"
  142. }
  143. } else {
  144. tab$columns[[i]]$scale_final <- tab$columns[[i]]$scale_init
  145. }
  146. }
  147. ## Apply scale transformation
  148. for (i in which(isNumber)) {
  149. if (tab$columns[[i]]$scale_final=="log") {
  150. tab$columns[[i]]$mean.scaled <- getLog(tab$columns[[i]]$mean)
  151. } else {
  152. tab$columns[[i]]$mean.scaled <- tab$columns[[i]]$mean
  153. }
  154. }
  155. #####################################
  156. #####################################
  157. ## Grammar of Graphics: Coordinates
  158. ##
  159. ## Coordinate transformations
  160. #####################################
  161. #####################################
  162. #############################
  163. ## Categorical variables
  164. #############################
  165. ## determine widths and x positions of the categorical variables
  166. for (i in which(!isNumber)) {
  167. categories <- tab$columns[[i]]$categories
  168. widths <- tab$columns[[i]]$freq / rep(tab$binSizes, length(categories))
  169. x <- cbind(0,(t(apply(widths, 1, cumsum)))[, -length(categories)])
  170. tab$columns[[i]]$categories <- categories
  171. tab$columns[[i]]$x <- x
  172. tab$columns[[i]]$widths <- widths
  173. }
  174. #############################
  175. ## Numeric variables
  176. #############################
  177. #### Broken X-axis
  178. temp <- lapply(tab$columns[isNumber], FUN=function(x){brokenX(x$mean.scaled, bias_brokenX)})
  179. j <- 1
  180. for (i in which(isNumber)) {
  181. tab$columns[[i]]$brokenX <- temp[[j]]$brokenX
  182. tab$columns[[i]]$mean.brokenX <- temp[[j]]$values
  183. j <- j + 1
  184. }
  185. ## make this code prettier
  186. #### Normalization
  187. for (i in which(isNumber)) {
  188. brokenX <- tab$columns[[i]]$brokenX
  189. values <- tab$columns[[i]]$mean.brokenX
  190. ## scale values to 0-1, and determine 0-1 value of the y-axis
  191. minV <- min(values, na.rm=TRUE)
  192. maxV <- max(values, na.rm=TRUE)
  193. if (minV < 0 && maxV > 0) {
  194. xline <- -minV / (maxV - minV)
  195. widths <- (values) / (maxV - minV)
  196. } else if (brokenX==1) {
  197. xline <- 0
  198. widths <- 0.3 + (values) * 0.7 / (maxV - minV)
  199. } else if (brokenX==-1) {
  200. xline <- 1
  201. widths <- -0.3 + (values) * 0.7 / (maxV - minV)
  202. } else {
  203. xline <- ifelse(maxV > 0, 0, 1)
  204. widths <- (values) / max(abs(minV), abs(maxV))
  205. }
  206. widths[is.nan(widths)] <- minV
  207. ## assign to tab object
  208. tab$columns[[i]]$xline <- xline
  209. tab$columns[[i]]$widths <- widths
  210. }
  211. ## plot
  212. class(tab) <- "tabplot"
  213. if (plot) plot(tab, ...)
  214. invisible(tab)
  215. }