/pkg/R/tableplot.R
https://code.google.com/ · R · 258 lines · 127 code · 43 blank · 88 comment · 45 complexity · 4b1437d83ceceab592b9a8bd7feee4d4 MD5 · raw file
- #' Create a tableplot
- #'
- #' A tableplot is a visualisation of (large) multivariate datasets. Each column represents a variable and each row bin is an aggregate of a certain number of records. For numeric variables, a bar chart of the mean values is depicted. For categorical variables, a stacked bar chart is depicted of the proportions of categories. Missing values are taken into account. Also supports large ffdf datasets from the ff package.
- #'
- #' @param dat a \code{\link{data.frame}}, \code{\link{data.table}}, or an \code{\link[ff:ffdf]{ffdf}} object (required)
- #' @param colNames character vector containing the names of the columns of \code{dat} that are visualized in the tablelplot. If omitted, all columns are visualized. All selected columns should be of class: numeric, integer, factor, or logical.
- #' @param sortCol columns that are sorted. \code{sortCol} is either a vector of column names of a vector of indices of \code{colNames}
- #' @param decreasing determines whether the columns are sorted decreasingly (TRUE) of increasingly (FALSE). \code{decreasing} can be either a single value that applies to all sorted columns, or a vector of the same length as \code{sortCol}.
- #' @param nBins number of row bins
- #' @param from percentage from which the data is shown
- #' @param to percentage to which the data is shown
- #' @param filter filter condition to subset the observations in \code{dat}, either a character or an expression. It is also possible to give the name of a categorical variable: then, a tableplot for each category is generated.
- #' @param scales determines the horizontal axes of the numeric variables in \code{colNames}, options: "lin", "log", and "auto" for automatic detection. If necessary, \code{scales} is recycled.
- #' @param pals list of color palettes. Each list item is on of the following:
- #' \itemize{
- #' \item a palette name in \code{\link{tablePalettes}}, optionally with the starting color between brackets.
- #' \item a palette vector
- #' }
- #' The items of \code{pals} are applied to the categorical variables of \code{colNames}. If necessary, \code{pals} is recycled.
- #' @param colorNA color for missing values
- #' @param numPals name(s) of the palette(s) that is(are) used for numeric variables ("Blues", "Greys", or "Greens"). Recycled if necessary.
- #' @param bias_brokenX parameter between 0 en 1 that determines when the x-axis of a numeric variable is broken. If minimum value is at least \code{bias_brokenX} times the maximum value, then X axis is broken. To turn off broken x-axes, set \code{bias_brokenX=1}.
- #' @param IQR_bias parameter that determines when a logarithmic scale is used when \code{scales} is set to "auto". The argument \code{IQR_bias} is multiplied by the interquartile range as a test.
- #' @param plot boolean, to plot or not to plot a tableplot
- #' @param ... arguments passed to \code{\link{plot.tabplot}}
- #' @return \link{tabplot-object} (silent output)
- #' @export
- #' @keywords visualization
- #' @example ../examples/tableplot.R
- tableplot <- function(dat, colNames=names(dat), sortCol=1, decreasing=TRUE, nBins=100, from=0, to=100, filter=NULL, scales="auto", pals=list("Set1", "Set2", "Set3", "Set4"), colorNA = "#FF1414", numPals = "Blues", bias_brokenX=0.8, IQR_bias=5, plot=TRUE, ...) {
-
- datName <- deparse(substitute(dat))
- if (class(dat)[1]=="data.frame") dat <- data.table(dat)
-
- #####################################
- ## Filter data
- #####################################
- if (!is.null(filter)) {
- if (!(class(filter)[1] %in% c("character", "expression"))) stop("<filter> is not an expression nor a character")
-
- # split by one variable
- if (filter %in% names(dat)) {
- filter <- as.character(filter)
- lvls <- levels(dat[[filter]])
-
- if ((class(dat[[filter]])[1]=="logical") || (class(dat)[1]=="ffdf" && vmode(dat[[filter]]) %in% c("boolean", "logical"))) {
- isLogical <- TRUE
- lvls <- c("TRUE", "FALSE")
- } else {
- isLogical <- FALSE
- }
- if (is.null(lvls)) stop("filter variable is not categorical")
- exprChar <- paste(filter, " == ", ifelse(isLogical, "", "\""), lvls, ifelse(isLogical, "", "\""), sep="")
- expr <- lapply(exprChar, FUN=function(x)parse(text=x))
-
- tabs <- lapply(expr, FUN=function(e){
- tab <- tableplot(dat, colNames=colNames, sortCol=sortCol, decreasing=decreasing, scales=scales, pals=pals, nBins=nBins, from=from, to=to, filter=e, bias_brokenX=bias_brokenX, IQR_bias=IQR_bias, plot=plot, ...)
- tab
- })
- return(tabs)
- }
-
- # other filters
- if (class(filter)[1]=="character") filter <- parse(text=filter)
- if (class(dat)[1]=="ffdf") {
- sel <- bit(nrow(dat))
- for (i in chunk(dat)) {
- sel[i] <- eval(filter, dat[i,])
- }
- dat <- subset(dat, sel)
- } else {
- sel <- eval(filter, dat)
- dat <- dat[sel,]
- }
-
- }
-
- #####################################
- ## Check arguments and cast dat-columns to numeric or factor
- #####################################
-
- ## Check dat
- if (nrow(dat)==0) stop("<dat> doesn't have any rows")
- if (nrow(dat)==1) stop("<dat> has only one row")
-
- ## Check colNames
- if (class(colNames)[1]!="character") stop("<colNames> is not a character(vector)")
- if (!all(colNames %in% names(dat))) stop("<colNames> contains column names that are not found in <dat>")
-
- ## Only select the columns of colNames
- if (class(dat)[1]=="data.table") {
- dat <- dat[, colNames, with=FALSE]
- } else {
- dat <- dat[colNames]
- }
-
- n <- length(colNames)
-
- ## Check sortCol, and (if necessary) cast it to indices
- sortCol <- tableplot_checkCols(sortCol, colNames)
-
- ## Check decreasing vector
- decreasing <- tableplot_checkDecreasing(decreasing, sortCol)
-
- ## Check scales
- scales <- tableplot_checkScales(scales)
-
- ## Check palet indices
- pals <- tableplot_checkPals(pals)
-
- ## Check colorNA
- if (class(try(col2rgb(colorNA), silent=TRUE))=="try-error") {
- stop("<colorNA> is not correct")
- }
-
- ## Check numPals
- if ((class(numPals)!="character") || !all(numPals %in% c("Blues", "Greens", "Greys"))) stop("<numPals> is not correct")
-
- ## Check nBins
- nBins <- tableplot_checkBins(nBins, nrow(dat))
-
- ## Check from and to
- tableplot_checkFromTo(from, to)
-
-
- ## Check filter variables
- # if (!is.null(filter)) filter <- tableplot_checkCols(filter, colNames)
-
- ######## TO DO: implement filter variable(s)
-
- ##########################
- #### Preprocess
- ##########################
-
- tab <- preprocess(dat, datName, as.character(filter), colNames, sortCol, decreasing, scales, pals, colorNA, numPals, nBins, from,to)
-
- # delete cloned ffdf (those with filter)
- if (!is.null(filter) && class(dat)[1]=="ffdf") delete(dat)
-
-
- isNumber <- tab$isNumber
-
- ###########################
- ##### Function to determine logarithmic scale
- ###########################
- getLog <- function(x) {
- logx <- numeric(length(x))
- neg <- x < 0
- logx[!neg] <- log10(x[!neg]+1)
- logx[neg] <- -log10(abs(x[neg])+1)
- return(logx)
- }
-
- #####################################
- #####################################
- ## Grammar of Graphics: Scales
- ##
- ## Scale operations
- #####################################
- #####################################
-
- ## Determine scales of numeric variables in case they are set to "auto". IQR is used.
- for (i in which(isNumber)) {
- if (tab$columns[[i]]$scale_init=="auto") {
- quant <- quantile(tab$columns[[i]]$mean, na.rm=TRUE)
- IQR <- quant[4] - quant[2]
-
- ## Simple test to determine whether scale is lin or log
- if ((quant[5] > quant[4] + IQR_bias * IQR) ||
- (quant[1] < quant[2] - IQR_bias * IQR)) {
- tab$columns[[i]]$scale_final <- "log"
- } else {
- tab$columns[[i]]$scale_final <- "lin"
- }
- } else {
- tab$columns[[i]]$scale_final <- tab$columns[[i]]$scale_init
-
- }
- }
-
- ## Apply scale transformation
- for (i in which(isNumber)) {
- if (tab$columns[[i]]$scale_final=="log") {
- tab$columns[[i]]$mean.scaled <- getLog(tab$columns[[i]]$mean)
- } else {
- tab$columns[[i]]$mean.scaled <- tab$columns[[i]]$mean
- }
- }
-
- #####################################
- #####################################
- ## Grammar of Graphics: Coordinates
- ##
- ## Coordinate transformations
- #####################################
- #####################################
-
- #############################
- ## Categorical variables
- #############################
-
- ## determine widths and x positions of the categorical variables
- for (i in which(!isNumber)) {
- categories <- tab$columns[[i]]$categories
- widths <- tab$columns[[i]]$freq / rep(tab$binSizes, length(categories))
-
- x <- cbind(0,(t(apply(widths, 1, cumsum)))[, -length(categories)])
- tab$columns[[i]]$categories <- categories
- tab$columns[[i]]$x <- x
- tab$columns[[i]]$widths <- widths
- }
-
-
- #############################
- ## Numeric variables
- #############################
-
- #### Broken X-axis
- temp <- lapply(tab$columns[isNumber], FUN=function(x){brokenX(x$mean.scaled, bias_brokenX)})
- j <- 1
- for (i in which(isNumber)) {
- tab$columns[[i]]$brokenX <- temp[[j]]$brokenX
- tab$columns[[i]]$mean.brokenX <- temp[[j]]$values
- j <- j + 1
- }
- ## make this code prettier
-
- #### Normalization
- for (i in which(isNumber)) {
- brokenX <- tab$columns[[i]]$brokenX
- values <- tab$columns[[i]]$mean.brokenX
- ## scale values to 0-1, and determine 0-1 value of the y-axis
- minV <- min(values, na.rm=TRUE)
- maxV <- max(values, na.rm=TRUE)
- if (minV < 0 && maxV > 0) {
- xline <- -minV / (maxV - minV)
- widths <- (values) / (maxV - minV)
- } else if (brokenX==1) {
- xline <- 0
- widths <- 0.3 + (values) * 0.7 / (maxV - minV)
- } else if (brokenX==-1) {
- xline <- 1
- widths <- -0.3 + (values) * 0.7 / (maxV - minV)
- } else {
- xline <- ifelse(maxV > 0, 0, 1)
- widths <- (values) / max(abs(minV), abs(maxV))
- }
- widths[is.nan(widths)] <- minV
- ## assign to tab object
- tab$columns[[i]]$xline <- xline
- tab$columns[[i]]$widths <- widths
- }
-
- ## plot
- class(tab) <- "tabplot"
- if (plot) plot(tab, ...)
- invisible(tab)
- }