tableplot.R | searchcode

/pkg/R/tableplot.R

https://code.google.com/ · R · 258 lines · 127 code · 43 blank · 88 comment · 45 complexity · 4b1437d83ceceab592b9a8bd7feee4d4 MD5 · raw file

#' Create a tableplot

#'

#' A tableplot is a visualisation of (large) multivariate datasets. Each column represents a variable and each row bin is an aggregate of a certain number of records. For numeric variables, a bar chart of the mean values is depicted. For categorical variables, a stacked bar chart is depicted of the proportions of categories. Missing values are taken into account. Also supports large ffdf datasets from the ff package.

#'

#' @param dat a \code{\link{data.frame}}, \code{\link{data.table}}, or an \code{\link[ff:ffdf]{ffdf}} object (required)

#' @param colNames character vector containing the names of the columns of \code{dat} that are visualized in the tablelplot. If omitted, all columns are visualized. All selected columns should be of class: numeric, integer, factor, or logical.

#' @param sortCol columns that are sorted. \code{sortCol} is either a vector of column names of a vector of indices of \code{colNames}

#' @param decreasing determines whether the columns are sorted decreasingly (TRUE) of increasingly (FALSE). \code{decreasing} can be either a single value that applies to all sorted columns, or a vector of the same length as \code{sortCol}.

#' @param nBins number of row bins

#' @param from percentage from which the data is shown

#' @param to percentage to which the data is shown

#' @param filter filter condition to subset the observations in \code{dat}, either a character or an expression. It is also possible to give the name of a categorical variable: then, a tableplot for each category is generated.

#' @param scales determines the horizontal axes of the numeric variables in \code{colNames}, options: "lin", "log", and "auto" for automatic detection. If necessary, \code{scales} is recycled.

#' @param pals list of color palettes. Each list item is on of the following:

#' \itemize{

#' \item a palette name in \code{\link{tablePalettes}}, optionally with the starting color between brackets.

#' \item a palette vector

#' }

#' The items of \code{pals} are applied to the categorical variables of \code{colNames}. If necessary, \code{pals} is recycled.

#' @param colorNA color for missing values

#' @param numPals name(s) of the palette(s) that is(are) used for numeric variables ("Blues", "Greys", or "Greens"). Recycled if necessary.

#' @param bias_brokenX parameter between 0 en 1 that determines when the x-axis of a numeric variable is broken. If minimum value is at least \code{bias_brokenX} times the maximum value, then X axis is broken. To turn off broken x-axes, set \code{bias_brokenX=1}.

#' @param IQR_bias parameter that determines when a logarithmic scale is used when \code{scales} is set to "auto". The argument \code{IQR_bias} is multiplied by the interquartile range as a test.

#' @param plot boolean, to plot or not to plot a tableplot

#' @param ... arguments passed to \code{\link{plot.tabplot}}

#' @return \link{tabplot-object} (silent output)

#' @export

#' @keywords visualization

#' @example ../examples/tableplot.R

tableplot <- function(dat, colNames=names(dat), sortCol=1,  decreasing=TRUE, nBins=100, from=0, to=100, filter=NULL, scales="auto", pals=list("Set1", "Set2", "Set3", "Set4"), colorNA = "#FF1414", numPals = "Blues", bias_brokenX=0.8, IQR_bias=5, plot=TRUE, ...) {



	datName <- deparse(substitute(dat))

	if (class(dat)[1]=="data.frame") dat <- data.table(dat)

	

	#####################################

	## Filter data

	#####################################

	if (!is.null(filter)) {

		if (!(class(filter)[1] %in% c("character", "expression"))) stop("<filter> is not an expression nor a character")

		

		# split by one variable

		if (filter %in% names(dat)) {

			filter <- as.character(filter)

			lvls <- levels(dat[[filter]])

			

			if ((class(dat[[filter]])[1]=="logical") || (class(dat)[1]=="ffdf" && vmode(dat[[filter]]) %in% c("boolean", "logical"))) {

				isLogical <- TRUE

				lvls <- c("TRUE", "FALSE")

			} else {

				isLogical <- FALSE

			}

			if (is.null(lvls)) stop("filter variable is not categorical")

			exprChar <- paste(filter, " == ", ifelse(isLogical, "", "\""), lvls, ifelse(isLogical, "", "\""), sep="")

			expr <- lapply(exprChar, FUN=function(x)parse(text=x))

			

			tabs <- lapply(expr, FUN=function(e){

				tab <- tableplot(dat, colNames=colNames, sortCol=sortCol, decreasing=decreasing, scales=scales, pals=pals, nBins=nBins, from=from, to=to, filter=e, bias_brokenX=bias_brokenX, IQR_bias=IQR_bias, plot=plot, ...)

				tab

			})

			return(tabs)

		}

		

		# other filters

		if (class(filter)[1]=="character") filter <- parse(text=filter)

		if (class(dat)[1]=="ffdf") {

			sel <- bit(nrow(dat))

			for (i in chunk(dat)) {

				sel[i] <- eval(filter, dat[i,])

			}

			dat <- subset(dat, sel)

		} else {

			sel <- eval(filter, dat)

			dat <- dat[sel,]

		}

		

	}

	

	#####################################

	## Check arguments and cast dat-columns to numeric or factor

	#####################################

	

	## Check dat

	if (nrow(dat)==0) stop("<dat> doesn't have any rows")

	if (nrow(dat)==1) stop("<dat> has only one row")

	

	## Check colNames

	if (class(colNames)[1]!="character") stop("<colNames> is not a character(vector)")

	if (!all(colNames %in% names(dat))) stop("<colNames> contains column names that are not found in <dat>")



	## Only select the columns of colNames

	if (class(dat)[1]=="data.table") {

		dat <- dat[, colNames, with=FALSE] 

	} else {

		dat <- dat[colNames]

	}

	

	n <- length(colNames)



	## Check sortCol, and (if necessary) cast it to indices

	sortCol <- tableplot_checkCols(sortCol, colNames)



	## Check decreasing vector

	decreasing <- tableplot_checkDecreasing(decreasing, sortCol)



	## Check scales

	scales <- tableplot_checkScales(scales)



	## Check palet indices

	pals <- tableplot_checkPals(pals)

	

	## Check colorNA

	if (class(try(col2rgb(colorNA), silent=TRUE))=="try-error") {

		stop("<colorNA> is not correct")

	}

	

	## Check numPals

	if ((class(numPals)!="character") || !all(numPals %in% c("Blues", "Greens", "Greys"))) stop("<numPals> is not correct")

	

	## Check nBins

	nBins <- tableplot_checkBins(nBins, nrow(dat))

	

	## Check from and to

	tableplot_checkFromTo(from, to)

	



	## Check filter variables

	# if (!is.null(filter)) filter <- tableplot_checkCols(filter, colNames)



	######## TO DO: implement filter variable(s)



	##########################

	#### Preprocess

	##########################



	tab <- preprocess(dat, datName, as.character(filter), colNames, sortCol,  decreasing, scales, pals, colorNA, numPals, nBins, from,to)

	

	# delete cloned ffdf (those with filter)

	if (!is.null(filter) && class(dat)[1]=="ffdf") delete(dat)



	

	isNumber <- tab$isNumber

	

	###########################

	##### Function to determine logarithmic scale

	###########################

	getLog <- function(x) {

		logx <- numeric(length(x))

		neg <- x < 0		

		logx[!neg] <- log10(x[!neg]+1)

		logx[neg] <- -log10(abs(x[neg])+1)

		return(logx)

	}



	#####################################

	#####################################

	## Grammar of Graphics: Scales

	##

	## Scale operations

	#####################################

	#####################################

	

	## Determine scales of numeric variables in case they are set to "auto". IQR is used.

	for (i in which(isNumber)) {

		if (tab$columns[[i]]$scale_init=="auto") {

			quant <- quantile(tab$columns[[i]]$mean, na.rm=TRUE)

			IQR <- quant[4] - quant[2]

			

			## Simple test to determine whether scale is lin or log

			if ((quant[5] > quant[4] + IQR_bias * IQR) || 

				(quant[1] < quant[2] - IQR_bias * IQR)) {

				tab$columns[[i]]$scale_final <- "log" 

			} else {

				tab$columns[[i]]$scale_final <- "lin" 

			}

		} else {

			tab$columns[[i]]$scale_final <- tab$columns[[i]]$scale_init

			

		}

	}

	

	## Apply scale transformation

	for (i in which(isNumber)) {

		if (tab$columns[[i]]$scale_final=="log") {

			tab$columns[[i]]$mean.scaled <- getLog(tab$columns[[i]]$mean)

		} else {

			tab$columns[[i]]$mean.scaled <- tab$columns[[i]]$mean

		}

	}

	

	#####################################

	#####################################

	## Grammar of Graphics: Coordinates

	##

	## Coordinate transformations

	#####################################

	#####################################



	#############################

	## Categorical variables

	#############################



	## determine widths and x positions of the categorical variables

	for (i in which(!isNumber)) {

		categories <- tab$columns[[i]]$categories

		widths <- tab$columns[[i]]$freq / rep(tab$binSizes, length(categories))

	

		x <- cbind(0,(t(apply(widths, 1, cumsum)))[, -length(categories)])

		tab$columns[[i]]$categories <- categories

		tab$columns[[i]]$x <- x

		tab$columns[[i]]$widths <- widths

	}



	

	#############################

	## Numeric variables

	#############################



	#### Broken X-axis

	temp <- lapply(tab$columns[isNumber], FUN=function(x){brokenX(x$mean.scaled, bias_brokenX)})

	j <- 1

	for (i in which(isNumber)) {

		tab$columns[[i]]$brokenX <- temp[[j]]$brokenX

		tab$columns[[i]]$mean.brokenX <- temp[[j]]$values

		j <- j + 1

	}

	## make this code prettier

	

	#### Normalization

	for (i in which(isNumber)) {

		brokenX <- tab$columns[[i]]$brokenX

		values <- tab$columns[[i]]$mean.brokenX

		## scale values to 0-1, and determine 0-1 value of the y-axis

		minV <- min(values, na.rm=TRUE)

		maxV <- max(values, na.rm=TRUE)

		if (minV < 0 && maxV > 0) {

			xline <- -minV / (maxV - minV)

			widths <- (values) / (maxV - minV)

		} else if (brokenX==1) {

			xline <- 0

			widths <- 0.3 + (values) * 0.7 / (maxV - minV)

		} else if (brokenX==-1) {

			xline <- 1

			widths <- -0.3 + (values) * 0.7 / (maxV - minV)

		} else {

			xline <- ifelse(maxV > 0, 0, 1)

			widths <- (values) / max(abs(minV), abs(maxV))

		}

		widths[is.nan(widths)] <- minV

		## assign to tab object

		tab$columns[[i]]$xline <- xline

		tab$columns[[i]]$widths <- widths

	}

	

	## plot

	class(tab) <- "tabplot"

	if (plot) plot(tab, ...)

	invisible(tab)

}