PageRenderTime 48ms CodeModel.GetById 14ms app.highlight 24ms RepoModel.GetById 2ms app.codeStats 0ms

/pkg/R/tableplot.R

https://code.google.com/
R | 258 lines | 127 code | 43 blank | 88 comment | 45 complexity | 4b1437d83ceceab592b9a8bd7feee4d4 MD5 | raw file
  1#' Create a tableplot
  2#'
  3#' A tableplot is a visualisation of (large) multivariate datasets. Each column represents a variable and each row bin is an aggregate of a certain number of records. For numeric variables, a bar chart of the mean values is depicted. For categorical variables, a stacked bar chart is depicted of the proportions of categories. Missing values are taken into account. Also supports large ffdf datasets from the ff package.
  4#'
  5#' @param dat a \code{\link{data.frame}}, \code{\link{data.table}}, or an \code{\link[ff:ffdf]{ffdf}} object (required)
  6#' @param colNames character vector containing the names of the columns of \code{dat} that are visualized in the tablelplot. If omitted, all columns are visualized. All selected columns should be of class: numeric, integer, factor, or logical.
  7#' @param sortCol columns that are sorted. \code{sortCol} is either a vector of column names of a vector of indices of \code{colNames}
  8#' @param decreasing determines whether the columns are sorted decreasingly (TRUE) of increasingly (FALSE). \code{decreasing} can be either a single value that applies to all sorted columns, or a vector of the same length as \code{sortCol}.
  9#' @param nBins number of row bins
 10#' @param from percentage from which the data is shown
 11#' @param to percentage to which the data is shown
 12#' @param filter filter condition to subset the observations in \code{dat}, either a character or an expression. It is also possible to give the name of a categorical variable: then, a tableplot for each category is generated.
 13#' @param scales determines the horizontal axes of the numeric variables in \code{colNames}, options: "lin", "log", and "auto" for automatic detection. If necessary, \code{scales} is recycled.
 14#' @param pals list of color palettes. Each list item is on of the following:
 15#' \itemize{
 16#' \item a palette name in \code{\link{tablePalettes}}, optionally with the starting color between brackets.
 17#' \item a palette vector
 18#' }
 19#' The items of \code{pals} are applied to the categorical variables of \code{colNames}. If necessary, \code{pals} is recycled.
 20#' @param colorNA color for missing values
 21#' @param numPals name(s) of the palette(s) that is(are) used for numeric variables ("Blues", "Greys", or "Greens"). Recycled if necessary.
 22#' @param bias_brokenX parameter between 0 en 1 that determines when the x-axis of a numeric variable is broken. If minimum value is at least \code{bias_brokenX} times the maximum value, then X axis is broken. To turn off broken x-axes, set \code{bias_brokenX=1}.
 23#' @param IQR_bias parameter that determines when a logarithmic scale is used when \code{scales} is set to "auto". The argument \code{IQR_bias} is multiplied by the interquartile range as a test.
 24#' @param plot boolean, to plot or not to plot a tableplot
 25#' @param ... arguments passed to \code{\link{plot.tabplot}}
 26#' @return \link{tabplot-object} (silent output)
 27#' @export
 28#' @keywords visualization
 29#' @example ../examples/tableplot.R
 30tableplot <- function(dat, colNames=names(dat), sortCol=1,  decreasing=TRUE, nBins=100, from=0, to=100, filter=NULL, scales="auto", pals=list("Set1", "Set2", "Set3", "Set4"), colorNA = "#FF1414", numPals = "Blues", bias_brokenX=0.8, IQR_bias=5, plot=TRUE, ...) {
 31
 32	datName <- deparse(substitute(dat))
 33	if (class(dat)[1]=="data.frame") dat <- data.table(dat)
 34	
 35	#####################################
 36	## Filter data
 37	#####################################
 38	if (!is.null(filter)) {
 39		if (!(class(filter)[1] %in% c("character", "expression"))) stop("<filter> is not an expression nor a character")
 40		
 41		# split by one variable
 42		if (filter %in% names(dat)) {
 43			filter <- as.character(filter)
 44			lvls <- levels(dat[[filter]])
 45			
 46			if ((class(dat[[filter]])[1]=="logical") || (class(dat)[1]=="ffdf" && vmode(dat[[filter]]) %in% c("boolean", "logical"))) {
 47				isLogical <- TRUE
 48				lvls <- c("TRUE", "FALSE")
 49			} else {
 50				isLogical <- FALSE
 51			}
 52			if (is.null(lvls)) stop("filter variable is not categorical")
 53			exprChar <- paste(filter, " == ", ifelse(isLogical, "", "\""), lvls, ifelse(isLogical, "", "\""), sep="")
 54			expr <- lapply(exprChar, FUN=function(x)parse(text=x))
 55			
 56			tabs <- lapply(expr, FUN=function(e){
 57				tab <- tableplot(dat, colNames=colNames, sortCol=sortCol, decreasing=decreasing, scales=scales, pals=pals, nBins=nBins, from=from, to=to, filter=e, bias_brokenX=bias_brokenX, IQR_bias=IQR_bias, plot=plot, ...)
 58				tab
 59			})
 60			return(tabs)
 61		}
 62		
 63		# other filters
 64		if (class(filter)[1]=="character") filter <- parse(text=filter)
 65		if (class(dat)[1]=="ffdf") {
 66			sel <- bit(nrow(dat))
 67			for (i in chunk(dat)) {
 68				sel[i] <- eval(filter, dat[i,])
 69			}
 70			dat <- subset(dat, sel)
 71		} else {
 72			sel <- eval(filter, dat)
 73			dat <- dat[sel,]
 74		}
 75		
 76	}
 77	
 78	#####################################
 79	## Check arguments and cast dat-columns to numeric or factor
 80	#####################################
 81	
 82	## Check dat
 83	if (nrow(dat)==0) stop("<dat> doesn't have any rows")
 84	if (nrow(dat)==1) stop("<dat> has only one row")
 85	
 86	## Check colNames
 87	if (class(colNames)[1]!="character") stop("<colNames> is not a character(vector)")
 88	if (!all(colNames %in% names(dat))) stop("<colNames> contains column names that are not found in <dat>")
 89
 90	## Only select the columns of colNames
 91	if (class(dat)[1]=="data.table") {
 92		dat <- dat[, colNames, with=FALSE] 
 93	} else {
 94		dat <- dat[colNames]
 95	}
 96	
 97	n <- length(colNames)
 98
 99	## Check sortCol, and (if necessary) cast it to indices
100	sortCol <- tableplot_checkCols(sortCol, colNames)
101
102	## Check decreasing vector
103	decreasing <- tableplot_checkDecreasing(decreasing, sortCol)
104
105	## Check scales
106	scales <- tableplot_checkScales(scales)
107
108	## Check palet indices
109	pals <- tableplot_checkPals(pals)
110	
111	## Check colorNA
112	if (class(try(col2rgb(colorNA), silent=TRUE))=="try-error") {
113		stop("<colorNA> is not correct")
114	}
115	
116	## Check numPals
117	if ((class(numPals)!="character") || !all(numPals %in% c("Blues", "Greens", "Greys"))) stop("<numPals> is not correct")
118	
119	## Check nBins
120	nBins <- tableplot_checkBins(nBins, nrow(dat))
121	
122	## Check from and to
123	tableplot_checkFromTo(from, to)
124	
125
126	## Check filter variables
127	# if (!is.null(filter)) filter <- tableplot_checkCols(filter, colNames)
128
129	######## TO DO: implement filter variable(s)
130
131	##########################
132	#### Preprocess
133	##########################
134
135	tab <- preprocess(dat, datName, as.character(filter), colNames, sortCol,  decreasing, scales, pals, colorNA, numPals, nBins, from,to)
136	
137	# delete cloned ffdf (those with filter)
138	if (!is.null(filter) && class(dat)[1]=="ffdf") delete(dat)
139
140	
141	isNumber <- tab$isNumber
142	
143	###########################
144	##### Function to determine logarithmic scale
145	###########################
146	getLog <- function(x) {
147		logx <- numeric(length(x))
148		neg <- x < 0		
149		logx[!neg] <- log10(x[!neg]+1)
150		logx[neg] <- -log10(abs(x[neg])+1)
151		return(logx)
152	}
153
154	#####################################
155	#####################################
156	## Grammar of Graphics: Scales
157	##
158	## Scale operations
159	#####################################
160	#####################################
161	
162	## Determine scales of numeric variables in case they are set to "auto". IQR is used.
163	for (i in which(isNumber)) {
164		if (tab$columns[[i]]$scale_init=="auto") {
165			quant <- quantile(tab$columns[[i]]$mean, na.rm=TRUE)
166			IQR <- quant[4] - quant[2]
167			
168			## Simple test to determine whether scale is lin or log
169			if ((quant[5] > quant[4] + IQR_bias * IQR) || 
170				(quant[1] < quant[2] - IQR_bias * IQR)) {
171				tab$columns[[i]]$scale_final <- "log" 
172			} else {
173				tab$columns[[i]]$scale_final <- "lin" 
174			}
175		} else {
176			tab$columns[[i]]$scale_final <- tab$columns[[i]]$scale_init
177			
178		}
179	}
180	
181	## Apply scale transformation
182	for (i in which(isNumber)) {
183		if (tab$columns[[i]]$scale_final=="log") {
184			tab$columns[[i]]$mean.scaled <- getLog(tab$columns[[i]]$mean)
185		} else {
186			tab$columns[[i]]$mean.scaled <- tab$columns[[i]]$mean
187		}
188	}
189	
190	#####################################
191	#####################################
192	## Grammar of Graphics: Coordinates
193	##
194	## Coordinate transformations
195	#####################################
196	#####################################
197
198	#############################
199	## Categorical variables
200	#############################
201
202	## determine widths and x positions of the categorical variables
203	for (i in which(!isNumber)) {
204		categories <- tab$columns[[i]]$categories
205		widths <- tab$columns[[i]]$freq / rep(tab$binSizes, length(categories))
206	
207		x <- cbind(0,(t(apply(widths, 1, cumsum)))[, -length(categories)])
208		tab$columns[[i]]$categories <- categories
209		tab$columns[[i]]$x <- x
210		tab$columns[[i]]$widths <- widths
211	}
212
213	
214	#############################
215	## Numeric variables
216	#############################
217
218	#### Broken X-axis
219	temp <- lapply(tab$columns[isNumber], FUN=function(x){brokenX(x$mean.scaled, bias_brokenX)})
220	j <- 1
221	for (i in which(isNumber)) {
222		tab$columns[[i]]$brokenX <- temp[[j]]$brokenX
223		tab$columns[[i]]$mean.brokenX <- temp[[j]]$values
224		j <- j + 1
225	}
226	## make this code prettier
227	
228	#### Normalization
229	for (i in which(isNumber)) {
230		brokenX <- tab$columns[[i]]$brokenX
231		values <- tab$columns[[i]]$mean.brokenX
232		## scale values to 0-1, and determine 0-1 value of the y-axis
233		minV <- min(values, na.rm=TRUE)
234		maxV <- max(values, na.rm=TRUE)
235		if (minV < 0 && maxV > 0) {
236			xline <- -minV / (maxV - minV)
237			widths <- (values) / (maxV - minV)
238		} else if (brokenX==1) {
239			xline <- 0
240			widths <- 0.3 + (values) * 0.7 / (maxV - minV)
241		} else if (brokenX==-1) {
242			xline <- 1
243			widths <- -0.3 + (values) * 0.7 / (maxV - minV)
244		} else {
245			xline <- ifelse(maxV > 0, 0, 1)
246			widths <- (values) / max(abs(minV), abs(maxV))
247		}
248		widths[is.nan(widths)] <- minV
249		## assign to tab object
250		tab$columns[[i]]$xline <- xline
251		tab$columns[[i]]$widths <- widths
252	}
253	
254	## plot
255	class(tab) <- "tabplot"
256	if (plot) plot(tab, ...)
257	invisible(tab)
258}