/scripts/lib/dataPreparation.R
https://bitbucket.org/jontis/merck
source_data_remove_ineffective <- function(config, training_set, chunk_rows=5000){
  # Drop cause columns that are active even at the baseline (minimum) effect
  # level, since they carry no signal, and optionally drop the baseline rows.
  base_level <- min(training_set$effect)
  baseline_rows <- as.vector(training_set$effect == base_level)

  # too few baseline rows to judge the columns: leave the set untouched
  if(sum(baseline_rows) < 100){
    training_set$base_rows_was_removed <- FALSE
    return(training_set)
  }

  message('removing ineffective cause columns and useless rows in set: ', training_set$number)

  # operate in chunks to avoid swapping: accumulate a running column-wise max
  # over the baseline rows
  max_in_baseline_cause_cols <- NULL
  i <- 1
  while((i-1)*chunk_rows + 1 <= length(baseline_rows)){
    first_row <- (i-1)*chunk_rows + 1
    last_row <- min(i*chunk_rows, length(baseline_rows))
    # drop=FALSE keeps a single selected row as a matrix (a sparse matrix
    # otherwise collapses it to a vector, which as.matrix turns into a column)
    temp_matrix <- as.matrix(training_set$cause[first_row:last_row, , drop=FALSE][baseline_rows[first_row:last_row], , drop=FALSE])
    # skip chunks without baseline rows (max() over an empty matrix warns and yields -Inf)
    if(nrow(temp_matrix) > 0){
      max_in_baseline_cause_cols <- apply(rbind(temp_matrix, max_in_baseline_cause_cols), 2, max)
    }
    i <- i + 1
  }

  if(sum(baseline_rows) == length(baseline_rows)) stop('NO rows left, rethink this') # every row is at baseline
  if(config$cause$remove_base_level_rows){
    training_set$cause <- training_set$cause[!baseline_rows,] # remove rows without effect above baseline
    training_set$effect <- training_set$effect[!baseline_rows]
  }

  # keep only columns that stay below the threshold within the baseline rows;
  # columns that are active at baseline are recorded as ineffective
  if(sum(max_in_baseline_cause_cols < config$cause$remove_col_threshold) == 0) stop('NO cols left, rethink this')
  training_set$cause <- training_set$cause[, max_in_baseline_cause_cols < config$cause$remove_col_threshold]
  training_set$ineffective_columns <- training_set$col_names[max_in_baseline_cause_cols >= config$cause$remove_col_threshold]
  message('set: ', training_set$number, ', kept ', nrow(training_set$cause), ' rows of ', training_set$nrows, ' and ', ncol(training_set$cause), ' cols of ', training_set$ncols)
  message('set reduced to ', nrow(training_set$cause)*ncol(training_set$cause), ' items (', as.integer(100*nrow(training_set$cause)*ncol(training_set$cause)/(training_set$nrows*training_set$ncols)), '% of original size).')
  training_set$nrows <- nrow(training_set$cause)
  training_set$ncols <- ncol(training_set$cause)
  training_set$base_level <- base_level
  training_set$base_rows_was_removed <- TRUE
  training_set
}
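
# Example usage (sketch): the calling pipeline is not part of this file, so the
# `config` shape and the Matrix-based training_set below are assumptions
# inferred from the field accesses above.
# library(Matrix)
# config <- list(cause=list(remove_base_level_rows=TRUE, remove_col_threshold=0.9))
# training_set <- list(number=1,
#                      cause=rsparsematrix(10000, 200, density=0.001, rand.x=runif),
#                      effect=c(rep(0, 6000), runif(4000, 1, 10)),
#                      nrows=10000, ncols=200,
#                      col_names=paste0('D_', 1:200))
# training_set <- source_data_remove_ineffective(config, training_set)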
source_data_downsample <- function(config, training_set){
  # cap the row count so that rows * cols stays within the random-forest item budget
  keep_nrows <- min(config$rf$maxitems %/% training_set$ncols, length(training_set$effect))
  if(config$sampling$half_max){
    # keep the rows with the largest effects for one half of the budget,
    # and a random sample of the remaining rows for the other half
    effect_in_order <- sort(training_set$effect, decreasing=TRUE, index.return=TRUE)
    effect_largest_indices <- effect_in_order$ix[1:floor(keep_nrows/2)]
    effect_sample_indices <- sample(effect_in_order$ix[(floor(keep_nrows/2)+1):length(training_set$effect)], size=floor(keep_nrows/2), replace=FALSE)
    effect_new_indices <- c(effect_largest_indices, effect_sample_indices)
  }else{
    effect_new_indices <- sample(1:length(training_set$effect), size=keep_nrows, replace=FALSE)
  }
  training_set$effect <- training_set$effect[effect_new_indices]
  training_set$cause <- training_set$cause[effect_new_indices,]
  items_old <- training_set$ncols * training_set$nrows
  items_new <- training_set$ncols * nrow(training_set$cause)
  percent <- round(100 * items_new / items_old)
  message('downsampled set ', training_set$number, ' from ', items_old, ' items to ', items_new, ' items (', percent, '%)')
  training_set$nrows <- nrow(training_set$cause)
  training_set
}
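
# Example usage (sketch; the config fields are assumptions inferred from the
# accesses above: an item budget for the random forest and a sampling switch):
# config <- list(rf=list(maxitems=2e6), sampling=list(half_max=TRUE))
# training_set <- source_data_downsample(config, training_set)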
data_shrink_for_dev <- function(training_set){
  # shrink the training set to its first 100 rows for development speed
  training_set$cause <- training_set$cause[1:100,]
  training_set$effect <- training_set$effect[1:100]
  training_set$nrows <- 100
  training_set
}
# freed memory, garbage collect!
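
# Putting it together (sketch): a plausible preparation flow using the helpers
# above; `load_training_set` and `dev_mode` are hypothetical stand-ins for the
# rest of the pipeline.
# training_set <- load_training_set(config, set_number)  # hypothetical loader
# training_set <- source_data_remove_ineffective(config, training_set)
# training_set <- source_data_downsample(config, training_set)
# if(dev_mode) training_set <- data_shrink_for_dev(training_set)
# gc()  # reclaim memory after dropping the large original set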