/scripts/lib/dataPreparation.R
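# The functions below assume `training_set` is a list; this sketch of its
# layout is inferred from the field accesses in this file (not a documented
# contract), with `cause` typically a Matrix-package sparse matrix:
#
#   training_set <- list(
#     number    = 1L,                 # set identifier used in messages
#     cause     = <sparse matrix>,    # predictor matrix, nrows x ncols
#     effect    = <numeric vector>,   # response, one value per cause row
#     nrows     = nrow(cause),
#     ncols     = ncol(cause),
#     col_names = colnames(cause)     # used to record removed columns
#   )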
source_data_remove_ineffective <- function(config, training_set, chunk_rows=5000){
    base_level <- min(training_set$effect)
    baseline_rows <- as.vector(training_set$effect==base_level)

    if(sum(baseline_rows)<100){
        training_set$base_rows_was_removed <- FALSE
        return(training_set)
    }

    message('removing ineffective cause columns and useless rows in set: ', training_set$number)

    #operate in chunks to keep peak memory low and avoid swapping
    max_in_baseline_cause_cols <- NULL
    i <- 1
    while((i-1)*chunk_rows+1 <= length(baseline_rows)){
        first_row <- (i-1)*chunk_rows+1
        last_row <- min(i*chunk_rows, length(baseline_rows))
        #drop=FALSE keeps matrix shape even when a single row or column survives,
        #so no transpose workaround is needed for sparse single-row results
        temp_matrix <- as.matrix(training_set$cause[first_row:last_row, , drop=FALSE][baseline_rows[first_row:last_row], , drop=FALSE])
        if(nrow(temp_matrix)>0){ #skip chunks that contain no baseline rows
            max_in_baseline_cause_cols <- apply(rbind(temp_matrix, max_in_baseline_cause_cols), 2, max)
        }
        i <- i + 1
    }

    if(sum(baseline_rows)==length(baseline_rows)) stop('NO rows left, rethink this') #every row sits at the base level
    if(config$cause$remove_base_level_rows){
        training_set$cause <- training_set$cause[!baseline_rows, , drop=FALSE] #remove rows whose effect never rises above baseline
        training_set$effect <- training_set$effect[!baseline_rows]
    }

    #a column is kept when its maximum within the baseline rows stays below the threshold; stop if none qualify
    if(sum(max_in_baseline_cause_cols < config$cause$remove_col_threshold) == 0) stop('NO cols left, rethink this')
    training_set$cause <- training_set$cause[, max_in_baseline_cause_cols < config$cause$remove_col_threshold, drop=FALSE]
    training_set$ineffective_columns <- training_set$col_names[max_in_baseline_cause_cols >= config$cause$remove_col_threshold]
    message('set: ', training_set$number, ', kept ', nrow(training_set$cause), ' rows of ', training_set$nrows,
            ' and ', ncol(training_set$cause), ' cols of ', training_set$ncols)
    message('set reduced to ', nrow(training_set$cause)*ncol(training_set$cause), ' items (',
            as.integer(100*nrow(training_set$cause)*ncol(training_set$cause)/(training_set$nrows*training_set$ncols)),
            '% of original size).')
    training_set$nrows <- nrow(training_set$cause)
    training_set$ncols <- ncol(training_set$cause)
    training_set$base_level <- base_level
    training_set$base_rows_was_removed <- TRUE
    training_set
}

source_data_downsample <- function(config, training_set){
    keep_nrows <- config$rf$maxitems %/% training_set$ncols
    if(keep_nrows >= length(training_set$effect)) return(training_set) #already within the item budget, nothing to drop
    if(config$sampling$half_max){
        #keep the rows with the largest effects, then fill the other half with a random sample of the rest
        effect_in_order <- sort(training_set$effect, decreasing=TRUE, index.return=TRUE)
        effect_largest_indices <- effect_in_order$ix[1:floor(keep_nrows/2)]
        effect_sample_indices <- sample(effect_in_order$ix[(floor(keep_nrows/2)+1):length(training_set$effect)], size=floor(keep_nrows/2), replace=FALSE)
        effect_new_indices <- c(effect_largest_indices, effect_sample_indices)
    }else{
        effect_new_indices <- sample(seq_along(training_set$effect), size=keep_nrows, replace=FALSE)
    }
    training_set$effect <- training_set$effect[effect_new_indices]
    training_set$cause <- training_set$cause[effect_new_indices, , drop=FALSE]
    items_old <- training_set$ncols * training_set$nrows
    items_new <- training_set$ncols * nrow(training_set$cause)
    percent <- round(100 * items_new / items_old)
    message('downsampled set ', training_set$number, ' from ', items_old, ' items to ', items_new, ' items (', percent, '%)')
    training_set$nrows <- nrow(training_set$cause)
    training_set
}
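# Minimal usage sketch for the two steps above (hypothetical `config` values,
# inferred from the fields read in this file; toy data stands in for a real set):
#
#   library(Matrix)
#   config <- list(
#     cause    = list(remove_base_level_rows = TRUE, remove_col_threshold = 4),
#     rf       = list(maxitems = 1e5),
#     sampling = list(half_max = TRUE)
#   )
#   set <- list(number = 1L,
#               cause  = rsparsematrix(20000, 50, density = 0.05),
#               effect = rpois(20000, lambda = 0.2))
#   set$nrows <- nrow(set$cause); set$ncols <- ncol(set$cause)
#   set$col_names <- paste0('c', seq_len(set$ncols))
#   set <- source_data_remove_ineffective(config, set)
#   set <- source_data_downsample(config, set)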
data_shrink_for_dev <- function(training_set){
    training_set$cause <- training_set$cause[1:100, , drop=FALSE] #shrink training set for dev speed
    training_set$effect <- training_set$effect[1:100]
    training_set$nrows <- 100
    training_set
}
#after freeing memory above, garbage collect (gc())!
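# Example dev-mode flow (hypothetical: a `dev_mode` flag is not part of this
# file's config; `set` is a prepared training set as in the sketch above):
#
#   if (isTRUE(config$dev_mode)) {
#     set <- data_shrink_for_dev(set)
#     gc()   # reclaim the memory released by the shrink
#   }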