
/scripts/lib/dataPreparation.R

https://bitbucket.org/jontis/merck
source_data_remove_ineffective <- function(config, training_set, chunk_rows=5000){
  base_level <- min(training_set$effect)
  baseline_rows <- as.vector(training_set$effect==base_level)

  if(sum(baseline_rows)<100){
    training_set$base_rows_was_removed <- FALSE
    return(training_set)
  }

  message('removing ineffective cause columns and useless rows in set: ', training_set$number)

  #operate in chunks to avoid swapping
  max_in_baseline_cause_cols <- NULL
  i <- 1
  while((i-1)*chunk_rows+1 <= length(baseline_rows)){                 #<= so the final row is not skipped when it starts a new chunk
    first_row <- (i-1)*chunk_rows+1
    last_row <- min(i*chunk_rows, length(baseline_rows))
    temp_matrix <- as.matrix(training_set$cause[first_row:last_row,][baseline_rows[first_row:last_row],])
    if(ncol(temp_matrix)==1) temp_matrix <- t(temp_matrix)            #dirty fix: subsetting the sparse matrix to a single row returns it transposed
    max_in_baseline_cause_cols <- apply(rbind(temp_matrix, max_in_baseline_cause_cols), 2, max)
    i <- i + 1
  }

  if(sum(baseline_rows)==length(baseline_rows)) stop('NO rows left, rethink this')  #stop if no rows survive
  if(config$cause$remove_base_level_rows){
    training_set$cause <- training_set$cause[!baseline_rows,]                 #remove rows whose effect is not above the baseline
    training_set$effect <- training_set$effect[!baseline_rows]
  }

  if(sum(max_in_baseline_cause_cols < config$cause$remove_col_threshold) == 0) stop('NO cols left, rethink this')  #stop if no columns survive
  training_set$cause <- training_set$cause[,max_in_baseline_cause_cols < config$cause$remove_col_threshold]  #keep only qualifying columns
  training_set$ineffective_columns <- training_set$col_names[max_in_baseline_cause_cols >= config$cause$remove_col_threshold]
  message('set: ', training_set$number, ', kept ', nrow(training_set$cause), ' rows of ', training_set$nrows, ' and ', ncol(training_set$cause), ' cols of ', training_set$ncols)
  message('set reduced to ', nrow(training_set$cause)*ncol(training_set$cause), ' items (', as.integer(100*nrow(training_set$cause)*ncol(training_set$cause)/(training_set$nrows*training_set$ncols)), '% of original size).')
  training_set$nrows <- nrow(training_set$cause)
  training_set$ncols <- ncol(training_set$cause)
  training_set$base_level <- base_level
  training_set$base_rows_was_removed <- TRUE
  training_set
}
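
#Illustrative usage sketch (not from the repo): the training_set fields and config
#entries below are assumed shapes, inferred from how the function reads them.
if(FALSE){
  library(Matrix)
  training_set <- list(
    number = 1,
    cause = abs(rsparsematrix(10000, 200, density=0.01)),  #sparse cause matrix, non-negative activities
    effect = c(rep(0, 6000), runif(4000)),                 #many rows sit at the base level of 0
    col_names = paste0('D', 1:200),
    nrows = 10000,
    ncols = 200
  )
  config <- list(cause = list(remove_base_level_rows=TRUE, remove_col_threshold=1))
  training_set <- source_data_remove_ineffective(config, training_set)
}
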
source_data_downsample <- function(config, training_set){
  keep_nrows <- config$rf$maxitems %/% training_set$ncols              #row budget so rows*cols stays under the item cap
  if(config$sampling$half_max){
    #keep the rows with the largest effect values and fill the other half with a uniform sample of the rest
    effect_in_order <- sort(training_set$effect, decreasing=TRUE, index.return=TRUE)
    effect_largest_indices <- effect_in_order$ix[1:floor(keep_nrows/2)]
    effect_sample_indices <- sample(effect_in_order$ix[(floor(keep_nrows/2)+1):length(training_set$effect)], size=floor(keep_nrows/2), replace=FALSE)
    effect_new_indices <- c(effect_largest_indices, effect_sample_indices)
  }else{
    effect_new_indices <- sample(seq_along(training_set$effect), size=keep_nrows, replace=FALSE)
  }
  training_set$effect <- training_set$effect[effect_new_indices]
  training_set$cause <- training_set$cause[effect_new_indices,]
  items_old <- training_set$ncols * training_set$nrows
  items_new <- training_set$ncols * nrow(training_set$cause)
  percent <- round(100 * items_new / items_old)
  message('downsampled set ', training_set$number, ' from ', items_old, ' items to ', items_new, ' items (', percent, '%)')
  training_set$nrows <- nrow(training_set$cause)
  training_set
}
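
#Illustrative usage sketch (assumed config shape): cap the total item count for
#random forest training; with half_max the largest effect values are always kept.
if(FALSE){
  config <- list(rf = list(maxitems = 2e5), sampling = list(half_max = TRUE))
  training_set <- source_data_downsample(config, training_set)  #keeps maxitems %/% ncols rows
}
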
data_shrink_for_dev <- function(training_set){
  training_set$cause <- training_set$cause[1:100,]            #shrink training set for dev speed
  training_set$effect <- training_set$effect[1:100]
  training_set$nrows <- 100
  training_set
}
                                                       #freed memory, garbage collect!
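
#Illustrative sketch tying into the note above: after shrinking a set during
#development, gc() (base R) reclaims the memory held by the dropped rows.
if(FALSE){
  training_set <- data_shrink_for_dev(training_set)
  gc()  #garbage collect the rows that were just dropped
}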