/scripts/lib/dataPreparation.R

https://bitbucket.org/jontis/merck

source_data_remove_ineffective <- function(config, training_set, chunk_rows=5000){
  base_level <- min(training_set$effect)
  baseline_rows <- as.vector(training_set$effect == base_level)
  if(sum(baseline_rows) < 100){
    training_set$base_rows_was_removed <- FALSE
    return(training_set)
  }
  message('removing ineffective cause columns and useless rows in set: ', training_set$number)
  # operate in chunks to avoid swapping
  max_in_baseline_cause_cols <- NULL
  i <- 1
  while((i-1)*chunk_rows+1 <= length(baseline_rows)){
    first_row <- (i-1)*chunk_rows+1
    last_row <- min(i*chunk_rows, length(baseline_rows))
    temp_matrix <- as.matrix(training_set$cause[first_row:last_row,][baseline_rows[first_row:last_row],])
    # dirty fix: subsetting the sparse matrix down to a single row returns it as a column vector, so transpose it back
    if(ncol(temp_matrix)==1) temp_matrix <- t(temp_matrix)
    # running column-wise maximum over the baseline rows seen so far
    max_in_baseline_cause_cols <- apply(rbind(temp_matrix, max_in_baseline_cause_cols), 2, max)
    i <- i + 1
  }
  if(sum(baseline_rows)==length(baseline_rows)) stop('NO rows left, rethink this') # stop if no rows would survive
  if(config$cause$remove_base_level_rows){
    training_set$cause <- training_set$cause[!baseline_rows,] # remove rows without effect above baseline
    training_set$effect <- training_set$effect[!baseline_rows]
  }
  if(sum(max_in_baseline_cause_cols < config$cause$remove_col_threshold) == 0) stop('NO cols left, rethink this') # stop if no column qualifies
  training_set$cause <- training_set$cause[, max_in_baseline_cause_cols < config$cause$remove_col_threshold] # keep only qualifying columns
  training_set$ineffective_columns <- training_set$col_names[max_in_baseline_cause_cols >= config$cause$remove_col_threshold]
  message('set: ', training_set$number, ', kept ', nrow(training_set$cause), ' rows of ', training_set$nrows, ' and ', ncol(training_set$cause), ' cols of ', training_set$ncols)
  message('set reduced to ', nrow(training_set$cause)*ncol(training_set$cause), ' items (', as.integer(100*nrow(training_set$cause)*ncol(training_set$cause)/(training_set$nrows*training_set$ncols)), '% of original size).')
  training_set$nrows <- nrow(training_set$cause)
  training_set$ncols <- ncol(training_set$cause)
  training_set$base_level <- base_level
  training_set$base_rows_was_removed <- TRUE
  training_set
}
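
# --- Usage sketch (illustrative, not part of the original file) ---
# A minimal, hedged example of the list shape source_data_remove_ineffective()
# expects, inferred from the fields accessed above (cause, effect, number,
# nrows, ncols, col_names) and the config keys it reads. The demo values and
# the use of the Matrix package are assumptions; the block is wrapped in
# if(FALSE) so sourcing this file stays side-effect free.
if(FALSE){
  library(Matrix)
  set.seed(1)
  demo_set <- list(
    number    = 1,
    cause     = abs(rsparsematrix(300, 10, density=0.1)), # assumed sparse, per the transpose workaround above
    effect    = c(rep(0, 200), runif(100, 1, 5)),         # 200 baseline rows at effect == 0
    nrows     = 300,
    ncols     = 10,
    col_names = paste0('c', 1:10)
  )
  demo_config <- list(cause=list(remove_base_level_rows=TRUE, remove_col_threshold=3))
  demo_set <- source_data_remove_ineffective(demo_config, demo_set, chunk_rows=100)
}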
source_data_downsample <- function(config, training_set){
  # cap the total matrix size at roughly config$rf$maxitems cells
  keep_nrows <- config$rf$maxitems %/% training_set$ncols
  if(config$sampling$half_max){
    # keep the rows with the largest effect for one half, sample the rest for the other half
    effect_in_order <- sort(training_set$effect, decreasing=TRUE, index.return=TRUE)
    effect_largest_indices <- effect_in_order$ix[1:floor(keep_nrows/2)]
    effect_sample_indices <- sample(effect_in_order$ix[(floor(keep_nrows/2)+1):length(training_set$effect)], size=floor(keep_nrows/2), replace=FALSE)
    effect_new_indices <- c(effect_largest_indices, effect_sample_indices)
  }else{
    effect_new_indices <- sample(1:length(training_set$effect), size=keep_nrows, replace=FALSE)
  }
  training_set$effect <- training_set$effect[effect_new_indices]
  training_set$cause <- training_set$cause[effect_new_indices,]
  items_old <- training_set$ncols * training_set$nrows
  items_new <- training_set$ncols * nrow(training_set$cause)
  percent <- round(100 * items_new / items_old)
  message('downsampled set ', training_set$number, ' from ', items_old, ' items to ', items_new, ' items (', percent, '%)')
  training_set$nrows <- nrow(training_set$cause)
  training_set
}
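
# --- Usage sketch (illustrative, not part of the original file) ---
# keep_nrows caps the downsampled matrix at about config$rf$maxitems cells.
# With sampling$half_max set, the top half by effect is kept deterministically
# and the other half is sampled from the remainder; this assumes keep_nrows is
# smaller than the current number of rows. The config values are invented and
# the sketch continues the demo_set from the example above.
if(FALSE){
  demo_config$rf <- list(maxitems=500)        # 500 %/% 10 cols -> keep 50 rows
  demo_config$sampling <- list(half_max=TRUE) # 25 largest effects + 25 sampled
  demo_set <- source_data_downsample(demo_config, demo_set)
}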
data_shrink_for_dev <- function(training_set){
  # shrink the training set to its first 100 rows for dev speed
  training_set$cause <- training_set$cause[1:100,]
  training_set$effect <- training_set$effect[1:100]
  training_set$nrows <- 100
  training_set
}
# memory has been freed, garbage collect!
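
# --- Pipeline sketch (assumed wiring, not part of the original file) ---
# One way the three helpers above might be chained per training set, with an
# explicit gc() once the pruned intermediates are released, as the comment
# above suggests. The wrapper name and dev_mode flag are hypothetical.
if(FALSE){
  prepare_training_set <- function(config, training_set, dev_mode=FALSE){
    training_set <- source_data_remove_ineffective(config, training_set)
    training_set <- source_data_downsample(config, training_set)
    if(dev_mode) training_set <- data_shrink_for_dev(training_set)
    gc() # reclaim memory freed by the row/column pruning above
    training_set
  }
}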