# nzv.R
#
# Source: https://github.com/rstudio-conf-2020/applied-ml · R · 113 lines · 77 code · 28 blank · 8 comment · 0 complexity · 13521a13d9dc16d7a556c3c0e1c4981f MD5 · raw file

  1. library(tidymodels)
  2. library(AmesHousing)
  3. ames <- make_ames() %>%
  4. dplyr::select(-matches("Qu"))
  5. set.seed(333)
  6. data_split <- initial_split(ames, strata = "Sale_Price")
  7. ames_train <- training(data_split)
  8. ames_test <- testing(data_split)
  9. lm_mod <- linear_reg() %>%
  10. set_engine("lm")
  11. perf_metrics <- metric_set(rmse, rsq, ccc)
  12. # -------
  13. mod_rec <- recipe(
  14. Sale_Price ~ Longitude + Latitude + Neighborhood,
  15. data = ames_train
  16. ) %>%
  17. step_log(Sale_Price, base = 10) %>%
  18. # Lump factor levels that occur in
  19. # <= 5% of data as "other"
  20. step_other(Neighborhood, threshold = 0.05) %>%
  21. # Create dummy variables for _any_ factor variables
  22. step_dummy(all_nominal())
  23. mod_rec_prepped <- prep(mod_rec, training = ames_train)
  24. juice(mod_rec_prepped)
  25. # -------
  26. mod_rec_dummy <- recipe(
  27. Sale_Price ~ Longitude + Latitude + Neighborhood,
  28. data = ames_train
  29. ) %>%
  30. step_log(Sale_Price, base = 10) %>%
  31. step_dummy(all_nominal())
  32. mod_rec_dummy_prepped <- prep(mod_rec_dummy, training = ames_train)
  33. train_dummy_data <- juice(mod_rec_dummy_prepped)
  34. train_dummy_data
  35. train_dummy_data %>%
  36. select(starts_with("Neighborhood_")) %>%
  37. tidyr::pivot_longer(everything()) %>%
  38. group_by(name, value) %>%
  39. count() %>%
  40. tidyr::pivot_wider(names_from = value, values_from = n) %>%
  41. rename(one = `1`, zero = `0`) %>%
  42. filter(one < 20 | is.na(one)) %>%
  43. mutate(zero / one)
  44. # -------
  45. mod_rec_zv <- recipe(
  46. Sale_Price ~ Longitude + Latitude + Neighborhood,
  47. data = ames_train
  48. ) %>%
  49. step_log(Sale_Price, base = 10) %>%
  50. step_dummy(all_nominal()) %>%
  51. step_zv(
  52. starts_with("Neighborhood_")
  53. )
  54. mod_rec_zv_prepped <- prep(mod_rec_zv, training = ames_train)
  55. mod_rec_zv_prepped
  56. juice(mod_rec_zv_prepped)
  57. # -------
  58. mod_rec_nzv <- recipe(
  59. Sale_Price ~ Longitude + Latitude + Neighborhood,
  60. data = ames_train
  61. ) %>%
  62. step_log(Sale_Price, base = 10) %>%
  63. step_dummy(all_nominal()) %>%
  64. step_nzv(
  65. starts_with("Neighborhood_"),
  66. freq_cut = 200/1
  67. )
  68. mod_rec_nzv_prepped <- prep(mod_rec_nzv, training = ames_train)
  69. mod_rec_nzv_prepped
  70. juice(mod_rec_nzv_prepped)
  71. # -------
  72. mod_rec_nzv2 <- recipe(
  73. Sale_Price ~ Longitude + Latitude + Neighborhood,
  74. data = ames_train
  75. ) %>%
  76. step_log(Sale_Price, base = 10) %>%
  77. step_dummy(all_nominal()) %>%
  78. step_nzv(
  79. starts_with("Neighborhood_"),
  80. freq_cut = 1e10/1,
  81. unique_cut = 2
  82. )
  83. mod_rec_nzv2_prepped <- prep(mod_rec_nzv2, training = ames_train)
  84. mod_rec_nzv2_prepped
  85. juice(mod_rec_nzv2_prepped)