/R/long_to_wide_converter.R

https://github.com/IndrajeetPatil/statsExpressions · R · 106 lines · 24 code · 7 blank · 75 comment · 6 complexity · f7782b526bdce2d545f943e97baecebd MD5 · raw file

  1. #' @title Converts dataframe from long/tidy to wide format with `NA`s removed
  2. #' @name long_to_wide_converter
  3. #'
  4. #' @description
  5. #'
  6. #' This conversion is helpful mostly for repeated measures design, where
  7. #' removing `NA`s by participant can be a bit tedious.
  8. #'
  9. #' It does not make sense to spread the dataframe to wide format when the
  10. #' measure is not repeated, so if `paired = TRUE`, `spread` argument will be
  11. #' ignored.
  12. #'
  13. #' @param data A dataframe (or a tibble) from which variables specified are to
  14. #' be taken. Other data types (e.g., matrix,table, array, etc.) will **not**
  15. #' be accepted.
  16. #' @param x The grouping (or independent) variable from the dataframe `data`. In
  17. #' case of a repeated measures or within-subjects design, if `subject.id`
  18. #' argument is not available or not explicitly specified, the function assumes
  19. #' that the data has already been sorted by such an id by the user and creates
  20. #' an internal identifier. So if your data is **not** sorted, the results
  21. #' *can* be inaccurate when there are more than two levels in `x` and there
  22. #' are `NA`s present. The data is expected to be sorted by user in
  23. #' subject-1,subject-2, ..., pattern.
  24. #' @param y The response (or outcome or dependent) variable from the
  25. #' dataframe `data`.
  26. #' @param subject.id Relevant in case of a repeated measures or within-subjects
  27. #' design (`paired = TRUE`, i.e.), it specifies the subject or repeated
  28. #' measures identifier. **Important**: Note that if this argument is `NULL`
  29. #' (which is the default), the function assumes that the data has already been
  30. #' sorted by such an id by the user and creates an internal identifier. So if
  31. #' your data is **not** sorted and you leave this argument unspecified, the
  32. #' results *can* be inaccurate when there are more than two levels in `x` and
  33. #' there are `NA`s present.
  34. #' @param paired Logical that decides whether the experimental design is
  35. #' repeated measures/within-subjects or between-subjects. The default is
  36. #' `FALSE`.
  37. #' @param spread Logical that decides whether the dataframe needs to be
  38. #' converted from long/tidy to wide (default: `TRUE`). Relevant only if
  39. #' `paired = TRUE`.
  40. #' @param ... Currently ignored.
  41. #'
  42. #' @return A dataframe with `NA`s removed while respecting the
  43. #' between-or-within-subjects nature of the dataset.
  44. #'
  45. #' @examples
  46. #' # for reproducibility
  47. #' library(statsExpressions)
  48. #' set.seed(123)
  49. #'
  50. #' # repeated measures design
  51. #' long_to_wide_converter(
  52. #' data = bugs_long,
  53. #' x = condition,
  54. #' y = desire,
  55. #' subject.id = subject,
  56. #' paired = TRUE
  57. #' )
  58. #'
  59. #' # independent measures design
  60. #' long_to_wide_converter(
  61. #' data = ggplot2::msleep,
  62. #' x = vore,
  63. #' y = brainwt,
  64. #' paired = FALSE
  65. #' )
  66. #' @export
  67. # function body
  68. long_to_wide_converter <- function(data,
  69. x,
  70. y,
  71. subject.id = NULL,
  72. paired = TRUE,
  73. spread = TRUE,
  74. ...) {
  75. # for non-paired data, even if specified, ignore it
  76. if (!paired) subject.id <- NULL
  77. # initial cleanup
  78. data %<>%
  79. select({{ x }}, {{ y }}, rowid = {{ subject.id }}) %>%
  80. mutate({{ x }} := droplevels(as.factor({{ x }}))) %>%
  81. arrange({{ x }})
  82. # if `subject.id` wasn't provided, create one for internal usage
  83. if (!"rowid" %in% names(data)) {
  84. # the row number needs to be assigned for each participant in paired data
  85. if (paired) data %<>% group_by({{ x }})
  86. # unique id for each participant
  87. data %<>% mutate(rowid = row_number())
  88. }
  89. # NA removal
  90. data %<>%
  91. ungroup(.) %>%
  92. nest_by(rowid, .key = "df") %>%
  93. filter(sum(is.na(df)) == 0) %>%
  94. tidyr::unnest(cols = c(df))
  95. # convert to wide?
  96. if (spread && paired) data %<>% tidyr::pivot_wider(names_from = {{ x }}, values_from = {{ y }})
  97. # final clean-up
  98. as_tibble(relocate(data, rowid) %>% arrange(rowid))
  99. }