/R/long_to_wide_converter.R
https://github.com/IndrajeetPatil/statsExpressions · R · 106 lines · 24 code · 7 blank · 75 comment · 6 complexity · f7782b526bdce2d545f943e97baecebd MD5 · raw file
#' @title Converts dataframe from long/tidy to wide format with `NA`s removed
#' @name long_to_wide_converter
#'
#' @description
#'
#' This conversion is helpful mostly for repeated measures design, where
#' removing `NA`s by participant can be a bit tedious.
#'
#' It does not make sense to spread the dataframe to wide format when the
#' measure is not repeated, so if `paired = FALSE`, the `spread` argument will
#' be ignored.
#'
#' @param data A dataframe (or a tibble) from which variables specified are to
#'   be taken. Other data types (e.g., matrix, table, array, etc.) will **not**
#'   be accepted. Any grouping already present on `data` is dropped before
#'   processing.
#' @param x The grouping (or independent) variable from the dataframe `data`.
#'   It is converted to a factor and its unused levels are dropped. In case of
#'   a repeated measures or within-subjects design, if `subject.id` argument is
#'   not available or not explicitly specified, the function assumes that the
#'   data has already been sorted by such an id by the user and creates an
#'   internal identifier. So if your data is **not** sorted, the results *can*
#'   be inaccurate when there are more than two levels in `x` and there are
#'   `NA`s present. The data is expected to be sorted by user in
#'   subject-1, subject-2, ..., pattern.
#' @param y The response (or outcome or dependent) variable from the
#'   dataframe `data`.
#' @param subject.id Relevant in case of a repeated measures or within-subjects
#'   design (i.e., `paired = TRUE`), it specifies the subject or repeated
#'   measures identifier. It is ignored when `paired = FALSE`. **Important**:
#'   Note that if this argument is `NULL` (which is the default), the function
#'   assumes that the data has already been sorted by such an id by the user
#'   and creates an internal identifier. So if your data is **not** sorted and
#'   you leave this argument unspecified, the results *can* be inaccurate when
#'   there are more than two levels in `x` and there are `NA`s present.
#' @param paired Logical that decides whether the experimental design is
#'   repeated measures/within-subjects or between-subjects. The default is
#'   `TRUE`.
#' @param spread Logical that decides whether the dataframe needs to be
#'   converted from long/tidy to wide (default: `TRUE`). Relevant only if
#'   `paired = TRUE`.
#' @param ... Currently ignored.
#'
#' @return A tibble with `NA`s removed while respecting the
#'   between-or-within-subjects nature of the dataset. The first column is
#'   `rowid` (the supplied `subject.id`, or an internally created identifier),
#'   and rows are sorted by it. When both `paired` and `spread` are `TRUE`, the
#'   result is in wide format, with one column per level of `x` holding the
#'   values of `y`; otherwise it stays in long format.
#'
#' @examples
#' # for reproducibility
#' library(statsExpressions)
#' set.seed(123)
#'
#' # repeated measures design
#' long_to_wide_converter(
#'   data = bugs_long,
#'   x = condition,
#'   y = desire,
#'   subject.id = subject,
#'   paired = TRUE
#' )
#'
#' # independent measures design
#' long_to_wide_converter(
#'   data = ggplot2::msleep,
#'   x = vore,
#'   y = brainwt,
#'   paired = FALSE
#' )
#' @export

# function body
long_to_wide_converter <- function(data,
                                   x,
                                   y,
                                   subject.id = NULL,
                                   paired = TRUE,
                                   spread = TRUE,
                                   ...) {
  # for non-paired data, even if specified, ignore it
  if (!paired) {
    subject.id <- NULL
  }

  # initial cleanup
  #
  # Grouping on the incoming data is dropped first: `select()` would otherwise
  # carry the grouping columns along, and the internal identifier created
  # below would be numbered within those groups instead of across all rows.
  # When `subject.id` is `NULL`, `rowid = NULL` selects nothing, so no `rowid`
  # column exists after this step.
  data %<>%
    ungroup() %>%
    select({{ x }}, {{ y }}, rowid = {{ subject.id }}) %>%
    mutate({{ x }} := droplevels(as.factor({{ x }}))) %>%
    arrange({{ x }})

  # if `subject.id` wasn't provided, create one for internal usage
  if (!"rowid" %in% names(data)) {
    # the row number needs to be assigned for each participant in paired data,
    # i.e. the n-th row within every level of `x` is given the same identifier
    if (paired) {
      data %<>% group_by({{ x }})
    }

    # unique id for each participant
    data %<>% mutate(rowid = row_number())
  }

  # NA removal: every row sharing a `rowid` is dropped as soon as any of them
  # contains a missing value
  data %<>%
    ungroup() %>%
    nest_by(rowid, .key = "df") %>%
    filter(sum(is.na(df)) == 0) %>%
    tidyr::unnest(cols = c(df))

  # convert to wide?
  if (spread && paired) {
    data %<>% tidyr::pivot_wider(
      names_from = {{ x }},
      values_from = {{ y }}
    )
  }

  # final clean-up: identifier first, rows ordered by it, plain tibble out
  data %>%
    relocate(rowid) %>%
    arrange(rowid) %>%
    as_tibble()
}