/R/ff_column_totals.R

https://github.com/ewenharrison/finalfit · R · 203 lines · 114 code · 19 blank · 70 comment · 21 complexity · 6dd7c85ebb10f029372b5174c0d6c28a MD5 · raw file

  1. #' Add column totals to \code{summary_factorlist()} output
  2. #'
  3. #' @param df.in \code{summary_factorlist()} output.
  4. #' @param .data Data frame used to create \code{summary_factorlist()}.
  5. #' @param dependent Character. Name of dependent variable.
  6. #' @param na_include_dependent Logical. When TRUE, missing data in the dependent
  7. #' variable is included in totals.
  8. #' @param percent Logical. Include percentage.
  9. #' @param digits Integer length 1. Number of digits for percentage.
  10. #' @param label Character. Label for total row.
  11. #' @param prefix Character. Prefix for column totals, e.g "N=".
  12. #'
  13. #' @return Data frame.
  14. #' @export
  15. #'
  16. #' @examples
  17. #' explanatory = c("age.factor", "sex.factor", "obstruct.factor", "perfor.factor")
  18. #' dependent = 'mort_5yr'
  19. #' colon_s %>%
  20. #' summary_factorlist(dependent, explanatory) %>%
  21. #' ff_column_totals(colon_s, dependent)
  22. #'
  23. #' # Ensure works with missing data in dependent
  24. #' colon_s = colon_s %>%
  25. #' dplyr::mutate(
  26. #' mort_5yr = forcats::fct_explicit_na(mort_5yr)
  27. #' )
  28. #' colon_s %>%
  29. #' summary_factorlist(dependent, explanatory) %>%
  30. #' ff_column_totals(colon_s, dependent)
  31. ff_column_totals <- function(df.in, .data, dependent, na_include_dependent = FALSE,
  32. percent = TRUE, digits = 1, label = NULL, prefix = ""){
  33. if(!any(names(df.in) == "label")) stop("finalfit function must include: add_dependent_label = FALSE")
  34. if(na_include_dependent){
  35. .data = .data %>%
  36. dplyr::mutate_if(names(.) %in% unlist(dependent) &
  37. sapply(., is.factor),
  38. forcats::fct_explicit_na
  39. )
  40. } else {
  41. .data = .data %>%
  42. tidyr::drop_na(dependent)
  43. }
  44. # Create column totals
  45. totals = .data %>%
  46. dplyr::group_by(!! dplyr::sym(dependent)) %>%
  47. dplyr::count() %>%
  48. dplyr::group_by() %>%
  49. dplyr::mutate(
  50. grand_total = sum(n, na.rm = TRUE),
  51. percent = 100 * n / grand_total
  52. )
  53. grand_total = totals %>% dplyr::pull(grand_total) %>% unique()
  54. if(percent){
  55. totals = totals %>%
  56. dplyr::mutate(
  57. n = paste0(prefix, format_n_percent(n, percent, digits))
  58. )
  59. } else {
  60. totals = totals %>%
  61. dplyr::mutate(
  62. n = paste0(prefix, n)
  63. )
  64. }
  65. if(is.null(label) & percent) label = "Total N (%)"
  66. if(is.null(label) & !percent) label = "Total N"
  67. # Pivot and add
  68. totals = totals %>%
  69. dplyr::select(-c(grand_total, percent)) %>%
  70. tidyr::pivot_wider(names_from = dependent, values_from = n) %>%
  71. as.data.frame() %>%
  72. dplyr::mutate(label = label,
  73. levels= "") %>%
  74. dplyr::select(label, levels, dplyr::everything())
  75. df.out = dplyr::bind_rows(totals, df.in)
  76. df.out[1, is.na(df.out[1, ])] = "" # For neatness change NA to "" in top row
  77. # Make total
  78. if(any(names(df.out) == "Total")){
  79. df.out[1, "Total"] = paste0(prefix, grand_total)
  80. }
  81. if(any(names(df.out) == "All")){
  82. df.out[1, "All"] = paste0(prefix, grand_total)
  83. }
  84. return(df.out)
  85. }
  86. #' @rdname ff_column_totals
  87. #' @export
  88. finalfit_column_totals = ff_column_totals
  89. #' Add row totals to \code{summary_factorlist()} output
  90. #'
  91. #' This adds a total and missing count to variables. This is useful for
  92. #' continuous variables. Compare this to \code{summary_factorlist(total_col =
  93. #' TRUE)} which includes a count for each dummy variable as a factor and mean
  94. #' (sd) or median (iqr) for continuous variables.
  95. #'
  96. #' @param df.in \code{summary_factorlist()} output.
  97. #' @param .data Data frame used to create \code{summary_factorlist()}.
  98. #' @param dependent Character. Name of dependent variable.
  99. #' @param explanatory Character vector of any length: name(s) of explanatory
  100. #' variables.
  101. #' @param missing_column Logical. Include a column of counts of missing data.
  102. #' @param digits Integer length 1. Number of digits for percentage.
  103. #' @param percent Logical. Include percentage.
  104. #' @param na_complete_cases Logical. When TRUE, missing data counts for variables
  105. #' are for compelte cases across all included variables.
  106. #' @param na_include_dependent Logical. When TRUE, missing data in the dependent
  107. #' variable is included in totals.
  108. #' @param total_name Character. Name of total column.
  109. #' @param na_name Character. Name of missing column.
  110. #'
  111. #' @return Data frame.
  112. #' @export
  113. #'
  114. #' @examples
  115. #' explanatory = c("age.factor", "sex.factor", "obstruct.factor", "perfor.factor")
  116. #' dependent = 'mort_5yr'
  117. #' colon_s %>%
  118. #' summary_factorlist(dependent, explanatory) %>%
  119. #' ff_row_totals(colon_s, dependent, explanatory)
  120. ff_row_totals <- function(df.in, .data, dependent, explanatory, missing_column = TRUE,
  121. percent = TRUE, digits = 1,
  122. na_include_dependent = FALSE, na_complete_cases = FALSE,
  123. total_name = "Total N", na_name= "Missing N"){
  124. if(!any(names(df.in) == "label"))
  125. stop("summary_factorlist function must include: add_dependent_label = FALSE")
  126. # Extract labels
  127. var_labels = .data %>%
  128. extract_variable_label()
  129. if(na_include_dependent){
  130. .data = .data %>%
  131. dplyr::mutate_if(names(.) %in% unlist(dependent) &
  132. sapply(., is.factor),
  133. forcats::fct_explicit_na
  134. )
  135. } else {
  136. .data = .data %>%
  137. tidyr::drop_na(dependent)
  138. }
  139. which_anyNA <- function(.data){
  140. .data %>%
  141. tibble::rowid_to_column() %>%
  142. dplyr::filter_all(dplyr::any_vars(is.na(.))) %>%
  143. dplyr::pull(rowid)
  144. }
  145. if(na_complete_cases){
  146. .data[which_anyNA(.data), ] = NA
  147. }
  148. # Relabel
  149. .data = .data %>%
  150. ff_relabel(var_labels)
  151. df.out = df.in %>%
  152. dplyr::left_join(
  153. missing_glimpse(.data, explanatory, digits = digits) %>%
  154. dplyr::mutate(label = as.character(label)), by = "label"
  155. ) %>%
  156. { if(!percent){
  157. dplyr::mutate(., # Rename, change to character, remove "NAs"
  158. !! total_name := as.character(n) %>%
  159. dplyr::coalesce("")
  160. )
  161. } else {
  162. dplyr::mutate(., # Rename, change to character, remove "NAs"
  163. !! total_name := stringr::str_c(n, " (", (100 - as.numeric(missing_percent)) %>%
  164. round_tidy(digits), ")") %>%
  165. dplyr::coalesce("")
  166. )
  167. }}
  168. if(missing_column){
  169. df.out = df.out %>%
  170. dplyr::mutate(
  171. !! na_name := as.character(missing_n) %>% dplyr::coalesce("")
  172. ) %>% # Reorder columns, remove unwanted columns
  173. dplyr::select(label, !! total_name, !! na_name, dplyr::everything(),
  174. -c(n, missing_n, var_type, missing_percent))
  175. } else {
  176. df.out = df.out %>%
  177. dplyr::select(label, !! total_name, dplyr::everything(),
  178. -c(n, missing_n, var_type, missing_percent))
  179. }
  180. return(df.out)
  181. }
  182. #' @rdname ff_row_totals
  183. #' @export
  184. finalfit_row_totals = ff_row_totals