/R/mut_type_occurrences.R

https://github.com/UMCUGenetics/MutationalPatterns · R · 86 lines · 43 code · 8 blank · 35 comment · 6 complexity · dfc2cc3275d98a88e7bf66c41874498d MD5 · raw file

  1. #' Count the occurrences of each base substitution type
  2. #'
  3. #' @param vcf_list GRangesList or GRanges object.
  4. #' @param ref_genome BSgenome reference genome object
  5. #' @return data.frame with counts of each base substitution type for
  6. #' each sample in vcf_list
  7. #'
  8. #'
  9. #' @examples
  10. #' ## See the 'read_vcfs_as_granges()' example for how we obtained the
  11. #' ## following data:
  12. #' vcfs <- readRDS(system.file("states/read_vcfs_as_granges_output.rds",
  13. #' package = "MutationalPatterns"
  14. #' ))
  15. #'
  16. #' ## Load a reference genome.
  17. #' ref_genome <- "BSgenome.Hsapiens.UCSC.hg19"
  18. #' library(ref_genome, character.only = TRUE)
  19. #'
  20. #' ## Get the type occurrences for all VCF objects.
  21. #' type_occurrences <- mut_type_occurrences(vcfs, ref_genome)
  22. #' @seealso
  23. #' \code{\link{read_vcfs_as_granges}},
  24. #'
  25. #' @importFrom magrittr %>%
  26. #'
  27. #' @export
  28. mut_type_occurrences <- function(vcf_list, ref_genome) {
  29. # These variables use non standard evaluation.
  30. # To avoid R CMD check complaints we initialize them to NULL.
  31. count <- `C>A` <- `C>G` <- `C>T` <- `T>A` <- NULL
  32. `T>C` <- `T>G` <- `C>T at CpG` <- `C>T other` <- NULL
  33. # Convert list to grl if necessary
  34. if (inherits(vcf_list, "list")) {
  35. vcf_list <- GenomicRanges::GRangesList(vcf_list)
  36. }
  37. # Determine nr mutations per sample
  38. if (inherits(vcf_list, "CompressedGRangesList")) {
  39. gr_sizes <- S4Vectors::elementNROWS(vcf_list)
  40. gr <- BiocGenerics::unlist(vcf_list)
  41. } else if (inherits(vcf_list, "GRanges")) {
  42. gr <- vcf_list
  43. gr_sizes <- length(gr)
  44. names(gr_sizes) <- "My_sample"
  45. } else {
  46. .not_gr_or_grl(vcf_list)
  47. }
  48. # Determine type and context of all mutations
  49. type_context <- type_context(gr, ref_genome)
  50. types <- type_context$types
  51. # Split C>T in C>T other and C>T at CpG
  52. C_T_i <- types == "C>T"
  53. types[C_T_i] <- "C>T other"
  54. CpG_f <- !is.na(BiocGenerics::match(
  55. type_context$context[C_T_i],
  56. c("ACG", "CCG", "TCG", "GCG")
  57. ))
  58. types[C_T_i][CpG_f] <- "C>T at CpG"
  59. types <- factor(types, levels = c(
  60. "C>A", "C>G", "T>A", "T>C", "T>G",
  61. "C>T at CpG", "C>T other"
  62. ))
  63. # Create vector describing the sample of each variant
  64. sample_vector <- rep(names(gr_sizes), gr_sizes) %>%
  65. factor(levels = names(gr_sizes))
  66. # Count per sample then widen into dataframe.
  67. df <- tibble::tibble("types" = types, "sample" = sample_vector) %>%
  68. dplyr::group_by(types, sample, .drop = FALSE) %>%
  69. dplyr::summarise(count = dplyr::n(), .groups = "drop") %>%
  70. tidyr::pivot_wider(names_from = types, values_from = count) %>%
  71. dplyr::mutate(`C>T` = `C>T at CpG` + `C>T other`) %>%
  72. dplyr::select(
  73. sample, `C>A`, `C>G`, `C>T`, `T>A`,
  74. `T>C`, `T>G`, `C>T at CpG`, `C>T other`
  75. ) %>%
  76. tibble::column_to_rownames("sample")
  77. return(df)
  78. }